In [ ]:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

imports and global variables (see requirements.txt for dependencies)¶

In [24]:
# !pip install -r requirements.txt

# --- Standard Library Imports ---
import gzip
import io
import ipaddress
import json
import os
import subprocess
import statistics
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# --- Third-Party Library Imports ---
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import radix
import requests
from bs4 import BeautifulSoup
from sklearn.neighbors import BallTree
import duckdb
from geopy import distance

# constants
SPEED_IN_FIBRE = 299_792 / 1.467 # speed of light in km/s divided by the 1.467 refractive index of optical fibre https://www.thorlabs.com/newgrouppage9.cfm?objectgroup_id=949
ALPHA = 1
EARTH_RADIUS_KM = 6371 # radius of Earth in km

# parameters
LATENCY_THRESHOLD = 3 # RTT maximum between latency neighbor hop and PoP
TRACEROUTE_MAX_LATENCY = 12 # RTT maximum between VP and PoP
DISK_DISTANCE_KM = 100 # airport grouping radius

DISK_DISTANCE_RAD = DISK_DISTANCE_KM / EARTH_RADIUS_KM # convert distance to radians
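As a quick sanity check (illustrative only, not part of the pipeline itself), the speed-of-light-in-fibre constant bounds how far a packet can travel for a given RTT, which is what the latency thresholds above implicitly encode:

In [ ]:
# hedged sketch: translate the RTT thresholds above into maximum one-way fibre distances
for rtt_ms in (LATENCY_THRESHOLD, TRACEROUTE_MAX_LATENCY):
    one_way_km = (rtt_ms / 1_000) / 2 * SPEED_IN_FIBRE  # halve the RTT, convert ms to s, scale by propagation speed
    print(f"RTT {rtt_ms} ms -> at most ~{one_way_km:.0f} km one-way in fibre")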

Read in data and combine with external datasets¶

In [3]:
# for reproducibility we share a single combined parquet containing all traceroute results

traceroutes_df = pd.read_parquet('./data/combined.parquet', engine='pyarrow') # engine='fastparquet'

traceroutes_df.head()
Out[3]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl
0 aep3-ar.ark.caida.org 1.0.0.0 9 13 192.168.1.1 1 1.281 64
1 aep3-ar.ark.caida.org 1.0.0.0 9 13 192.168.0.1 2 1.606 63
2 aep3-ar.ark.caida.org 1.0.0.0 9 13 181.96.113.217 7 13.811 249
3 aep3-ar.ark.caida.org 1.0.0.0 9 13 181.13.127.254 8 9.127 248
4 aep3-ar.ark.caida.org 1.0.0.0 9 13 1.0.0.0 9 12.355 56
In [3]:
# # extract hop_name column for retrieving OI data
#
# unique_hop_addr = traceroutes_df['hop_addr'].dropna().unique()
# pd.DataFrame(unique_hop_addr).to_csv('./data/unique_hops.csv', index=False, header=False)
In [ ]:
# add OpenINTEL PTR records to traceroute data

hop_names = pd.read_csv('./data/ptr_lookup.csv')

traceroutes_df = pd.merge(traceroutes_df, hop_names, how='left', left_on='hop_addr', right_on='ip4_address')

# rename ptr_name to hop_name
traceroutes_df = traceroutes_df.rename(columns={'ptr_name': 'hop_name'})
In [9]:
# load in MAnycastR census data

census = pd.read_csv('./data/IPv4_full_19_04.csv.gz', compression='gzip') #  data from 2025 April 19
census = census[census['iGreedyICMPv4'] > 1] # filter on iGreedy confirmed anycast

census.head()
Out[9]:
prefix MAnycastICMPv4 MAnycastTCPv4 MAnycastUDPv4 iGreedyICMPv4 iGreedyTCPv4
1 1.0.0.0/24 27 27 26 71 28
136 1.1.1.0/24 28 28 27 71 28
229 1.10.10.0/24 1 0 1 3 0
2081 1.12.0.0/24 5 0 4 5 0
2082 1.12.12.0/24 3 1 0 3 1
In [6]:
# add prefix column

traceroutes_df['prefix'] = traceroutes_df['dst'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.0/24')
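The string split above assumes well-formed dotted-quad addresses, which holds for this dataset; an equivalent derivation (hypothetical helper, shown only for illustration) can let the ipaddress module do the masking:

In [ ]:
# illustrative alternative: derive the covering /24 with ipaddress instead of string splitting
def to_slash24(ip_str):
    return str(ipaddress.ip_network(f"{ip_str}/24", strict=False))

assert to_slash24('1.0.0.5') == '1.0.0.0/24'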
In [7]:
# filter traceroutes to those confirmed as anycast by iGreedy (we drop MAnycast2 candidate anycast targets, as we do not consider traceroute an anycast detection technique)

traceroutes_df = traceroutes_df[traceroutes_df['prefix'].isin(census['prefix'])]
In [8]:
# read in locations of Ark VPs
ark_locs = pd.read_csv('./data/arklocs.txt', header=None, names=['tx_hostname', 'vp_lat', 'vp_lon', 'tx_city', 'vp_asn'])
ark_locs['tx_hostname'] = ark_locs['tx_hostname'].str.split(".ark.caida.org").str[0]
ark_locs['vp_asn'] = ark_locs['vp_asn'].apply(lambda x: '-' if pd.isna(x) else str(int(float(x))))
ark_locs['tx_airport'] = ark_locs['tx_hostname'].str[:3]

ark_locs.head()
Out[8]:
tx_hostname vp_lat vp_lon tx_city vp_asn tx_airport
0 hlz2-nz -37.79 175.28 Hamilton 9500 hlz
1 hkg4-cn 22.36 114.12 Hong Kong 212238 hkg
2 lax4-us 33.92 -118.39 Los Angeles 212238 lax
3 bfi-us 47.61 -122.33 Seattle 209 bfi
4 drs-de 51.03 13.73 Dresden 680 drs
In [9]:
# how many distinct ASes?

ark_locs['vp_asn'].nunique()
Out[9]:
178
In [10]:
# how many distinct cities?

ark_locs['tx_city'].nunique()
Out[10]:
205
In [11]:
ark_locs['country_code'] = ark_locs['tx_hostname'].str.split("-").str[1]
ark_locs['country_code'].nunique()
Out[11]:
66
In [12]:
# drop country code

ark_locs.drop(columns='country_code', inplace=True)
In [13]:
# add ark locations
traceroutes_df['tx_hostname'] = traceroutes_df['tx_hostname'].str.split(".ark.caida.org").str[0]
traceroutes_df = pd.merge(traceroutes_df, ark_locs, on='tx_hostname', how='left')

traceroutes_df.head()
Out[13]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name prefix vp_lat vp_lon tx_city vp_asn tx_airport
0 aep3-ar 1.0.0.0 9 13 192.168.1.1 1 1.281 64 NaN NaN 1.0.0.0/24 -34.6 -58.38 Buenos Aires 7303 aep
1 aep3-ar 1.0.0.0 9 13 192.168.0.1 2 1.606 63 NaN NaN 1.0.0.0/24 -34.6 -58.38 Buenos Aires 7303 aep
2 aep3-ar 1.0.0.0 9 13 181.96.113.217 7 13.811 249 181.96.113.217 host217.181-96-113.telecom.net.ar. 1.0.0.0/24 -34.6 -58.38 Buenos Aires 7303 aep
3 aep3-ar 1.0.0.0 9 13 181.13.127.254 8 9.127 248 181.13.127.254 host254.181-13-127.telecom.net.ar. 1.0.0.0/24 -34.6 -58.38 Buenos Aires 7303 aep
4 aep3-ar 1.0.0.0 9 13 1.0.0.0 9 12.355 56 NaN NaN 1.0.0.0/24 -34.6 -58.38 Buenos Aires 7303 aep

exploratory data analysis¶

In [14]:
# how many VPs participated?

traceroutes_df['tx_hostname'].nunique()
Out[14]:
274
In [15]:
# how many ASes are these in?

traceroutes_df['vp_asn'].nunique()
Out[15]:
178
In [16]:
# how many targets did we measure?

traceroutes_df['dst'].nunique()
Out[16]:
13735
In [17]:
# how many tx_hostname, dst pairs (i.e., how many traceroutes?)

traceroutes_df[['tx_hostname', 'dst']].drop_duplicates().shape[0]
Out[17]:
3354073
In [18]:
# what is the average number of VPs that completed a traceroute towards a target

traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
Out[18]:
244.19898070622497
In [19]:
# what is the average number of hops captured?

traceroutes_df.groupby(['tx_hostname', 'dst']).size().mean()
Out[19]:
8.123025944873591
In [20]:
# how many unique hop addresses?

traceroutes_df['hop_addr'].nunique()
Out[20]:
99393
In [21]:
# how many unique hop addresses have PTR records?

traceroutes_df[~traceroutes_df['hop_name'].isna()]['hop_addr'].nunique()
Out[21]:
45106
In [22]:
# how many hops have PTR records?

(~traceroutes_df['hop_name'].isna()).sum()
Out[22]:
14336720
In [23]:
# how many hops do not have PTR records?

traceroutes_df['hop_name'].isna().sum()
Out[23]:
12908502
In [24]:
# how many unique PTR records did we observe?
traceroutes_df['hop_name'].nunique()
Out[24]:
41758

Calculate distances and RTTS¶

In [25]:
# get RTT to the PoP for each traceroute

dst_hops_df = traceroutes_df[traceroutes_df['dst'] == traceroutes_df['hop_addr']]

dst_hops_df = dst_hops_df[['tx_hostname', 'dst', 'hop_rtt']].rename(
    columns={'hop_rtt': 'pop_rtt'}
)
dst_hops_df = dst_hops_df.sort_values(by='pop_rtt').drop_duplicates(subset=['tx_hostname', 'dst']) # remove duplicate RTTs observed between an (Ark, PoP) pair -> keep the lowest RTT

traceroutes_df = pd.merge(traceroutes_df, dst_hops_df, on=['tx_hostname', 'dst'], how='left')
In [26]:
def rtt_to_distance(rtt_ms):
    one_way_time_s = (rtt_ms / 1_000) / 2  # convert ms to seconds and halve to get one-way latency
    distance_km = one_way_time_s * SPEED_IN_FIBRE
    return distance_km
In [27]:
# calculate distance between VP and hop
traceroutes_df['ark_to_hop'] = traceroutes_df['hop_rtt'].apply(rtt_to_distance)

# calculate distance between VP and PoP
traceroutes_df['ark_to_pop'] = traceroutes_df['pop_rtt'].apply(rtt_to_distance)

# calculate distance between hop and PoP
#traceroutes_df['hop_to_pop'] = np.maximum(0, traceroutes_df['ark_to_pop'] - traceroutes_df['ark_to_hop']) # this value is meaningless as the delta RTT between the two is unreliable

load in ipinfo data¶

In [28]:
# load in ipinfo data (weekly snapshots)

# our methodology requires an IP to location database with city-level data
# this example uses IPInfo's geolocation dataset
# they provide free access for academic, education, and non-commercial projects under certain terms (https://ipinfo.io/use-cases/ip-data-for-academic-research)

json_file = './data/2025-03-31_standard_location.json.gz' # snapshot used in this analysis

def ipv4_to_uint32(ip_str):
    """Converts an IPv4 string to an unsigned 32-bit integer."""
    if ip_str is None:
        return None
    try:
        # Convert to IPv4Address object first to validate
        ip_obj = ipaddress.ip_address(ip_str)
        if ip_obj.version == 4:
            return int(ip_obj) # Returns the integer representation
        else:
            return None # Not an IPv4 address
    except ValueError:
        # Handle cases where the string is not a valid IP address at all
        return None

# load data in duck DB
db_file = 'ip_locations_ipv4.duckdb'
table_name = 'locations_ipv4'

# if db_file exists, we are done
if os.path.exists(db_file):
    print(f"Database file {db_file} already exists. Skipping loading.")
else:
    print(f"Loading IPv4 ipinfo data from {json_file} into DuckDB table {table_name}")
    start_time = time.time()

    # create the database file
    con = duckdb.connect(database=db_file, read_only=False)

    # add ipv4 translation function to duckdb
    con.create_function('ipv4_to_uint32_udf', ipv4_to_uint32, [duckdb.typing.VARCHAR], duckdb.typing.UINTEGER)

    # create table schema
    con.execute(f"""
        DROP TABLE IF EXISTS {table_name};
        CREATE TABLE {table_name} (
            start_ip_str VARCHAR,
            end_ip_str VARCHAR,
            join_key VARCHAR,
            city VARCHAR,
            region VARCHAR,
            country VARCHAR,
            latitude DOUBLE,
            longitude DOUBLE,
            postal_code VARCHAR,
            timezone VARCHAR,
            start_ip_uint32 UINTEGER,
            end_ip_uint32 UINTEGER
        );
    """)

    # load in data (excluding ipv6 addresses)
    try:
        con.execute(f"""
            INSERT INTO {table_name} (
                start_ip_str, end_ip_str, join_key, city, region, country,
                latitude, longitude, postal_code, timezone,
                start_ip_uint32, end_ip_uint32
            )
            SELECT
                start_ip,
                end_ip,
                join_key,
                city,
                region,
                country,
                CAST(latitude AS DOUBLE),
                CAST(longitude AS DOUBLE),
                postal_code,
                timezone,
                ipv4_to_uint32_udf(start_ip) AS start_ip_u32,
                ipv4_to_uint32_udf(end_ip) AS end_ip_u32
            FROM read_json_auto('{json_file}', filename=true)
            WHERE
                -- filter out non-ipv4 addresses
                start_ip LIKE '%.%.%.%' AND start_ip NOT LIKE '%:%' AND
                end_ip LIKE '%.%.%.%'   AND end_ip   NOT LIKE '%:%'
        """)
    except Exception as e:
        print(f"An error occurred during INSERT: {e}")
        con.close()
        raise

    print("Creating index on IP ranges...")
    con.execute(f"CREATE INDEX ip_range_idx_u32 ON {table_name} (start_ip_uint32, end_ip_uint32);")
    con.close()

    end_time = time.time()
    print(f"IPInfo (IPV4) Database creation took: {end_time - start_time:.2f} seconds")
Database file ip_locations_ipv4.duckdb already exists. Skipping loading.
In [29]:
# get unique addresses
unique_addr = traceroutes_df['hop_addr'].dropna().unique()
print(f"Found {len(unique_addr)} unique hop addresses out of {len(traceroutes_df)} total hops.")

# get u32 values for lookup
addr_data = []
for hop in unique_addr:
    ip_uint32 = ipv4_to_uint32(hop)
    if ip_uint32 is not None: # Keep only valid IPv4 conversions
        addr_data.append({'hop_addr': hop, 'hop_addr_uint32': ip_uint32})

unique_addr_df = pd.DataFrame(addr_data)
unique_addr_df['hop_addr_uint32'] = unique_addr_df['hop_addr_uint32'].astype('uint32')

del unique_addr
del addr_data
Found 99393 unique hop addresses out of 27245222 total hops.
In [30]:
columns_to_load = [
    'city',
    'latitude',
    'longitude',
    'start_ip_uint32',
    'end_ip_uint32'
]
select_cols_str = ", ".join(columns_to_load)

dtypes_to_load = {
    'city': 'string',
    'latitude': 'float32',
    'longitude': 'float32',
    'start_ip_uint32': 'uint32',
    'end_ip_uint32': 'uint32'
}


print(f"Loading selected columns from {table_name} into Pandas...")
start_load_time = time.time()
con = None

try:
    con = duckdb.connect(database=db_file, read_only=True)
    print("  Connected to DuckDB.")

    # cast df
    select_expressions = []
    for col in columns_to_load:
        dtype = dtypes_to_load.get(col)
        if dtype:
            # data types
            sql_type = dtype.replace('string', 'VARCHAR') \
                            .replace('float32', 'FLOAT') \
                            .replace('float64', 'DOUBLE') \
                            .replace('uint32', 'UINTEGER') \
                            .replace('Int64', 'BIGINT')
            select_expressions.append(f"CAST({col} AS {sql_type}) AS {col}")
        else:
            select_expressions.append(col)

    select_query_cols_str = ",\n    ".join(select_expressions)

    query = f"""
    SELECT
        {select_query_cols_str}
    FROM {table_name}
    WHERE start_ip_uint32 IS NOT NULL -- exclude NaN values
    ORDER BY start_ip_uint32 ASC; -- sort by start_ip_uint32
    """

    ipinfo_df = con.execute(query).fetchdf()
    print(f"Successfully loaded {len(ipinfo_df)} rows.")

except Exception as e:
    print(f"An error occurred loading data from DuckDB: {e}")
finally:
    if con:
        con.close()
        print("  DuckDB connection closed.")

end_load_time = time.time()

print(f"\nLoading and initial sorting took: {end_load_time - start_load_time:.2f} seconds")
print("\nLoaded ip2location DataFrame Info:")
ipinfo_df.info(memory_usage='deep') # Check memory usage
Loading selected columns from locations_ipv4 into Pandas...
  Connected to DuckDB.
Successfully loaded 27274159 rows.
  DuckDB connection closed.

Loading and initial sorting took: 5.72 seconds

Loaded ip2location DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27274159 entries, 0 to 27274158
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   city             object 
 1   latitude         float32
 2   longitude        float32
 3   start_ip_uint32  uint32 
 4   end_ip_uint32    uint32 
dtypes: float32(2), object(1), uint32(2)
memory usage: 2.1 GB
In [31]:
# filter out bogons

BOGON_RANGES_CIDR = [
    "0.0.0.0/8",          # Current network (RFC 6890) - Often treated as bogon
    "10.0.0.0/8",         # Private Use (RFC 1918)
    "100.64.0.0/10",      # Shared Address Space (RFC 6598)
    "127.0.0.0/8",        # Loopback (RFC 6890)
    "169.254.0.0/16",     # Link-Local (RFC 3927)
    "172.16.0.0/12",      # Private Use (RFC 1918)
    "192.0.0.0/24",       # IETF Protocol Assignments (RFC 6890)
    "192.0.2.0/24",       # TEST-NET-1 (RFC 5737)
    "192.88.99.0/24",     # 6to4 Relay Anycast (Deprecated but often blocked, RFC 7526)
    "192.168.0.0/16",     # Private Use (RFC 1918)
    "198.18.0.0/15",      # Network Interconnect Device Benchmark Testing (RFC 2544)
    "198.51.100.0/24",    # TEST-NET-2 (RFC 5737)
    "203.0.113.0/24",     # TEST-NET-3 (RFC 5737)
    "224.0.0.0/4",        # Multicast (RFC 5771)
    "240.0.0.0/4",        # Reserved for Future Use (RFC 1112)
]

# get u32 ranges of bogons
BOGON_INT_RANGES = []
for cidr in BOGON_RANGES_CIDR:
    network = ipaddress.ip_network(cidr, strict=False)
    start_int = int(network.network_address)
    end_int = int(network.broadcast_address)
    BOGON_INT_RANGES.append((start_int, end_int))

print(f"Defined {len(BOGON_INT_RANGES)} bogon integer ranges.\n")


def filter_bogons_uint32(df: pd.DataFrame, ip_column_name: str) -> pd.DataFrame:
    """
    Filters a DataFrame to remove rows where the uint32 IP address falls
    within defined bogon ranges.
    """
    if ip_column_name not in df.columns:
        raise ValueError(f"Column '{ip_column_name}' not found in DataFrame.")

    if not pd.api.types.is_numeric_dtype(df[ip_column_name]):
         raise TypeError(f"Column '{ip_column_name}' must be a numeric type (ideally uint32).")

    print(f"Filtering bogons in column '{ip_column_name}'...")
    initial_rows = len(df)

    bogon_mask = pd.Series(False, index=df.index)
    ip_series = df[ip_column_name]

    # if the address is in any bogon range, return true
    for start_int, end_int in BOGON_INT_RANGES:
        in_range_mask = ip_series.between(start_int, end_int)
        bogon_mask |= in_range_mask

    num_bogons = bogon_mask.sum()
    num_valid = initial_rows - num_bogons

    print(f"Initial rows: {initial_rows}")
    print(f"Bogon addresses identified: {num_bogons}")
    print(f"Non-bogon addresses remaining: {num_valid}")

    # return non bogon addresses
    df_filtered = df[~bogon_mask]

    return df_filtered

unique_addr_df = filter_bogons_uint32(unique_addr_df, 'hop_addr_uint32')
Defined 15 bogon integer ranges.

Filtering bogons in column 'hop_addr_uint32'...
Initial rows: 99393
Bogon addresses identified: 6282
Non-bogon addresses remaining: 93111
In [32]:
# unique hop addresses df

unique_addr_df = unique_addr_df.sort_values('hop_addr_uint32').reset_index(drop=True)

# add location information
unique_addr_df = pd.merge_asof(
    unique_addr_df,
    ipinfo_df,
    left_on='hop_addr_uint32',
    right_on='start_ip_uint32',
    direction='backward'
)
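merge_asof with direction='backward' matches each hop address to the last range whose start_ip_uint32 does not exceed it; it does not itself check end_ip_uint32, so addresses falling into a gap between ranges would silently inherit the previous range's location. A hedged sanity check on that assumption:

In [ ]:
# sanity check (illustrative): does the matched range also cover the address on the end_ip side?
covered = unique_addr_df['hop_addr_uint32'] <= unique_addr_df['end_ip_uint32']
print(f"{covered.sum()} of {len(unique_addr_df)} hop addresses fall inside their matched ipinfo range")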
In [33]:
cols_to_merge = ['hop_addr', 'city', 'latitude', 'longitude']

# add location data to full dataset
traceroutes_df = pd.merge(
    traceroutes_df,
    unique_addr_df[cols_to_merge],
    on='hop_addr',
    how='left'
)

rename_map = {
    "city": "ipinfo_city",
    "latitude": "ipinfo_lat",
    "longitude": "ipinfo_lon"
}

traceroutes_df = traceroutes_df.rename(columns=rename_map)
In [34]:
# how many hops have IPInfo locations?

(~traceroutes_df['ipinfo_city'].isna()).sum()
Out[34]:
24412016
In [35]:
# how many hops have no IPInfo locations (i.e., bogons)?

traceroutes_df['ipinfo_city'].isna().sum()
Out[35]:
2833206

Get ASN data¶

In [36]:
def find_day_url_v4():
    #https://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250418-1200.pfx2as.gz
    base_url = 'http://data.caida.org/datasets/routing/routeviews-prefix2as/{}/{:02d}'.format(2025, 4)
    daymask = '{}{:02d}{:02d}'.format(2025, 4, 19)
    dayurl = None
    try:
        page = requests.get(base_url, timeout=10).text
        soup = BeautifulSoup(page, 'html.parser')
        for node in soup.find_all('a'):
            href = node.get('href')
            if href and href.endswith('pfx2as.gz'):
                find_dayurl = f"{base_url}/{href}" # Construct full URL
                if daymask in find_dayurl:
                    dayurl = find_dayurl
                    break
    except requests.exceptions.RequestException as e:
        print(f"Error fetching CAIDA directory page {base_url}: {e}")
        sys.exit(99)

    if dayurl is None:
        print('Unable to find day-specific URL for {}'.format(daymask))
        sys.exit(99)
    return dayurl
In [37]:
# adds ASN and BGP prefix from CAIDA data to a csv with ipaddresses
def fetch_caida_data_as_radix():
    caida_url = find_day_url_v4()
    caida_radix = radix.Radix()

    try:
        sys.stdout.write('Retrieving {} ... '.format(caida_url))
        sys.stdout.flush()
        # retrieve the gzipped file and read the response as bytes

        response = urllib.request.urlopen(caida_url)
        gzFile = io.BytesIO()
        gzFile.write(response.read())

        print('OK')
    except os.error as err:
        print('Failed to download or parse {}'.format(caida_url))
        print(err)
        sys.exit(99)

    gzFile.seek(0, os.SEEK_END)
    print('Read {} bytes of compressed data'.format(gzFile.tell()))
    gzFile.seek(0, os.SEEK_SET)

    # Iterate over decompressed content
    pfx2as = gzip.GzipFile(fileobj=gzFile, mode='rb')

    pfx_count = 0

    for line in pfx2as:
        line = line.decode('utf-8')

        line = line.rstrip('\n')

        caida_vals = line.split('\t')

        if len(caida_vals) != 3:
            print('Invalid line in data retrieved from CAIDA ({})'.format(line))
            continue

        prefix = '{}/{}'.format(caida_vals[0], caida_vals[1])
        asn = caida_vals[2]


        rnode = caida_radix.add(prefix)
        rnode.data["AS"] = asn

        pfx_count += 1

        if (pfx_count % 100000) == 0:
            sys.stdout.write('+')
            sys.stdout.flush()
        elif (pfx_count % 10000) == 0:
            sys.stdout.write('-')
            sys.stdout.flush()

    pfx2as.close()

    print()
    print('Processed {} prefixes'.format(pfx_count))

    return caida_radix
In [38]:
print(f"\nFound {len(unique_addr_df)} unique non-null IP strings to look up.") # unique hop addresses, excluding bogons


if 'lookup_radix4' not in locals():
    lookup_radix4 = fetch_caida_data_as_radix()

# perform lookup
print(f"\nPerforming {len(unique_addr_df)} lookups in Radix tree...")
start_lookup_time = time.time()
lookup_results = []
successful_lookups = 0
failed_lookups = 0

for ip in unique_addr_df['hop_addr']:
    try:
        # search_best finds the longest matching prefix
        rnode = lookup_radix4.search_best(ip)
        if rnode: # Check if a node was found
            lookup_results.append({
                'hop_addr': ip, # The IP we looked up
                'hop_asn': rnode.data.get('AS', '-'), # Get AS, default to '-'
            })
            successful_lookups += 1
        else:
            # No matching prefix found in the tree
            lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
            failed_lookups += 1
    except ValueError: # Handle invalid IP format errors from radix lookup
         lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
         if failed_lookups < 10: print(f"Radix lookup failed for invalid IP format: {ip}")
         failed_lookups += 1
    except Exception as lookup_err: # Catch other potential errors
         lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
         if failed_lookups < 10: print(f"Radix lookup failed for {ip}: {lookup_err}")
         failed_lookups += 1

end_lookup_time = time.time()
print(f"Lookups completed in {end_lookup_time - start_lookup_time:.2f} seconds.")
print(f"  Successful: {successful_lookups}")
print(f"  Failed/Not Found: {failed_lookups}")

asn_df = pd.DataFrame(lookup_results)

print("\nMerging ASN/Prefix data back into the original DataFrame...")

# drop hop_asn if it already exists
if 'hop_asn' in traceroutes_df.columns:
    traceroutes_df = traceroutes_df.drop(columns=['hop_asn'])

traceroutes_df = pd.merge(
    traceroutes_df,
    asn_df[['hop_addr', 'hop_asn']],
    on='hop_addr',
    how='left'
)

# unknown ASes have value '-'
traceroutes_df['hop_asn'] = traceroutes_df['hop_asn'].fillna('-')
print("Merge complete.")
Found 93111 unique non-null IP strings to look up.
Retrieving http://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250419-0800.pfx2as.gz ... OK
Read 3585254 bytes of compressed data
---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---
Processed 1035774 prefixes

Performing 93111 lookups in Radix tree...
Lookups completed in 9.77 seconds.
  Successful: 84677
  Failed/Not Found: 8434

Merging ASN/Prefix data back into the original DataFrame...
Merge complete.
In [39]:
# how many ASes in the dataset?

traceroutes_df['hop_asn'].nunique()
Out[39]:
1827
In [40]:
# how many source ASNs?

ark_locs['vp_asn'].nunique()
Out[40]:
178

PTR record translation (Hoiho)¶

In [ ]:
perl = False # flip this to True to run the perl script (requires Perl and the JSON::XS module)
In [41]:
if perl:
    # configuration files
    ptr_records_file = './data/ptr_records.csv'
    hoiho_script = './data/hoiho/hoiho-apply.pl'
    hoiho_db = './data/hoiho/2024-08.midar-iff.geo-re.jsonl'

    # store unique PTR records
    print(f"\nSaving unique PTR records to {ptr_records_file}...")
    traceroutes_df['hop_name'] = traceroutes_df['hop_name'].str.rstrip('.') # OI data has a trailing dot
    unique_hop_names = traceroutes_df['hop_name'].unique()
    pd.DataFrame(unique_hop_names).to_csv(ptr_records_file, index=False, header=False)
    print(f"Saved {len(unique_hop_names)} unique names.")

    # Check if required files/scripts exist before running
    if not os.path.exists(hoiho_script):
         print(f"ERROR: Hoiho script not found at '{hoiho_script}'")
    elif not os.path.exists(hoiho_db):
         print(f"ERROR: Hoiho database not found at '{hoiho_db}'")
    elif not os.path.exists(ptr_records_file):
         print(f"ERROR: Input PTR file not found at '{ptr_records_file}'")
    else:
        print(f"\nExecuting Hoiho Perl script: {hoiho_script}...")
        command = f"cat {ptr_records_file} | perl {hoiho_script} {hoiho_db}"
        try:
            # run perl command
            process_result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                check=True
            )

            # Get the standard output from the completed process
            ptr_translations_content = process_result.stdout
            print(f"Perl script executed successfully. Output length: {len(ptr_translations_content)} chars.")

            # Check for errors printed to stderr
            if process_result.stderr:
                 print("\n--- Warnings/Errors from Perl script (stderr) ---")
                 print(process_result.stderr)
                 print("-------------------------------------------------")

            print("\nLoading PTR translations from script output...")
            if ptr_translations_content:
                ptr_map = pd.read_csv(
                    io.StringIO(ptr_translations_content),
                    sep=' ',
                    on_bad_lines='skip',
                    names=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'],
                    skipinitialspace=True
                )
                print(f"Loaded {len(ptr_map)} translation entries.")
            else:
                print("Warning: Perl script produced no output. PTR map will be empty.")
                ptr_map = pd.DataFrame(columns=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'])


        except FileNotFoundError:
            print(f"ERROR: 'perl' command not found. Is Perl installed and in your system's PATH?")
        except subprocess.CalledProcessError as e:
            print(f"ERROR: Perl script execution failed with exit code {e.returncode}.")
            print("--- Perl script STDOUT ---")
            print(e.stdout)
            print("--- Perl script STDERR ---")
            print(e.stderr)
        except Exception as e:
            print(f"An unexpected error occurred during subprocess execution: {e}")
else:
    traceroutes_df['hop_name'] = traceroutes_df['hop_name'].str.rstrip('.') # OI data has a trailing dot
    ptr_map = pd.read_csv('./data/hoiho/hoiho_translations.csv')

print("\nMerging PTR translations into final DataFrame...")
cols_to_merge = ['hop_name', 'ptr_lat', 'ptr_lon', 'ptr_city']

traceroutes_df = pd.merge(
    traceroutes_df,
    ptr_map[cols_to_merge],
    on='hop_name',
    how='left'
)
print("Merge complete.")
Saving unique PTR records to ./data/ptr_records.csv...
Saved 41759 unique names.

Executing Hoiho Perl script: ./data/hoiho/hoiho-apply.pl...
Perl script executed successfully. Output length: 1452294 chars.

--- Warnings/Errors from Perl script (stderr) ---
Use of uninitialized value in printf at ./data/hoiho/hoiho-apply.pl line 165, <STDIN> line 18677.
Use of uninitialized value in printf at ./data/hoiho/hoiho-apply.pl line 165, <STDIN> line 18849.

-------------------------------------------------

Loading PTR translations from script output...
Loaded 19613 translation entries.

Merging PTR translations into final DataFrame...
Merge complete.
In [42]:
# how many PTR records have a city location?

(~ptr_map['ptr_city'].isna()).sum()
Out[42]:
16565
In [43]:
# how many PTR records have no city location?

ptr_map['ptr_city'].isna().sum()
Out[43]:
3048
In [44]:
# how many hops have PTR city locations?

(~traceroutes_df['ptr_city'].isna()).sum()
Out[44]:
3340097
In [45]:
# how many hops have NO PTR city locations?

(traceroutes_df['ptr_city'].isna()).sum()
Out[45]:
23905125

Validate hop locations¶

In [46]:
# loc_valid

# calculate distance between 'vp_lat', 'vp_lon' and 'latitude', 'longitude'
# verify distance is smaller than 'ark_to_hop' (which is in km)

def haversine_vectorized(lat1, lon1, lat2, lon2, earth_radius_km=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees), returning distance in kilometers.
    Vectorized version for Pandas Series.
    """
    # Convert decimal degrees to radians
    lat1 = pd.to_numeric(lat1, errors='coerce')
    lon1 = pd.to_numeric(lon1, errors='coerce')
    lat2 = pd.to_numeric(lat2, errors='coerce')
    lon2 = pd.to_numeric(lon2, errors='coerce')

    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Use arctan2 for numerical stability
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance_km = earth_radius_km * c
    return distance_km

### verify IPinfo locations
# calculate distance
traceroutes_df['calculated_distance_km'] = haversine_vectorized(
    traceroutes_df['vp_lat'],
    traceroutes_df['vp_lon'],
    traceroutes_df['ipinfo_lat'],
    traceroutes_df['ipinfo_lon']
)


# the distance between the two points must be less than the maximum distance the packet could have travelled given the measured RTT
traceroutes_df['ipinfo_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']


### verify PTR record locations
traceroutes_df['calculated_distance_km'] = haversine_vectorized(
    traceroutes_df['vp_lat'],
    traceroutes_df['vp_lon'],
    traceroutes_df['ptr_lat'],
    traceroutes_df['ptr_lon']
)

traceroutes_df['ptr_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']

traceroutes_df.drop(columns=['calculated_distance_km'], inplace=True)

# ptr valids

valid_count = traceroutes_df['ptr_loc_valid'].sum()
total_count = (~traceroutes_df['ptr_city'].isna()).sum()

print(f"PTR locations {valid_count} valid out of {total_count}")

# ipinfo valids

valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
total_count = (~traceroutes_df['ipinfo_city'].isna()).sum()

print(f"IPInfo locations {valid_count} valid out of {total_count}")

either_valid = (traceroutes_df['ptr_loc_valid'] | traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with either PTR or IPInfo valid locations: {either_valid}")
PTR locations 3281087 valid out of 3340097
IPInfo locations 20713439 valid out of 24412016
Locations with either PTR or IPInfo valid locations: 20795606
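geopy is imported at the top of the notebook; a quick hedged cross-check of the vectorized haversine against geopy's great_circle on one pair of approximate city coordinates (Amsterdam to New York) confirms the two agree to within a few kilometres:

In [ ]:
# illustrative cross-check of haversine_vectorized against geopy (approximate coordinates)
own = haversine_vectorized(52.37, 4.90, 40.71, -74.01)          # Amsterdam -> New York
ref = distance.great_circle((52.37, 4.90), (40.71, -74.01)).km  # geopy reference
print(f"haversine: {own:.1f} km, geopy great_circle: {ref:.1f} km")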
In [47]:
# how many unique hop addresses have a valid ipinfo location?

traceroutes_df[traceroutes_df['ipinfo_loc_valid']]['hop_addr'].nunique()
Out[47]:
91890
In [48]:
# how many unique hop addresses have a valid PTR location?

traceroutes_df[traceroutes_df['ptr_loc_valid']]['hop_addr'].nunique()
Out[48]:
16929
In [49]:
# how many unique hop address for ipinfo excluding those with ptr location

unique_ptr_loc_valid = traceroutes_df[traceroutes_df['ptr_loc_valid']]['hop_addr'].unique()

unique_ip_info_valid = traceroutes_df[traceroutes_df['ipinfo_loc_valid']]['hop_addr'].unique()

only_in_ipinfo = np.setdiff1d(unique_ip_info_valid, unique_ptr_loc_valid)

len(only_in_ipinfo)
Out[49]:
75143
In [50]:
in_both = np.intersect1d(unique_ptr_loc_valid, unique_ip_info_valid)
len(in_both)
Out[50]:
16747
In [51]:
union = np.union1d(unique_ptr_loc_valid, unique_ip_info_valid)
len(union)
Out[51]:
92072
In [52]:
# get preferred locations (ptr if valid, else ipinfo if valid, else NaN)

conditions = [
    traceroutes_df['ptr_loc_valid'] == True,
    traceroutes_df['ipinfo_loc_valid'] == True
]

# For hop_city
city_choices = [
    traceroutes_df['ptr_city'],
    traceroutes_df['ipinfo_city']
]
# For hop_lat
lat_choices = [
    traceroutes_df['ptr_lat'],
    traceroutes_df['ipinfo_lat']
]

lon_choices = [
    traceroutes_df['ptr_lon'],
    traceroutes_df['ipinfo_lon']
]

# get hop location info
traceroutes_df['hop_city'] = np.select(conditions, city_choices, default=pd.NA)
traceroutes_df['hop_lat'] = np.select(conditions, lat_choices, default=np.nan)
traceroutes_df['hop_lon'] = np.select(conditions, lon_choices, default=np.nan)

traceroutes_df = traceroutes_df.drop(columns=
    [
        'ipinfo_city', 'ipinfo_lat', 'ipinfo_lon',
        'ptr_city', 'ptr_lat', 'ptr_lon',
        'ptr_loc_valid', 'ipinfo_loc_valid'
    ]
)

print(f"no locations available {(traceroutes_df['hop_city'].isna()).sum()}")
print(f"locations available {(~traceroutes_df['hop_city'].isna()).sum()}")
no locations available 6449628
locations available 20795594

Obtain results using 'latency neighbors'¶

In [53]:
# detect neighbors

print(f"Latency threshold for latency neighbors: {LATENCY_THRESHOLD} ms")

traceroutes_df['is_neighbor'] = abs(traceroutes_df['pop_rtt'] - traceroutes_df['hop_rtt']) < LATENCY_THRESHOLD
Latency threshold for latency neighbors: 3 ms
In [54]:
# for geolocating PoPs we only use VPs within 12ms of the anycast PoP
traceroutes_df_f = traceroutes_df[traceroutes_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY]
traceroutes_df_f
Out[54]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... vp_asn tx_airport pop_rtt ark_to_hop ark_to_pop hop_asn hop_city hop_lat hop_lon is_neighbor
0 aep3-ar 1.0.0.0 9 13 192.168.1.1 1 1.281 64 NaN NaN ... 7303 aep 5.976 130.890781 610.619288 - <NA> NaN NaN False
1 aep3-ar 1.0.0.0 9 13 192.168.0.1 2 1.606 63 NaN NaN ... 7303 aep 5.976 164.098825 610.619288 - <NA> NaN NaN False
2 aep3-ar 1.0.0.0 9 13 181.96.113.217 7 13.811 249 181.96.113.217 host217.181-96-113.telecom.net.ar ... 7303 aep 5.976 1411.188586 610.619288 7303 Buenos Aires -34.613152 -58.377232 False
3 aep3-ar 1.0.0.0 9 13 181.13.127.254 8 9.127 248 181.13.127.254 host254.181-13-127.telecom.net.ar ... 7303 aep 5.976 932.584044 610.619288 7303 Ciudad del Libertador General San Martín -34.577888 -58.538639 False
4 aep3-ar 1.0.0.0 9 13 1.0.0.0 9 12.355 56 NaN NaN ... 7303 aep 5.976 1262.416551 610.619288 13335 <NA> NaN NaN False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27245190 oak5-us 192.188.22.6 14 20 75.101.33.185 7 2.236 249 75.101.33.185 100.ae1.nrd1.equinix-sj.sonic.net ... 46375 oak 4.146 228.471340 423.632458 46375 San Jose 37.339390 -121.894958 True
27245191 oak5-us 192.188.22.6 14 20 206.223.116.137 8 2.320 57 206.223.116.137 eqx-sj.es.net ... 46375 oak 4.146 237.054342 423.632458 - San Jose 37.339390 -121.894958 True
27245192 oak5-us 192.188.22.6 14 20 198.129.100.30 12 4.476 53 198.129.100.30 lbnl59-cr6--esnetwest-se425.ip.es.net ... 46375 oak 4.146 457.351395 423.632458 292 San Jose 37.339390 -121.894958 True
27245193 oak5-us 192.188.22.6 14 20 198.129.100.29 13 4.431 243 198.129.100.29 site--esnetwest-se425.ip.es.net ... 46375 oak 4.146 452.753358 423.632458 292 San Jose 37.339390 -121.894958 True
27245194 oak5-us 192.188.22.6 14 20 192.188.22.6 14 4.146 51 192.188.22.6 nmx1.es.net ... 46375 oak 4.146 423.632458 423.632458 3443 San Jose 37.339390 -121.894958 True

12040885 rows × 24 columns

In [55]:
# how many valid tx_hostname,dst pairs remain?

traceroutes_df_f[['tx_hostname', 'dst']].drop_duplicates().shape[0]
Out[55]:
1705930
In [56]:
# how many valid dsts?

traceroutes_df_f['dst'].nunique()
Out[56]:
13706
In [57]:
# find cases where the VP itself is a latency neighbor

print("Finding Ark VP latency neighbors")
is_dest_hop = traceroutes_df_f['dst'] == traceroutes_df_f['hop_addr']
is_fast_dest_hop = is_dest_hop & (traceroutes_df_f['hop_rtt'] < LATENCY_THRESHOLD)

fast_dest_df = traceroutes_df_f[is_fast_dest_hop]
fast_dest_results_list = fast_dest_df.apply(
    lambda row: pd.Series({
        'tx_hostname': row['tx_hostname'],
        'dst': row['dst'],
        'inferred_city': row['tx_city'],
        'vp_lat': row['vp_lat'],
        'vp_lon': row['vp_lon'],
        'hop_lat': row['vp_lat'], # use vp coords as hop coords
        'hop_lon': row['vp_lon']
    }), axis=1
).drop_duplicates().to_dict('records')
print(f"Found {len(fast_dest_results_list)} routes resolved by latency neighbor VPs")


fast_dest_results_df = pd.DataFrame(fast_dest_results_list)
Finding Ark VP latency neighbors
Found 939187 routes resolved by latency neighbor VPs
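The row-wise apply above is the slow path; a hedged vectorized equivalent (building the same columns directly from fast_dest_df, stored under a new name so downstream cells are unaffected) looks like this:

In [ ]:
# illustrative vectorized equivalent of the row-wise apply above
fast_dest_results_vec = (
    fast_dest_df[['tx_hostname', 'dst', 'tx_city', 'vp_lat', 'vp_lon']]
    .rename(columns={'tx_city': 'inferred_city'})
    .assign(hop_lat=fast_dest_df['vp_lat'], hop_lon=fast_dest_df['vp_lon'])  # VP coords double as hop coords
    .drop_duplicates()
    .reset_index(drop=True)
)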
In [58]:
# Filter out hop == dst, to identify latency neighbor hops
hops_df_f = traceroutes_df_f[traceroutes_df_f['dst'] != traceroutes_df_f['hop_addr']]
print(f"DataFrame shape after removing all destination hops: {hops_df_f.shape}")

def find_closest_neighbor_data(group):
    """
    Finds the data (inferred city, VP lat/lon, hop lat/lon) of the latency neighbor
    closest to the destination (highest TTL), ensuring the location is valid.
    Returns a Series with the required data or NAs.
    """
    # Filter on latency neighbors of the anycast address
    neighbors = group[group['is_neighbor'] == True]

    # Filter on neighbors with city geolocation data (already validated)
    valid_city_neighbors = neighbors.dropna(subset=['hop_city'])

    if not valid_city_neighbors.empty:
        # Get the neighbor location closest to the VP (lowest TTL)
        closest_neighbor_to_vp = valid_city_neighbors.sort_values('hop_probe_ttl', ascending=True).iloc[0]
        # we only use latency neighbors if there exists one within 9ms of the VP (to avoid false geolocations due to RTT noise)

        # no neighbor within 9ms
        if closest_neighbor_to_vp['hop_rtt'] > 9:
            return pd.Series({
                'inferred_city': pd.NA,
                'vp_lat': pd.NA,
                'vp_lon': pd.NA,
                'hop_lat': pd.NA,
                'hop_lon': pd.NA
            }, index=['inferred_city', 'vp_lat', 'vp_lon', 'hop_lat', 'hop_lon'])

        closest_neighbor_to_pop = valid_city_neighbors.sort_values('hop_probe_ttl', ascending=False).iloc[0]

        # return location
        return pd.Series({
            'inferred_city': closest_neighbor_to_pop['hop_city'],
            'vp_lat': closest_neighbor_to_pop['vp_lat'],
            'vp_lon': closest_neighbor_to_pop['vp_lon'],
            'hop_lat': closest_neighbor_to_pop['hop_lat'],
            'hop_lon': closest_neighbor_to_pop['hop_lon']
        })
    else:
        # no valid location found
        return pd.Series({
            'inferred_city': pd.NA,
            'vp_lat': pd.NA,
            'vp_lon': pd.NA,
            'hop_lat': pd.NA,
            'hop_lon': pd.NA
        }, index=['inferred_city', 'vp_lat', 'vp_lon', 'hop_lat', 'hop_lon'])

# Group by tx_hostname, dst pairs
print("\nApplying neighbor logic to remaining routes...")
neighbor_results_df = hops_df_f.groupby(
        ['tx_hostname', 'dst'], observed=True, dropna=False
        ).apply(find_closest_neighbor_data).reset_index()

# combine results
print("\nCombining results from VP latency neighbors and hop latency neighbor logic...")
# Concatenate the two sets of results
inferred_df = pd.concat([fast_dest_results_df, neighbor_results_df], ignore_index=True)

# Prioritize fast_dest_results_df over neighbor_results_df
inferred_df = inferred_df.drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')

# remove NaNs
inferred_df = inferred_df[~inferred_df['inferred_city'].isna()]

inferred_df # may take a while
DataFrame shape after removing all destination hops: (10274463, 24)

Applying neighbor logic to remaining routes...
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/1495268981.py:54: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  neighbor_results_df = hops_df_f.groupby(
Combining results from VP latency neighbors and hop latency neighbor logic...
Out[58]:
tx_hostname dst inferred_city vp_lat vp_lon hop_lat hop_lon
0 dar-tz 192.31.196.1 Dar es Salaam -6.79 39.21 -6.79 39.21
1 dar-tz 192.33.14.30 Dar es Salaam -6.79 39.21 -6.79 39.21
2 dar-tz 192.58.128.30 Dar es Salaam -6.79 39.21 -6.79 39.21
3 dar-tz 194.0.17.1 Dar es Salaam -6.79 39.21 -6.79 39.21
4 dar-tz 194.0.27.12 Dar es Salaam -6.79 39.21 -6.79 39.21
... ... ... ... ... ... ... ...
2641136 zrh4-ch 97.74.111.51 Zürich 47.38 8.54 47.366669 8.55
2641137 zrh4-ch 97.74.96.0 Zürich 47.38 8.54 47.366669 8.55
2641138 zrh4-ch 97.74.97.35 Zürich 47.38 8.54 47.366669 8.55
2641139 zrh4-ch 97.74.98.0 Zürich 47.38 8.54 47.366669 8.55
2641140 zrh4-ch 97.74.99.57 Zürich 47.38 8.54 47.366669 8.55

1436047 rows × 7 columns
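The pandas DeprecationWarning above is triggered because the grouping columns are passed into the apply; since find_closest_neighbor_data never reads tx_hostname or dst, a hedged option on pandas 2.2+ is to exclude them explicitly (reset_index still restores the group keys):

In [ ]:
# illustrative: the same groupby-apply with grouping columns excluded (pandas >= 2.2), left commented out
# neighbor_results_df = hops_df_f.groupby(
#         ['tx_hostname', 'dst'], observed=True, dropna=False
#         ).apply(find_closest_neighbor_data, include_groups=False).reset_index()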

Airport clustering¶

In [4]:
airport_df = pd.read_csv('./data/airports.csv', sep='\t')

print("--- Pre-processing Airport Data ---")
start_time = time.time()

# split coords
coords = airport_df['lat long'].str.split(' ', expand=True)
airport_df['a_lat'] = pd.to_numeric(coords[0], errors='coerce')
airport_df['a_lon'] = pd.to_numeric(coords[1], errors='coerce')
airport_df['a_heuristic'] = pd.to_numeric(airport_df['pop'].str.split(' ', expand=True)[0], errors='coerce')

# convert airport coords to radians
airport_df['a_lat_rad'] = np.radians(airport_df['a_lat'])
airport_df['a_lon_rad'] = np.radians(airport_df['a_lon'])

# numpy arrays for fast lookups
airport_coords_rad = airport_df[['a_lat_rad', 'a_lon_rad']].values
airport_heuristics = airport_df['a_heuristic'].values
airport_lat = airport_df['a_lat'].values
airport_lon = airport_df['a_lon'].values
airport_iata = airport_df['#IATA'].values
airport_name = airport_df['name'].values


print(f"Airport pre-processing took {time.time() - start_time:.2f}s")

airport_df
--- Pre-processing Airport Data ---
Airport pre-processing took 0.01s
Out[4]:
#IATA size name lat long country_code city pop heuristic 1 2 3 a_lat a_lon a_heuristic a_lat_rad a_lon_rad
0 ASH large_airport Ashburn Airport 39.043611 -77.4875 US Ashburn 43511 0 0 0 NaN NaN NaN NaN 39.043611 -77.487500 43511 0.681440 -1.352412
1 AMX large_airport Amsterdam Airport Schiphol 52.3086013794 4.7638897896 NL Amsterdam 741636 0 0 0 NaN NaN NaN NaN 52.308601 4.763890 741636 0.912957 0.083146
2 AMS large_airport Amsterdam Airport Schiphol 52.3086013794 4.7638897896 NL Amsterdam 741636 0 0 0 NaN NaN NaN NaN 52.308601 4.763890 741636 0.912957 0.083146
3 AMZ large_airport Amsterdam Airport Schiphol 52.3086013794 4.7638897896 NL Amsterdam 741636 0 0 0 NaN NaN NaN NaN 52.308601 4.763890 741636 0.912957 0.083146
4 PIX large_airport San Francisco International Airport 37.6189994812 -122.375 US San Francisco 805235 0 -122.389979 37.615223 NaN NaN NaN NaN 37.618999 -122.375000 805235 0.656575 -2.135847
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3023 CPM large_airport Los Angeles International Airport 33.94250107 -118.4079971 US Los Angeles 3792621 0 0 0 NaN NaN NaN NaN 33.942501 -118.407997 3792621 0.592408 -2.066609
3024 OXR large_airport Los Angeles International Airport 33.94250107 -118.4079971 US Los Angeles 3792621 0 0 0 NaN NaN NaN NaN 33.942501 -118.407997 3792621 0.592408 -2.066609
3025 RHV medium_airport Evelio Javier Airport 10.765999794 121.932998657 PH San Jose 118807 2 0 0 NaN NaN NaN NaN 10.766000 121.932999 118807 0.187902 2.128132
3026 STO large_airport Stockholm-Arlanda Airport 59.6519012451 17.9186000824 SE Stockholm 1253309 0 0 0 NaN NaN NaN NaN 59.651901 17.918600 1253309 1.041122 0.312739
3027 CNJ large_airport Newark Liberty International Airport 40.6925010681 -74.1687011719 US Newark 277140 0 0 0 NaN NaN NaN NaN 40.692501 -74.168701 277140 0.710218 -1.294488

3028 rows × 16 columns

In [5]:
# get inferred locations radian coordinates
inferred_df['hop_lat'] = pd.to_numeric(inferred_df['hop_lat'], errors='coerce')
inferred_df['hop_lon'] = pd.to_numeric(inferred_df['hop_lon'], errors='coerce')

inferred_df['hop_lat_rad'] = np.radians(inferred_df['hop_lat'])
inferred_df['hop_lon_rad'] = np.radians(inferred_df['hop_lon'])
inferred_coords_rad = inferred_df[['hop_lat_rad', 'hop_lon_rad']].values

inferred_coords_rad
Out[5]:
array([[-0.11850786,  0.6843436 ],
       [-0.11850786,  0.6843436 ],
       [-0.11850786,  0.6843436 ],
       ...,
       [ 0.82670432,  0.14922565],
       [ 0.82670432,  0.14922565],
       [ 0.82670432,  0.14922565]], shape=(1477348, 2))
In [6]:
# create ball tree
print("\n--- Building BallTree ---")
tree = BallTree(airport_coords_rad, metric='haversine')

print("\n--- Querying nearby airports using BallTree ---")
start_time = time.time()
# find distances (and indices) of airports within the radius
nearby_indices_list, nearby_distances_list_rad = tree.query_radius(
    inferred_coords_rad, r=DISK_DISTANCE_RAD, return_distance=True
)

nearby_distances_list_km = [dist_rad * EARTH_RADIUS_KM for dist_rad in nearby_distances_list_rad]
print(f"BallTree query_radius took {time.time() - start_time:.2f}s")
--- Building BallTree ---

--- Querying nearby airports using BallTree ---
BallTree query_radius took 9.05s
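BallTree with metric='haversine' expects [lat, lon] pairs in radians and returns great-circle distances on a unit sphere, which is why the query radius is DISK_DISTANCE_RAD and the results are rescaled by EARTH_RADIUS_KM. A minimal hedged check on one well-known airport pair (approximate coordinates):

In [ ]:
# illustrative: unit-sphere distances from a haversine BallTree, rescaled to km
ams_fra = np.radians([[52.3086, 4.7639],   # Amsterdam Schiphol (lat, lon)
                      [50.0379, 8.5622]])  # Frankfurt Airport (approximate)
d_rad, _ = BallTree(ams_fra[:1], metric='haversine').query(ams_fra[1:], k=1)
print(f"AMS-FRA ~ {d_rad[0][0] * EARTH_RADIUS_KM:.0f} km")  # roughly 365 km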
In [7]:
# fallback airports (closest airport)
print("\n--- Finding the absolute closest airport (fallback) ---")
start_time = time.time()
# query returns distances (in radians) and indices for the k=1 nearest neighbor
closest_dist_rad, closest_idx = tree.query(inferred_coords_rad, k=1)
# closest_dist_km = closest_dist_rad[:, 0] * EARTH_RADIUS_KM # Distances in km
closest_indices = closest_idx[:, 0] # Get the index array
print(f"BallTree query (k=1) took {time.time() - start_time:.2f}s")
--- Finding the absolute closest airport (fallback) ---
BallTree query (k=1) took 15.09s
In [8]:
results = []
print("\n--- Calculating heuristic for nearby airports ---")
start_time = time.time()

for i in range(len(inferred_coords_rad)): # iterate through all final latency neighbors
    # retrieve the airports within the radius, and their distance from the latency neighbor
    nearby_indices = nearby_indices_list[i]
    nearby_distances_km = nearby_distances_list_km[i]

    # retrieve fallback airport for this point
    fallback_idx = closest_indices[i]
    fallback_airport_details = {
        'lat': airport_lat[fallback_idx],
        'lon': airport_lon[fallback_idx],
        'name': airport_name[fallback_idx],
        'iata': airport_iata[fallback_idx]
    }

    # heuristic calculation
    best_p = -1.0
    best_idx_in_disk = -1 # -1 -> no suitable airport found with heuristic

    if len(nearby_indices) > 0:
        # Filter again based on exact distance (though query_radius should be accurate)
        # This ensures we strictly adhere to DISK_DISTANCE_KM
        within_disk_mask = nearby_distances_km < DISK_DISTANCE_KM
        candidate_indices = nearby_indices[within_disk_mask]

        if len(candidate_indices) > 0:
            candidate_distances = nearby_distances_km[within_disk_mask]
            candidate_heuristics = airport_heuristics[candidate_indices]

            # Calculate totals for the denominator using ONLY candidates within the disk
            total_heuristic = np.sum(candidate_heuristics)
            # Sum of (Disk Distance - Actual Distance) for candidates
            total_inverse_distance_weighted = np.sum(DISK_DISTANCE_KM - candidate_distances)

            # Check denominators are valid AND non-zero
            if total_heuristic > 0 and total_inverse_distance_weighted > 1e-9: # Use small epsilon for float comparison
                # Calculate probability p for each candidate
                p_heuristic = ALPHA * (candidate_heuristics / total_heuristic)
                p_distance = (1 - ALPHA) * ((DISK_DISTANCE_KM - candidate_distances) / total_inverse_distance_weighted)
                p = p_heuristic + p_distance

                # Find the index *among candidates* with the highest p
                best_candidate_local_idx = np.argmax(p)
                # Map back to the original airport_df index
                best_idx_in_disk = candidate_indices[best_candidate_local_idx]

    if best_idx_in_disk != -1:
        # return best airport based on heuristic
        results.append({
            'lat': airport_lat[best_idx_in_disk],
            'lon': airport_lon[best_idx_in_disk],
            'name': airport_name[best_idx_in_disk],
            'iata': airport_iata[best_idx_in_disk]
        })
    else:
        # no airport in disk, use the closest airport
        results.append(fallback_airport_details)


print(f"Heuristic calculation loop took {time.time() - start_time:.2f}s")

print("\n--- Assembling final results ---")
# join results with inferred_df
results_df = pd.DataFrame(results, index=inferred_df.index)
final_df = inferred_df.join(results_df)
final_df.rename(columns={'lat': 'inferred_airport_lat',
                         'lon': 'inferred_airport_lon',
                         'name': 'inferred_airport_name',
                         'iata': 'inferred_airport_iata'}, inplace=True)
final_df
--- Calculating heuristic for nearby airports ---
Heuristic calculation loop took 19.81s

--- Assembling final results ---
Out[8]:
tx_hostname dst inferred_city vp_lat vp_lon hop_lat hop_lon clustered_inferred_city clustered_lat clustered_lon hop_lat_rad hop_lon_rad inferred_airport_lat inferred_airport_lon inferred_airport_name inferred_airport_iata
0 dar-tz 192.31.196.1 Dar es Salaam -6.79 39.21 -6.790000 39.21 Dar es Salaam -6.790000 39.21 -0.118508 0.684344 -6.878056 39.203333 Dar es Salaam TIX
1 dar-tz 192.33.14.30 Dar es Salaam -6.79 39.21 -6.790000 39.21 Dar es Salaam -6.790000 39.21 -0.118508 0.684344 -6.878056 39.203333 Dar es Salaam TIX
2 dar-tz 192.58.128.30 Dar es Salaam -6.79 39.21 -6.790000 39.21 Dar es Salaam -6.790000 39.21 -0.118508 0.684344 -6.878056 39.203333 Dar es Salaam TIX
3 dar-tz 194.0.17.1 Dar es Salaam -6.79 39.21 -6.790000 39.21 Dar es Salaam -6.790000 39.21 -0.118508 0.684344 -6.878056 39.203333 Dar es Salaam TIX
4 dar-tz 194.0.27.12 Dar es Salaam -6.79 39.21 -6.790000 39.21 Dar es Salaam -6.790000 39.21 -0.118508 0.684344 -6.878056 39.203333 Dar es Salaam TIX
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1477343 zrh4-ch 97.74.111.51 Zürich 47.38 8.54 47.366669 8.55 Zürich 47.366669 8.55 0.826704 0.149226 47.464699 8.549170 Zürich Airport ZRH
1477344 zrh4-ch 97.74.96.0 Zürich 47.38 8.54 47.366669 8.55 Zürich 47.366669 8.55 0.826704 0.149226 47.464699 8.549170 Zürich Airport ZRH
1477345 zrh4-ch 97.74.97.35 Zürich 47.38 8.54 47.366669 8.55 Zürich 47.366669 8.55 0.826704 0.149226 47.464699 8.549170 Zürich Airport ZRH
1477346 zrh4-ch 97.74.98.0 Zürich 47.38 8.54 47.366669 8.55 Zürich 47.366669 8.55 0.826704 0.149226 47.464699 8.549170 Zürich Airport ZRH
1477347 zrh4-ch 97.74.99.57 Zürich 47.38 8.54 47.366669 8.55 Zürich 47.366669 8.55 0.826704 0.149226 47.464699 8.549170 Zürich Airport ZRH

1477348 rows × 16 columns
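Note that with ALPHA = 1 (set in the constants cell), p reduces to the normalised population heuristic, so within each 100 km disk the airport with the largest heuristic wins and the distance term only matters for ALPHA < 1. A minimal sketch of the scoring with hypothetical values:

In [ ]:
# illustrative scoring of two hypothetical candidate airports inside one disk
h = np.array([800_000, 50_000])   # hypothetical population heuristics
d = np.array([80.0, 5.0])         # hypothetical distances from the inferred point (km)
p = ALPHA * (h / h.sum()) + (1 - ALPHA) * ((DISK_DISTANCE_KM - d) / (DISK_DISTANCE_KM - d).sum())
print(p, '-> picks index', np.argmax(p))  # with ALPHA = 1 the larger-heuristic airport wins despite being farther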

merge and compare with MAnycastR census¶

In [66]:
final_df['prefix'] = final_df['dst'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.0/24') # prefix for merging with census results
In [67]:
# get TR results
city_counts_per_dst = final_df.groupby('prefix')['inferred_airport_iata'].apply(lambda x: x.dropna().nunique())
In [68]:
# Merge census and traceroute enumeration results

print("Merging the two DataFrames...")
merged_df = pd.merge(
    census[['prefix', 'iGreedyICMPv4', 'MAnycastICMPv4']],
    city_counts_per_dst,
    on='prefix',
    how='outer'
)
print(f"Merged DataFrame created with {len(merged_df)} rows.")

# fill NaNs with 0
columns_to_fill = ['iGreedyICMPv4', 'inferred_airport_iata']
for col in columns_to_fill:
    merged_df[col] = merged_df[col].fillna(0)
    merged_df[col] = merged_df[col].astype(int)

# Reorder columns for better readability
final_columns = ['prefix', 'inferred_airport_iata', 'iGreedyICMPv4', 'MAnycastICMPv4']
merged_df = merged_df[final_columns]
merged_df = merged_df.rename(columns={'inferred_airport_iata': 'TR', 'iGreedyICMPv4': 'GCD'})

merged_df
Merging the two DataFrames...
Merged DataFrame created with 13765 rows.
Out[68]:
prefix TR GCD MAnycastICMPv4
0 1.0.0.0/24 86 71 27
1 1.1.1.0/24 82 71 28
2 1.10.10.0/24 2 3 1
3 1.12.0.0/24 6 5 5
4 1.12.12.0/24 3 3 3
... ... ... ... ...
13760 99.83.254.0/24 37 49 25
13761 99.83.255.0/24 34 49 25
13762 99.99.97.0/24 2 2 2
13763 99.99.98.0/24 2 2 2
13764 99.99.99.0/24 4 4 4

13765 rows × 4 columns

In [69]:
# add ASN data to merged_df

# Perform lookups for upstream IPs
upstream_lookups = 0
upstream_asns = []

for ip in merged_df['prefix']:
    if type(ip) is not str or len(ip) == 0:
        upstream_asns.append('-')
        continue

    try:
        rnode = lookup_radix4.search_best(ip)
        # search_best returns None when no covering prefix exists; rnode.data then raises and we record '-'
        upstream_asns.append(rnode.data['AS'])
        upstream_lookups += 1
    except Exception:
        upstream_asns.append('-')
        print('Radix tree lookup of {} failed'.format(ip))

print('{} successful lookups for {} entries'.format(upstream_lookups, len(merged_df)))


# Add additional columns to the output
merged_df['anycast_asn'] = upstream_asns

merged_df.head()
Radix tree lookup of 84.205.69.0/24 failed
Radix tree lookup of 84.205.75.0/24 failed
13763 successful lookups for 13765 entries
Out[69]:
prefix TR GCD MAnycastICMPv4 anycast_asn
0 1.0.0.0/24 86 71 27 13335
1 1.1.1.0/24 82 71 28 13335
2 1.10.10.0/24 2 3 1 148000
3 1.12.0.0/24 6 5 5 132203_45090
4 1.12.12.0/24 3 3 3 132203_45090
In [70]:
# determine which targets have completed traceroutes
merged_df['has_traceroute'] = merged_df['prefix'].isin(traceroutes_df['prefix'])
merged_df.head()
Out[70]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
0 1.0.0.0/24 86 71 27 13335 True
1 1.1.1.0/24 82 71 28 13335 True
2 1.10.10.0/24 2 3 1 148000 True
3 1.12.0.0/24 6 5 5 132203_45090 True
4 1.12.12.0/24 3 3 3 132203_45090 True
In [71]:
# GCD confirmed prefixes with no traceroute data
merged_df[~merged_df['has_traceroute'] & (merged_df['GCD'] > 1)]
Out[71]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
2573 109.122.206.0/24 0 9 7 49287 False
2930 134.231.255.0/24 0 43 16 396982 False
3912 151.242.40.0/24 0 2 2 36530_214025 False
4326 163.181.59.0/24 0 10 7 24429 False
4505 167.86.43.0/24 0 3 2 25773 False
5573 185.18.184.0/24 0 67 27 13335 False
5625 185.211.51.0/24 0 25 24 20473 False
5839 192.229.156.0/24 0 2 2 15133 False
5840 192.229.168.0/24 0 2 2 15133 False
6169 198.7.16.0/24 0 2 2 15133 False
6220 199.212.93.0/24 0 65 27 9327 False
6688 203.31.251.0/24 0 2 2 149512 False
6747 204.74.102.0/24 0 8 7 19905 False
6976 209.85.152.0/24 0 23 13 15169 False
6977 209.85.153.0/24 0 5 4 15169 False
6978 209.85.155.0/24 0 4 3 15169 False
6979 209.85.204.0/24 0 22 13 15169 False
6980 209.85.205.0/24 0 5 4 15169 False
6981 209.85.207.0/24 0 4 3 15169 False
7224 23.141.168.0/24 0 48 25 16509 False
9365 34.34.34.0/24 0 2 2 396982 False
11610 43.152.97.0/24 0 3 2 132203 False
11779 45.192.222.0/24 0 65 27 13335 False
12526 66.173.0.0/24 0 6 3 4181 False
13194 80.78.134.0/24 0 4 0 209241 False
13396 88.216.69.0/24 0 65 27 13335 False
13449 91.202.63.0/24 0 2 2 44571 False
13477 92.223.15.0/24 0 8 5 199524 False
13576 95.135.109.0/24 0 2 2 214025 False
13577 95.135.83.0/24 0 2 2 214025 False
In [72]:
# both anycast
merged_df[(merged_df['TR'] > 1) & (merged_df['GCD'] > 1)]
Out[72]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
0 1.0.0.0/24 86 71 27 13335 True
1 1.1.1.0/24 82 71 28 13335 True
2 1.10.10.0/24 2 3 1 148000 True
3 1.12.0.0/24 6 5 5 132203_45090 True
4 1.12.12.0/24 3 3 3 132203_45090 True
... ... ... ... ... ... ...
13760 99.83.254.0/24 37 49 25 16509 True
13761 99.83.255.0/24 34 49 25 16509 True
13762 99.99.97.0/24 2 2 2 7018 True
13763 99.99.98.0/24 2 2 2 7018 True
13764 99.99.99.0/24 4 4 4 7018 True

13478 rows × 6 columns

In [73]:
# iGreedy anycast but not traceroute anycast (includes 30 prefixes with no traceroute data)
merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2)]
Out[73]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
9 100.0.0.0/24 1 2 1 701 True
28 103.138.240.0/24 1 2 1 45177 True
35 103.144.246.0/24 1 2 1 138152 True
39 103.156.106.0/24 1 2 2 141202 True
42 103.158.186.0/24 1 2 2 18041 True
... ... ... ... ... ... ...
13580 95.181.180.0/24 1 3 4 210756 True
13581 95.181.181.0/24 1 2 4 210756 True
13582 95.181.182.0/24 1 3 4 210756 True
13622 98.143.88.0/24 1 2 2 19171 True
13625 98.96.233.0/24 0 3 3 62610 True

287 rows × 6 columns

In [74]:
# iGreedy-only prefixes that do have traceroute data
merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2) & merged_df['has_traceroute']]
Out[74]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
9 100.0.0.0/24 1 2 1 701 True
28 103.138.240.0/24 1 2 1 45177 True
35 103.144.246.0/24 1 2 1 138152 True
39 103.156.106.0/24 1 2 2 141202 True
42 103.158.186.0/24 1 2 2 18041 True
... ... ... ... ... ... ...
13580 95.181.180.0/24 1 3 4 210756 True
13581 95.181.181.0/24 1 2 4 210756 True
13582 95.181.182.0/24 1 3 4 210756 True
13622 98.143.88.0/24 1 2 2 19171 True
13625 98.96.233.0/24 0 3 3 62610 True

257 rows × 6 columns

In [75]:
# group GCD-only prefixes (not detected as anycast by traceroute) by number of sites found -> most are seen in only 2 or 3 locations

merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2) & merged_df['has_traceroute']].groupby('GCD')['prefix'].nunique()
Out[75]:
GCD
2     195
3      29
4       4
5       3
6       1
7       1
8       2
9       2
10      1
12      1
13      3
14      2
15      1
16      1
17      1
18      6
19      1
29      2
32      1
Name: prefix, dtype: int64
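To back up that observation numerically, here is a minimal sketch (reusing the filter above) that computes the share of these GCD-only prefixes found at no more than 3 sites:

gcd_only = merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2) & merged_df['has_traceroute']]
# fraction of GCD-only prefixes that iGreedy places at 2 or 3 sites
print(f"{(gcd_only['GCD'] <= 3).mean():.1%} of GCD-only prefixes are seen at 2-3 sites")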
In [76]:
# how many target ASes for traceroute?

merged_df[merged_df['has_traceroute']]['anycast_asn'].nunique()
Out[76]:
906
In [77]:
# targets without traceroute data

merged_df[~merged_df['has_traceroute'] & (merged_df['MAnycastICMPv4'] > 1)]
Out[77]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
2573 109.122.206.0/24 0 9 7 49287 False
2930 134.231.255.0/24 0 43 16 396982 False
3912 151.242.40.0/24 0 2 2 36530_214025 False
4326 163.181.59.0/24 0 10 7 24429 False
4505 167.86.43.0/24 0 3 2 25773 False
5573 185.18.184.0/24 0 67 27 13335 False
5625 185.211.51.0/24 0 25 24 20473 False
5839 192.229.156.0/24 0 2 2 15133 False
5840 192.229.168.0/24 0 2 2 15133 False
6169 198.7.16.0/24 0 2 2 15133 False
6220 199.212.93.0/24 0 65 27 9327 False
6688 203.31.251.0/24 0 2 2 149512 False
6747 204.74.102.0/24 0 8 7 19905 False
6976 209.85.152.0/24 0 23 13 15169 False
6977 209.85.153.0/24 0 5 4 15169 False
6978 209.85.155.0/24 0 4 3 15169 False
6979 209.85.204.0/24 0 22 13 15169 False
6980 209.85.205.0/24 0 5 4 15169 False
6981 209.85.207.0/24 0 4 3 15169 False
7224 23.141.168.0/24 0 48 25 16509 False
9365 34.34.34.0/24 0 2 2 396982 False
11610 43.152.97.0/24 0 3 2 132203 False
11779 45.192.222.0/24 0 65 27 13335 False
12526 66.173.0.0/24 0 6 3 4181 False
13396 88.216.69.0/24 0 65 27 13335 False
13449 91.202.63.0/24 0 2 2 44571 False
13477 92.223.15.0/24 0 8 5 199524 False
13576 95.135.109.0/24 0 2 2 214025 False
13577 95.135.83.0/24 0 2 2 214025 False
In [79]:
# which ASes had the most failed traceroutes?
merged_df[~merged_df['has_traceroute'] & (merged_df['MAnycastICMPv4'] > 1)].groupby('anycast_asn')['prefix'].nunique().sort_values(ascending=False)
Out[79]:
anycast_asn
15169           6
15133           3
13335           3
214025          2
396982          2
25773           1
49287           1
44571           1
4181            1
36530_214025    1
132203          1
24429           1
20473           1
199524          1
19905           1
16509           1
149512          1
9327            1
Name: prefix, dtype: int64
In [80]:
# only iGreedy

merged_df[(merged_df['TR'] < 2) & (merged_df['GCD'] > 1)]#['has_traceroute'].sum()
Out[80]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
9 100.0.0.0/24 1 2 1 701 True
28 103.138.240.0/24 1 2 1 45177 True
35 103.144.246.0/24 1 2 1 138152 True
39 103.156.106.0/24 1 2 2 141202 True
42 103.158.186.0/24 1 2 2 18041 True
... ... ... ... ... ... ...
13580 95.181.180.0/24 1 3 4 210756 True
13581 95.181.181.0/24 1 2 4 210756 True
13582 95.181.182.0/24 1 3 4 210756 True
13622 98.143.88.0/24 1 2 2 19171 True
13625 98.96.233.0/24 0 3 3 62610 True

287 rows × 6 columns

In [81]:
# unraveling MOAS infrastructures -> AS4812 is a peer of AS45090, but not of AS132203 [future work]

traceroutes_df[(traceroutes_df['dst'] == '1.12.14.16') & (traceroutes_df['tx_hostname'] == 'abz2-uk')][['tx_hostname', 'dst', 'hop_probe_ttl', 'hop_rtt', 'hop_addr', 'hop_asn', 'hop_city']]
Out[81]:
tx_hostname dst hop_probe_ttl hop_rtt hop_addr hop_asn hop_city
1460517 abz2-uk 1.12.14.16 1 0.297 137.50.19.1 786 Aberdeen
1460518 abz2-uk 1.12.14.16 2 0.499 137.50.0.33 786 Aberdeen
1460519 abz2-uk 1.12.14.16 3 0.508 137.50.255.225 786 Aberdeen
1460520 abz2-uk 1.12.14.16 4 0.375 137.50.255.150 786 Aberdeen
1460521 abz2-uk 1.12.14.16 5 0.537 146.97.128.9 786 <NA>
1460522 abz2-uk 1.12.14.16 6 3.936 146.97.37.173 786 Glasgow
1460523 abz2-uk 1.12.14.16 7 8.112 146.97.33.53 786 Manchester
1460524 abz2-uk 1.12.14.16 8 26.298 146.97.33.41 786 Luton
1460525 abz2-uk 1.12.14.16 9 13.954 146.97.33.21 786 London
1460526 abz2-uk 1.12.14.16 10 14.496 146.97.33.1 786 London
1460527 abz2-uk 1.12.14.16 11 14.643 83.97.89.1 21320 London
1460528 abz2-uk 1.12.14.16 13 217.812 202.97.97.5 4134 Shanghai
1460529 abz2-uk 1.12.14.16 14 219.227 202.97.50.194 4134 Shanghai
1460530 abz2-uk 1.12.14.16 15 191.071 202.97.94.238 4134 Shanghai
1460531 abz2-uk 1.12.14.16 16 203.739 101.95.224.226 4812 Shanghai
1460532 abz2-uk 1.12.14.16 18 226.057 101.226.210.70 4812 Shanghai
1460533 abz2-uk 1.12.14.16 23 227.985 1.12.14.16 132203_45090 Guangzhou
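As a starting point for that future work, a minimal sketch that lists MOAS prefixes, assuming (as seen in the census data, e.g. '132203_45090') that multiple origin ASes are joined with an underscore:

# prefixes whose origin set contains more than one AS
moas = merged_df[merged_df['anycast_asn'].str.contains('_', na=False)].copy()
moas['origin_set'] = moas['anycast_asn'].str.split('_')
moas[['prefix', 'anycast_asn', 'TR', 'GCD']].head()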
In [82]:
# GCD anycast, TR unicast
merged_df[(merged_df['TR'] == 1) & (merged_df['GCD'] > 1)]
Out[82]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute
9 100.0.0.0/24 1 2 1 701 True
28 103.138.240.0/24 1 2 1 45177 True
35 103.144.246.0/24 1 2 1 138152 True
39 103.156.106.0/24 1 2 2 141202 True
42 103.158.186.0/24 1 2 2 18041 True
... ... ... ... ... ... ...
13579 95.181.177.0/24 1 2 4 210756 True
13580 95.181.180.0/24 1 3 4 210756 True
13581 95.181.181.0/24 1 2 4 210756 True
13582 95.181.182.0/24 1 3 4 210756 True
13622 98.143.88.0/24 1 2 2 19171 True

204 rows × 6 columns

In [83]:
# TR anycast, GCD unicast

merged_df[(merged_df['TR'] > 1) & (merged_df['GCD'] == 1)]
Out[83]:
prefix TR GCD MAnycastICMPv4 anycast_asn has_traceroute

analyze 'hypergiants'¶

In [84]:
# per originating AS: number of anycast prefixes and mean TR/GCD PoP counts

merged_df.groupby('anycast_asn').agg(
    prefix_count = ('prefix', 'nunique'),
    TR_mean      = ('TR',     'mean'),
    GCD_mean     = ('GCD',    'mean')
).reset_index()
Out[84]:
anycast_asn prefix_count TR_mean GCD_mean
0 - 2 0.5 4.0
1 10029 1 2.0 2.0
2 1008 1 3.0 3.0
3 10122 2 3.5 3.5
4 10143 1 4.0 4.0
... ... ... ... ...
906 9507 1 2.0 2.0
907 9559 1 2.0 2.0
908 9678 1 3.0 3.0
909 9790 4 2.0 2.0
910 9829 1 2.0 2.0

911 rows × 4 columns
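The aggregate above is unsorted; a minimal sketch (the same aggregation, only sorted) to surface the largest anycast originators:

as_summary = merged_df.groupby('anycast_asn').agg(
    prefix_count=('prefix', 'nunique'),
    TR_mean=('TR', 'mean'),
    GCD_mean=('GCD', 'mean')
).sort_values('prefix_count', ascending=False)
as_summary.head(10)  # the hypergiants dominate the prefix counts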

In [85]:
# why does traceroute perform worse for Amazon?

amazon = merged_df[merged_df['anycast_asn'] == '16509']

amazon_traceroutes = traceroutes_df[traceroutes_df['prefix'].isin(amazon['prefix'])]

# what is the average number of VPs that completed a traceroute towards Amazon targets?

amazon_traceroutes.groupby('dst')['tx_hostname'].nunique().mean() # A: far below average number of VPs with completed traceroutes
Out[85]:
132.76661631419938
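For context, a minimal comparison sketch (reusing traceroutes_df and amazon_traceroutes from above) that puts the Amazon figure next to the all-target average:

overall_vps = traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
amazon_vps = amazon_traceroutes.groupby('dst')['tx_hostname'].nunique().mean()
# Amazon targets complete traceroutes from far fewer VPs than the average target
print(f"Amazon: {amazon_vps:.1f} VPs per target vs. {overall_vps:.1f} overall")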
In [86]:
# attribute the clusters visible in the enumeration CDF to specific ASes

merged_df[(merged_df['TR'] > 47) & (merged_df['TR'] < 51)].groupby('anycast_asn')['prefix'].nunique() # TR google cloud
Out[86]:
anycast_asn
13335        8
139070       5
15169      122
16509        2
19281        1
29997        2
394089       4
396982    3265
54113       17
8068         2
8075         2
Name: prefix, dtype: int64
In [87]:
merged_df[(merged_df['TR'] > 25) & (merged_df['TR'] < 45)].groupby('anycast_asn')['prefix'].nunique() # TR Amazon
Out[87]:
anycast_asn
10880     3
112       1
1149      2
13335     8
15169     5
         ..
6939      1
714       5
7280      5
8075     11
8674      2
Name: prefix, Length: 62, dtype: int64
In [88]:
merged_df[(merged_df['GCD'] > 44) & (merged_df['GCD'] < 51)].groupby('anycast_asn')['prefix'].nunique() # GCD Amazon
Out[88]:
anycast_asn
13335       13
16509     1302
19281        1
29997        4
396982      27
54113       51
8068         6
8075         4
Name: prefix, dtype: int64
In [89]:
merged_df[(merged_df['GCD'] > 40) & (merged_df['GCD'] < 45)].groupby('anycast_asn')['prefix'].nunique() # GCD Google
Out[89]:
anycast_asn
13335       18
139070       9
15169      284
19551        1
199524       3
21342       22
34939        1
394089       5
396982    3871
54113       13
8068         2
8075         7
Name: prefix, dtype: int64
In [90]:
merged_df[(merged_df['GCD'] > 60)].groupby('anycast_asn')['prefix'].nunique() # GCD CF
Out[90]:
anycast_asn
10886        1
11045        1
13335     2722
14197        4
147008       1
152594       1
152899       1
19254        8
19281        2
1958         1
1975         1
199535       1
201188       1
201839       1
2047         2
206998       1
208140       1
209242     301
209299       1
214618       1
21556        1
21787        1
23145        1
23467        2
2559         1
273584       1
27515        1
27647        5
29085        1
29729        1
30219        1
30811        1
328486       1
328608       1
33191        1
3557         1
397273       1
398291       1
398787      60
399077       4
399358       2
399566       2
399678       1
400797       1
40680        1
42          13
4749         6
4892         1
50756        1
53595        1
55532        3
62659        2
9327         3
Name: prefix, dtype: int64
In [91]:
merged_df[(merged_df['TR'] > 65)].groupby('anycast_asn')['prefix'].nunique() # TR CF
Out[91]:
anycast_asn
10886        1
11045        1
13335     2956
14197        4
147008       1
152594       1
152899       1
19254        8
19281        3
1958         1
1975         1
199535       1
201188       1
201839       1
2047         2
206998       1
208140       1
209242     301
209299       1
214618       1
21556        1
21787        1
23145        1
23467        2
2559         1
273584       1
27515        1
27647        5
29085        1
29729        1
30219        1
30811        1
328486       1
328608       1
33191        1
3557         1
397273       1
398291       1
398787      60
399077       4
399358       2
399566       2
399678       1
400797       1
40680        1
42          22
4749         6
4892         1
50756        1
53595        1
55532        3
62659        2
9327         2
Name: prefix, dtype: int64
In [123]:
igreedy_values = census['iGreedyICMPv4'].values
igreedy_values_sorted = np.sort(igreedy_values)
# Calculate CDF y-values
n_igreedy = len(igreedy_values_sorted)
y_cdf_igreedy = np.arange(1, n_igreedy + 1) / n_igreedy
print(f"Prepared {n_igreedy} iGreedyICMPv4 values (>1).")

city_counts_filtered = city_counts_per_dst[city_counts_per_dst > 1]

# Extract, sort enumeration values
city_count_values = city_counts_filtered.values
city_counts_sorted = np.sort(city_count_values)
# Calculate CDF values
n_city = len(city_counts_sorted)
y_cdf_city = np.arange(1, n_city + 1) / n_city
print(f"Prepared {n_city} city count values for {len(city_counts_per_dst)} destinations.")

print("\nPlotting combined CDFs...")

plt.figure(figsize=(12, 7))
# Plot GCD CDF
plt.plot(igreedy_values_sorted, y_cdf_igreedy,
         marker='.', linestyle='none', label='iGreedy', markersize=4)
# Plot TR CDF
plt.plot(city_counts_sorted, y_cdf_city,
         marker='x', linestyle='none', label='Traceroute', markersize=5)

plt.ylabel('Cumulative Fraction')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.ylim(0, 1.05)

all_x_values = np.concatenate((igreedy_values_sorted, city_counts_sorted))

max_x = np.max(all_x_values)
min_x_positive = all_x_values[all_x_values > 0].min() if np.any(all_x_values > 0) else 1

plt.rcParams.update({'font.size': 20})

plt.xlabel('Number of PoPs detected per prefix')
plt.xlim(left=0)
plt.xticks(np.arange(0, 90 + 1, 10))

plt.legend(markerscale=3)
plt.tight_layout()

plt.grid(True)

ellipse = patches.Ellipse((49, 0.55), width=5, height=0.4, # TR Google Cloud
                          angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)

ellipse = patches.Ellipse((35, 0.35), width=8, height=0.2, # TR Amazon
                          angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)

ellipse = patches.Ellipse((49, 0.7), width=4, height=0.2, # GCD Amazon
                          angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)

ellipse = patches.Ellipse((43, 0.5), width=4, height=0.4, # GCD Google
                          angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)

ellipse = patches.Ellipse((67, 0.9), width=18, height=0.3, # GCD Cloudflare
                          angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)

ellipse = patches.Ellipse((77, 0.9), width=18, height=0.3, # TR Cloudflare
                          angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)

plt.annotate('Google Cloud (GCD)', xy=(43, 0.5), xytext=(20, 0.8),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.annotate('Amazon (GCD)', xy=(49, 0.7), xytext=(40, 0.9),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.annotate('Google Cloud (TR)', xy=(49, 0.55), xytext=(44, 0.2),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.annotate('Amazon (TR)', xy=(35, 0.35), xytext=(30, 0.1),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.annotate('Cloudflare (TR)', xy=(77, 0.9), xytext=(70, 0.4),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.annotate('Cloudflare (GCD)', xy=(67, 0.9), xytext=(55, 0.6),
             arrowprops=dict(facecolor='black', arrowstyle='->'))

plt.savefig("./plots/enumeration_annotated.pdf", bbox_inches='tight', pad_inches=0)


plt.show()

print("\nDescriptive Statistics for iGreedy:")
print(pd.Series(igreedy_values_sorted).describe())

print("\nDescriptive Statistics for traceroute:")
print(pd.Series(city_counts_sorted).describe())
Prepared 13765 iGreedyICMPv4 values (>1).
Prepared 13478 city count values for 13682 destinations.

Plotting combined CDFs...
[Figure: CDF of the number of PoPs detected per prefix for iGreedy (GCD) and traceroute (TR), with the Google Cloud, Amazon, and Cloudflare clusters annotated]
Descriptive Statistics for iGreedy:
count    13765.000000
mean        38.650999
std         22.739557
min          2.000000
25%         15.000000
50%         43.000000
75%         58.000000
max         75.000000
dtype: float64

Descriptive Statistics for traceroute:
count    13478.000000
mean        42.829797
std         25.399726
min          2.000000
25%         20.000000
50%         48.000000
75%         66.000000
max         87.000000
dtype: float64
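The descriptive statistics compare the two distributions as a whole; a paired per-prefix view is a useful complement. A minimal sketch, assuming merged_df as built earlier (TR and GCD columns, has_traceroute flag):

paired = merged_df[merged_df['has_traceroute']]
# fraction of prefixes for which traceroute enumerates more PoPs than GCD, and vice versa
print(f"TR > GCD: {(paired['TR'] > paired['GCD']).mean():.1%} of prefixes")
print(f"GCD > TR: {(paired['GCD'] > paired['TR']).mean():.1%} of prefixes")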

run analysis w/o latency neighbor constraints toward root letters¶

In [93]:
# get results for root-letters, without using latency neighbors

roots = [
'198.41.0.4',
'170.247.170.2',
'192.33.4.12',
'199.7.91.13',
'192.203.230.10',
'192.5.5.0',
#'192.112.36.4', # g-root unresponsive to ICMP
'198.97.190.53',
'192.36.148.17',
'192.58.128.30',
'193.0.14.129',
'199.7.83.42',
'202.12.27.33']
In [94]:
traceroutes_roots = traceroutes_df[traceroutes_df['dst'].isin(roots)]
traceroutes_roots
Out[94]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... vp_asn tx_airport pop_rtt ark_to_hop ark_to_pop hop_asn hop_city hop_lat hop_lon is_neighbor
1095 aep3-ar 199.7.91.13 10 16 192.168.1.1 1 0.497 64 NaN NaN ... 7303 aep 13.934 50.782762 1423.756554 - <NA> NaN NaN False
1096 aep3-ar 199.7.91.13 10 16 192.168.0.1 2 1.201 63 NaN NaN ... 7303 aep 13.934 122.716494 1423.756554 - <NA> NaN NaN False
1097 aep3-ar 199.7.91.13 10 16 181.96.62.70 7 13.194 249 181.96.62.70 host70.181-96-62.telecom.net.ar ... 7303 aep 13.934 1348.144393 1423.756554 7303 Buenos Aires -34.613152 -58.377232 True
1098 aep3-ar 199.7.91.13 10 16 45.68.8.12 8 8.703 248 NaN NaN ... 7303 aep 13.934 889.260319 1423.756554 - Buenos Aires -34.613152 -58.377232 False
1099 aep3-ar 199.7.91.13 10 16 200.115.95.118 9 11.412 56 NaN NaN ... 7303 aep 13.934 1166.062135 1423.756554 52376 Buenos Aires -34.613152 -58.377232 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27244876 oak5-us 192.203.230.10 9 12 142.254.59.146 3 8.355 253 142.254.59.146 142-254-59-146.static.sonic.net ... 46375 oak 3.077 853.702168 314.403539 46375 San Jose 37.339390 -121.894958 False
27244877 oak5-us 192.203.230.10 9 12 157.131.211.153 4 19.666 252 157.131.211.153 ae6.cr2.hywrca01.sonic.net ... 46375 oak 3.077 2009.444264 314.403539 46375 San Carlos 37.507160 -122.260521 False
27244878 oak5-us 192.203.230.10 9 12 75.101.33.185 7 2.885 249 75.101.33.185 100.ae1.nrd1.equinix-sj.sonic.net ... 46375 oak 3.077 294.785249 314.403539 46375 San Jose 37.339390 -121.894958 True
27244879 oak5-us 192.203.230.10 9 12 206.223.117.75 8 2.830 57 206.223.117.75 equinix-sjc.woodynet.net ... 46375 oak 3.077 289.165426 314.403539 - San Jose 37.362598 -121.929001 True
27244880 oak5-us 192.203.230.10 9 12 192.203.230.10 9 3.077 56 192.203.230.10 e.root-servers.net ... 46375 oak 3.077 314.403539 314.403539 21556 <NA> NaN NaN True

24759 rows × 24 columns

In [95]:
traceroute_roots_phops = traceroutes_roots[traceroutes_roots['dst'] != traceroutes_roots['hop_addr']].sort_values(by='hop_probe_ttl', ascending=False)\
                    .drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
traceroute_roots_phops = traceroute_roots_phops[~traceroute_roots_phops['hop_city'].isna()] # remove NaN phops (bogons)
traceroute_roots_phops
Out[95]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... vp_asn tx_airport pop_rtt ark_to_hop ark_to_pop hop_asn hop_city hop_lat hop_lon is_neighbor
812326 hnl4-us 192.58.128.30 25 30 178.172.80.1 24 222.748 227 NaN NaN ... 20473 hnl 222.398 22760.077851 22724.315343 2107 Ljubljana 46.051079 14.505130 True
19314852 puw-us 170.247.170.2 24 26 62.115.139.35 22 111.478 232 62.115.139.35 rest-bb1-link.ip.twelve99.net ... 20055 puw 88.403 11390.665500 9032.894402 1299 Washington 38.895111 -77.036369 False
17926032 gye-ec 192.58.128.30 23 28 178.172.80.1 22 201.306 251 NaN NaN ... 61468 gye 201.245 20569.164401 20562.931506 2107 Ljubljana 46.051079 14.505130 True
16375275 bed-us 198.97.190.53 25 31 96.110.36.90 21 48.696 247 96.110.36.90 be-3301-pe01.nota.fl.ibone.comcast.net ... 7922 bed 45.818 4975.688900 4681.618901 7922 Miami 25.774269 -80.193657 True
12194477 ens-nl 192.58.128.30 22 23 140.189.8.9 21 125.316 238 140.189.8.9 r-chippewatc-hub-2-ae3.ip4.wiscnet.net ... 1133 ens 119.394 12804.612908 12199.511264 2381 Plum City 44.629131 -92.192398 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25260722 pry-za 193.0.14.129 2 2 196.216.3.3 1 0.617 255 196.216.3.3 vl250.ar01.jnb.afrinic.net ... 33764 pry 1.052 63.044194 107.491883 33764 Johannesburg -26.202271 28.043631 True
1576510 beg-rs 193.0.14.129 2 2 193.105.163.133 1 0.567 255 193.105.163.133 133.sox.rs ... 13004 beg 0.403 57.935264 41.177974 13004 Belgrade 44.804008 20.465130 True
1581976 beg-rs 199.7.83.42 4 6 193.105.163.133 1 0.586 255 193.105.163.133 133.sox.rs ... 13004 beg 24.948 59.876657 2549.151607 13004 Belgrade 44.804008 20.465130 False
18800163 dac-bd 192.58.128.30 2 2 202.59.208.1 1 0.057 64 NaN NaN ... 24122 dac 0.107 5.824180 10.933110 9825 Dhaka 23.710400 90.407440 True
20089856 sql-us 170.247.170.2 4 6 192.158.252.3 1 0.222 255 192.158.252.3 r2.pao2-guests.isc.org ... 1280 sql 56.526 22.683648 5775.747305 1280 Palo Alto 37.441879 -122.143021 False

2990 rows × 24 columns

In [96]:
# convert the inferred p-hop locations to radian coordinates
traceroute_roots_phops['hop_lat'] = pd.to_numeric(traceroute_roots_phops['hop_lat'], errors='coerce')
traceroute_roots_phops['hop_lon'] = pd.to_numeric(traceroute_roots_phops['hop_lon'], errors='coerce')

traceroute_roots_phops['hop_lat_rad'] = np.radians(traceroute_roots_phops['hop_lat'])
traceroute_roots_phops['hop_lon_rad'] = np.radians(traceroute_roots_phops['hop_lon'])
inferred_coords_rad_roots = traceroute_roots_phops[['hop_lat_rad', 'hop_lon_rad']].values
In [97]:
# sanity check: no NaN coordinates remain among the root-letter p-hops
np.isnan(inferred_coords_rad_roots).any(axis=1)
Out[97]:
array([False, False, False, ..., False, False, False])
In [98]:
# query the airport BallTree for root-letter p-hop locations

nearby_indices_list, nearby_distances_list_rad = tree.query_radius(
    inferred_coords_rad_roots, r=DISK_DISTANCE_RAD, return_distance=True
)

nearby_distances_list_km = [dist_rad * EARTH_RADIUS_KM for dist_rad in nearby_distances_list_rad]

# fallback airport (closest airport)
start_time = time.time()

closest_dist_rad, closest_idx = tree.query(inferred_coords_rad_roots, k=1)

closest_indices = closest_idx[:, 0]

print(f"BallTree query (k=1) took {time.time() - start_time:.2f}s")
BallTree query (k=1) took 0.03s
In [99]:
# get inferred locations

results = []
print("\n--- Calculating heuristic for nearby airports ---")
start_time = time.time()

for i in range(len(inferred_coords_rad_roots)): # iterate through all final latency neighbors
    # retrieve the airports within the radius, and their distance from the latency neighbor
    nearby_indices = nearby_indices_list[i]
    nearby_distances_km = nearby_distances_list_km[i]

    # retrieve fallback airport for this point
    fallback_idx = closest_indices[i]
    fallback_airport_details = {
        'lat': airport_lat[fallback_idx],
        'lon': airport_lon[fallback_idx],
        'name': airport_name[fallback_idx],
        'iata': airport_iata[fallback_idx]
    }

    # heuristic calculation
    best_p = -1.0
    best_idx_in_disk = -1 # -1 -> no suitable airport found with heuristic

    if len(nearby_indices) > 0:
        # Filter again based on exact distance (though query_radius should be accurate)
        # This ensures we strictly adhere to DISK_DISTANCE_KM
        within_disk_mask = nearby_distances_km < DISK_DISTANCE_KM
        candidate_indices = nearby_indices[within_disk_mask]

        if len(candidate_indices) > 0:
            candidate_distances = nearby_distances_km[within_disk_mask]
            candidate_heuristics = airport_heuristics[candidate_indices]

            # Calculate totals for the denominator using ONLY candidates within the disk
            total_heuristic = np.sum(candidate_heuristics)
            # Sum of (Disk Distance - Actual Distance) for candidates
            total_inverse_distance_weighted = np.sum(DISK_DISTANCE_KM - candidate_distances)

            # Check denominators are valid AND non-zero
            if total_heuristic > 0 and total_inverse_distance_weighted > 1e-9: # Use small epsilon for float comparison
                # Calculate probability p for each candidate
                p_heuristic = ALPHA * (candidate_heuristics / total_heuristic)
                p_distance = (1 - ALPHA) * ((DISK_DISTANCE_KM - candidate_distances) / total_inverse_distance_weighted)
                p = p_heuristic + p_distance

                # Find the index *among candidates* with the highest p
                best_candidate_local_idx = np.argmax(p)
                # Map back to the original airport_df index
                best_idx_in_disk = candidate_indices[best_candidate_local_idx]

    if best_idx_in_disk != -1:
        # return best airport based on heuristic
        results.append({
            'lat': airport_lat[best_idx_in_disk],
            'lon': airport_lon[best_idx_in_disk],
            'name': airport_name[best_idx_in_disk],
            'iata': airport_iata[best_idx_in_disk]
        })
    else:
        # no airport in disk, use the closest airport
        results.append(fallback_airport_details)


print(f"Heuristic calculation loop took {time.time() - start_time:.2f}s")

print("\n--- Assembling final results ---")
# join results with inferred_df
final_roots = traceroute_roots_phops.join(pd.DataFrame(results, index=traceroute_roots_phops.index))
final_roots.rename(columns={'lat': 'inferred_airport_lat',
                         'lon': 'inferred_airport_lon',
                         'name': 'inferred_airport_name',
                         'iata': 'inferred_airport_iata'}, inplace=True)
final_roots
--- Calculating heuristic for nearby airports ---
Heuristic calculation loop took 0.06s

--- Assembling final results ---
Out[99]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_city hop_lat hop_lon is_neighbor hop_lat_rad hop_lon_rad inferred_airport_lat inferred_airport_lon inferred_airport_name inferred_airport_iata
812326 hnl4-us 192.58.128.30 25 30 178.172.80.1 24 222.748 227 NaN NaN ... Ljubljana 46.051079 14.505130 True 0.803743 0.253162 46.223701 14.457600 Ljubljana Jože Pučnik Airport LJU
19314852 puw-us 170.247.170.2 24 26 62.115.139.35 22 111.478 232 62.115.139.35 rest-bb1-link.ip.twelve99.net ... Washington 38.895111 -77.036369 False 0.678848 -1.344538 39.175400 -76.668297 Baltimore/Washington International Thurgood Marshall Airport BWI
17926032 gye-ec 192.58.128.30 23 28 178.172.80.1 22 201.306 251 NaN NaN ... Ljubljana 46.051079 14.505130 True 0.803743 0.253162 46.223701 14.457600 Ljubljana Jože Pučnik Airport LJU
16375275 bed-us 198.97.190.53 25 31 96.110.36.90 21 48.696 247 96.110.36.90 be-3301-pe01.nota.fl.ibone.comcast.net ... Miami 25.774269 -80.193657 True 0.449846 -1.399643 25.793200 -80.290604 Miami NAP
12194477 ens-nl 192.58.128.30 22 23 140.189.8.9 21 125.316 238 140.189.8.9 r-chippewatc-hub-2-ae3.ip4.wiscnet.net ... Plum City 44.629131 -92.192398 False 0.778925 -1.609061 44.882000 -93.221802 Minneapolis-St Paul International/Wold-Chamberlain Airport MSP
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25260722 pry-za 193.0.14.129 2 2 196.216.3.3 1 0.617 255 196.216.3.3 vl250.ar01.jnb.afrinic.net ... Johannesburg -26.202271 28.043631 True -0.457316 0.489454 -25.938499 27.926100 Lanseria Airport HLA
1576510 beg-rs 193.0.14.129 2 2 193.105.163.133 1 0.567 255 193.105.163.133 133.sox.rs ... Belgrade 44.804008 20.465130 True 0.781977 0.357184 44.818401 20.309099 Belgrade Nikola Tesla Airport BEG
1581976 beg-rs 199.7.83.42 4 6 193.105.163.133 1 0.586 255 193.105.163.133 133.sox.rs ... Belgrade 44.804008 20.465130 False 0.781977 0.357184 44.818401 20.309099 Belgrade Nikola Tesla Airport BEG
18800163 dac-bd 192.58.128.30 2 2 202.59.208.1 1 0.057 64 NaN NaN ... Dhaka 23.710400 90.407440 True 0.413825 1.577907 23.843347 90.397783 Dhaka / Hazrat Shahjalal International Airport DAC
20089856 sql-us 170.247.170.2 4 6 192.158.252.3 1 0.222 255 192.158.252.3 r2.pao2-guests.isc.org ... Palo Alto 37.441879 -122.143021 False 0.653484 -2.131798 37.362598 -121.929001 Norman Y. Mineta San Jose International Airport SJC

2990 rows × 30 columns

In [100]:
final_roots.groupby('dst')['inferred_airport_iata'].nunique() # number of sites found per root-letter (without latency constraints)
Out[100]:
dst
170.247.170.2     13
192.203.230.10    95
192.33.4.12       13
192.36.148.17     47
192.5.5.0         82
192.58.128.30     63
193.0.14.129      51
198.41.0.4        17
198.97.190.53     30
199.7.83.42       52
199.7.91.13       75
202.12.27.33      17
Name: inferred_airport_iata, dtype: int64
In [101]:
# get average enumeration increase

count_differences = []

print("--- Calculating Counts per Destination ---")

# --- Loop and Calculate ---
for letter in roots:
    # Calculate p-hop count for the current dst
    p_hop_count = final_roots[final_roots['dst'] == letter]['inferred_airport_iata'].nunique()

    # Calculate latency neighbor count for the current dst
    latency_count = final_df[final_df['dst'] == letter]['inferred_airport_iata'].nunique()

    # Calculate the difference (increase = p-hop - latency)
    difference = p_hop_count - latency_count
    count_differences.append(difference)



print("\n--- Average Calculation ---")
average_increase = sum(count_differences) / len(count_differences)

print(f"Total number of destinations processed: {len(count_differences)}")
print(f"Average increase (p-hop unique airports - latency unique airports): {average_increase:.4f}")
--- Calculating Counts per Destination ---

--- Average Calculation ---
Total number of destinations processed: 12
Average increase (p-hop unique airports - latency unique airports): 11.9167
In [102]:
# compare number of sites found for both TR techniques, per root-letter
for letter in roots:
    print(letter)
    print("p-hop count")
    print(final_roots[final_roots['dst'] == letter]['inferred_airport_iata'].nunique())
    print("latency neighbor count")
    print(final_df[final_df['dst'] == letter]['inferred_airport_iata'].nunique())
    print('\n')
198.41.0.4
p-hop count
17
latency neighbor count
15


170.247.170.2
p-hop count
13
latency neighbor count
7


192.33.4.12
p-hop count
13
latency neighbor count
9


199.7.91.13
p-hop count
75
latency neighbor count
69


192.203.230.10
p-hop count
95
latency neighbor count
85


192.5.5.0
p-hop count
82
latency neighbor count
75


198.97.190.53
p-hop count
30
latency neighbor count
13


192.36.148.17
p-hop count
47
latency neighbor count
31


192.58.128.30
p-hop count
63
latency neighbor count
41


193.0.14.129
p-hop count
51
latency neighbor count
23


199.7.83.42
p-hop count
52
latency neighbor count
32


202.12.27.33
p-hop count
17
latency neighbor count
12
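The same per-letter comparison can be collected into a single table; a minimal sketch reusing roots, final_roots, and final_df from above:

comparison = pd.DataFrame({
    'p_hop_sites': [final_roots[final_roots['dst'] == r]['inferred_airport_iata'].nunique() for r in roots],
    'latency_sites': [final_df[final_df['dst'] == r]['inferred_airport_iata'].nunique() for r in roots],
}, index=roots)
comparison['increase'] = comparison['p_hop_sites'] - comparison['latency_sites']
comparison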


run analysis with all phops¶

In [104]:
# get p-hops

p_hops = traceroutes_df[traceroutes_df['dst'] != traceroutes_df['hop_addr']].sort_values(by='hop_probe_ttl', ascending=False)\
                    .drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
# get the highest-TTL hop that is not the destination, for each (VP, target) pair

p_hops.head()
Out[104]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... vp_asn tx_airport pop_rtt ark_to_hop ark_to_pop hop_asn hop_city hop_lat hop_lon is_neighbor
19208202 puw-us 23.11.40.7 47 60 23.203.145.249 43 289.470 219 23.203.145.249 ae34.r03.border101.sea01.fab.netarch.akamai.com ... 20055 puw 151.139 29577.638119 15443.170787 20940 Seattle 47.449001 -122.308998 False
3886303 hkg5-cn 23.11.40.7 44 56 23.40.189.1 43 453.065 28 23.40.189.1 vlan100.r01.tor01.hkg01.fab.netarch.akamai.com ... 9304 hkg 230.783 46293.545494 23581.082868 20940 Hong Kong 22.320304 114.198074 False
16388104 bed-us 23.11.38.17 39 52 23.198.9.65 38 340.269 31 23.198.9.65 vlan100.r05.tor01.bos01.fab.netarch.akamai.com ... 7922 bed 172.131 34768.208605 17588.103869 20940 Boston 42.364300 -71.005203 False
2510835 ind-us 23.11.38.17 38 49 154.24.97.54 37 462.866 223 NaN NaN ... 14333 ind 229.942 47294.997911 23495.150669 174 Indianapolis 39.768379 -86.158043 False
19661446 cpv-br 23.11.38.17 40 52 23.32.63.57 37 590.487 225 23.32.63.57 ae4.r01.gru01.icn.netarch.akamai.com ... 1916 cpv 327.031 60335.132483 33415.568354 20940 Sao Paulo -23.435556 -46.473057 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17064520 acc-gh 72.42.124.1 3 4 196.49.14.1 1 0.341 255 NaN NaN ... 30997 acc 0.406 34.842901 41.484510 30997 Accra 5.556020 -0.196900 True
17064522 acc-gh 72.42.125.1 3 4 196.49.14.1 1 0.335 255 NaN NaN ... 30997 acc 0.320 34.229830 32.697151 30997 Accra 5.556020 -0.196900 True
17064524 acc-gh 72.42.126.1 3 4 196.49.14.1 1 0.330 255 NaN NaN ... 30997 acc 0.368 33.718937 37.601723 30997 Accra 5.556020 -0.196900 True
17064526 acc-gh 72.42.127.1 3 4 196.49.14.1 1 0.326 255 NaN NaN ... 30997 acc 0.331 33.310222 33.821115 30997 Accra 5.556020 -0.196900 True
16999300 acc-gh 23.0.0.1 2 2 196.49.14.1 1 0.349 255 NaN NaN ... 30997 acc 16.481 35.660330 1684.005437 30997 Accra 5.556020 -0.196900 False

3350019 rows × 24 columns

In [105]:
# find the nearest Airport for each inferred location

def merge_nearest_airport(inferred_locs_df, locs_df):
    """
    Finds the nearest airport from final_df for each hop in inferred_df
    and merges airport details and distance.
    """
    # Drop airports with missing coordinates and reset index to keep track
    locs_df['inferred_airport_lat'] = pd.to_numeric(locs_df['inferred_airport_lat'], errors='coerce')
    locs_df['inferred_airport_lon'] = pd.to_numeric(locs_df['inferred_airport_lon'], errors='coerce')


    airports_valid = locs_df.dropna(subset=['inferred_airport_lat', 'inferred_airport_lon']).copy()


    # Convert valid airport coordinates to radians for BallTree
    airports_valid['lat_rad'] = np.radians(airports_valid['inferred_airport_lat'])
    airports_valid['lon_rad'] = np.radians(airports_valid['inferred_airport_lon'])
    airport_coords_rad = airports_valid[['lat_rad', 'lon_rad']].values

    # Identify hops with valid coordinates
    valid_hop_mask = inferred_locs_df['hop_lat'].notna() & inferred_locs_df['hop_lon'].notna()
    valid_hops = inferred_locs_df.loc[valid_hop_mask].copy()

    # Convert valid hop coordinates to radians
    valid_hops['hop_lat'] = pd.to_numeric(valid_hops['hop_lat'], errors='coerce')
    valid_hops['hop_lon'] = pd.to_numeric(valid_hops['hop_lon'], errors='coerce')

    valid_hops['lat_rad'] = np.radians(valid_hops['hop_lat'])
    valid_hops['lon_rad'] = np.radians(valid_hops['hop_lon'])
    hop_coords_rad = valid_hops[['lat_rad', 'lon_rad']].values

    # Build tree on airport coordinates
    tree = BallTree(airport_coords_rad, metric='haversine')

    # Query the tree for the nearest airport (k=1) for each valid hop
    # Returns distances in radians and indices *relative to airport_coords_rad*
    distances_rad, indices_in_tree = tree.query(hop_coords_rad, k=1)

    # Get the original indices from final_df corresponding to the nearest airports
    nearest_airport_original_indices = airports_valid.index[indices_in_tree.flatten()]

    # Calculate distances in kilometers
    distances_km = distances_rad.flatten() * EARTH_RADIUS_KM

    # Create a DataFrame with results, indexed by the original index of valid_hops
    results_df = pd.DataFrame({
        'nearest_airport_original_idx': nearest_airport_original_indices,
        'distance_to_nearest_airport_km': distances_km
    }, index=valid_hops.index) # Use the original index from inferred_df

    # Create the new columns in the original inferred_df, initially filled with NaN
    for col in ['nearest_airport_lat', 'nearest_airport_lon', 'nearest_airport_name',
                'nearest_airport_iata', 'distance_to_nearest_airport_km']:
        inferred_locs_df[col] = np.nan

    # Fill distance for valid hops
    inferred_locs_df.loc[results_df.index, 'distance_to_nearest_airport_km'] = results_df['distance_to_nearest_airport_km']

    # Use the nearest_airport_original_idx to look up details in the original final_df
    nearest_indices = results_df['nearest_airport_original_idx']
    inferred_locs_df.loc[results_df.index, 'nearest_airport_lat'] = locs_df.loc[nearest_indices, 'inferred_airport_lat'].values
    inferred_locs_df.loc[results_df.index, 'nearest_airport_lon'] = locs_df.loc[nearest_indices, 'inferred_airport_lon'].values
    inferred_locs_df.loc[results_df.index, 'nearest_airport_name'] = locs_df.loc[nearest_indices, 'inferred_airport_name'].values
    inferred_locs_df.loc[results_df.index, 'nearest_airport_iata'] = locs_df.loc[nearest_indices, 'inferred_airport_iata'].values


    return inferred_locs_df

# takes some time
combined_df = merge_nearest_airport(p_hops, final_df)
combined_df.head()
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/3012844186.py:64: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['Boeing Field King County International Airport'
 'Chek Lap Kok International Airport'
 'General Edward Lawrence Logan International Airport' ...
 'Kotoka International Airport' 'Kotoka International Airport'
 'Kotoka International Airport']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  inferred_locs_df.loc[results_df.index, 'nearest_airport_name'] = locs_df.loc[nearest_indices, 'inferred_airport_name'].values
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/3012844186.py:65: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['BFI' 'HKG' 'BOS' ... 'ACC' 'ACC' 'ACC']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  inferred_locs_df.loc[results_df.index, 'nearest_airport_iata'] = locs_df.loc[nearest_indices, 'inferred_airport_iata'].values
Out[105]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
19208202 puw-us 23.11.40.7 47 60 23.203.145.249 43 289.470 219 23.203.145.249 ae34.r03.border101.sea01.fab.netarch.akamai.com ... 20940 Seattle 47.449001 -122.308998 False 47.529999 -122.302002 Boeing Field King County International Airport BFI 9.021869
3886303 hkg5-cn 23.11.40.7 44 56 23.40.189.1 43 453.065 28 23.40.189.1 vlan100.r01.tor01.hkg01.fab.netarch.akamai.com ... 20940 Hong Kong 22.320304 114.198074 False 22.308889 113.914722 Chek Lap Kok International Airport HKG 29.175437
16388104 bed-us 23.11.38.17 39 52 23.198.9.65 38 340.269 31 23.198.9.65 vlan100.r05.tor01.bos01.fab.netarch.akamai.com ... 20940 Boston 42.364300 -71.005203 False 42.364300 -71.005203 General Edward Lawrence Logan International Airport BOS 0.000033
2510835 ind-us 23.11.38.17 38 49 154.24.97.54 37 462.866 223 NaN NaN ... 174 Indianapolis 39.768379 -86.158043 False 39.717300 -86.294403 Indianapolis International Airport IND 12.968696
19661446 cpv-br 23.11.38.17 40 52 23.32.63.57 37 590.487 225 23.32.63.57 ae4.r01.gru01.icn.netarch.akamai.com ... 20940 Sao Paulo -23.435556 -46.473057 False -23.509100 -46.637798 Campo de Marte Airport SAO 18.686953

5 rows × 29 columns

In [106]:
combined_df['distance_to_nearest_airport_km'].mean() # ~20 km mean for valid latency neighbors vs. ~26 km mean for p-hops
Out[106]:
26.525433633860494
In [107]:
# bogon p-hops (10.0.0.0/8 only)
combined_df[combined_df['hop_addr'].str.startswith('10.')]
Out[107]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
10797560 bkl-us 44.31.110.0 34 37 10.64.1.186 32 72.655 52 NaN NaN ... - <NA> NaN NaN False NaN NaN NaN NaN NaN
343127 bdl-us 44.31.110.0 33 34 10.64.1.182 31 74.293 58 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN
3994211 osl-no 193.19.225.2 33 44 10.79.1.174 31 266.681 49 NaN NaN ... - <NA> NaN NaN False NaN NaN NaN NaN NaN
16824891 lun-zm 43.224.183.166 32 35 10.73.201.22 31 513.738 46 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN
6021659 tlv3-il 46.8.8.66 31 41 10.79.1.118 30 281.252 51 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16896268 aep2-ar 104.192.141.0 7 10 10.3.7.97 3 1.842 253 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN
6355310 hyd-in 209.131.101.100 6 10 10.130.32.1 2 0.761 63 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN
6401646 hyd-in 156.154.58.1 6 10 10.130.32.1 2 1.165 63 NaN NaN ... - <NA> NaN NaN False NaN NaN NaN NaN NaN
6407864 hyd-in 173.243.129.1 6 10 10.130.32.1 2 1.000 63 NaN NaN ... - <NA> NaN NaN False NaN NaN NaN NaN NaN
22004682 sao-br 194.0.14.1 3 3 10.210.200.250 2 0.384 63 NaN NaN ... - <NA> NaN NaN True NaN NaN NaN NaN NaN

32578 rows × 29 columns
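The 10.0.0.0/8 filter above catches only one private range; a broader sketch using Python's standard ipaddress module, assuming hop_addr holds valid dotted-quad strings:

import ipaddress

# flag every p-hop whose address falls in a private (RFC 1918 and similar) range
addr_is_private = combined_df['hop_addr'].dropna().map(lambda a: ipaddress.ip_address(a).is_private)
combined_df.loc[addr_is_private[addr_is_private].index].head()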

In [108]:
# how many are NOT within 3 ms of the target
combined_df[~combined_df['is_neighbor']]
Out[108]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
19208202 puw-us 23.11.40.7 47 60 23.203.145.249 43 289.470 219 23.203.145.249 ae34.r03.border101.sea01.fab.netarch.akamai.com ... 20940 Seattle 47.449001 -122.308998 False 47.529999 -122.302002 Boeing Field King County International Airport BFI 9.021869
3886303 hkg5-cn 23.11.40.7 44 56 23.40.189.1 43 453.065 28 23.40.189.1 vlan100.r01.tor01.hkg01.fab.netarch.akamai.com ... 20940 Hong Kong 22.320304 114.198074 False 22.308889 113.914722 Chek Lap Kok International Airport HKG 29.175437
16388104 bed-us 23.11.38.17 39 52 23.198.9.65 38 340.269 31 23.198.9.65 vlan100.r05.tor01.bos01.fab.netarch.akamai.com ... 20940 Boston 42.364300 -71.005203 False 42.364300 -71.005203 General Edward Lawrence Logan International Airport BOS 0.000033
2510835 ind-us 23.11.38.17 38 49 154.24.97.54 37 462.866 223 NaN NaN ... 174 Indianapolis 39.768379 -86.158043 False 39.717300 -86.294403 Indianapolis International Airport IND 12.968696
19661446 cpv-br 23.11.38.17 40 52 23.32.63.57 37 590.487 225 23.32.63.57 ae4.r01.gru01.icn.netarch.akamai.com ... 20940 Sao Paulo -23.435556 -46.473057 False -23.509100 -46.637798 Campo de Marte Airport SAO 18.686953
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14509083 ams5-nl 199.232.205.0 3 4 213.46.228.254 1 7.781 255 213.46.228.254 nl-ams05a-re1-fe-2-0.aorta.net ... 6830 Amsterdam 52.308601 4.763890 False 52.308601 4.763890 Amsterdam Airport Schiphol AMX 0.000045
19007616 scl3-cl 34.34.193.1 5 8 200.0.206.73 1 1.199 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
19059509 scl3-cl 172.217.60.0 4 6 200.0.206.73 1 1.648 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
19045998 scl3-cl 136.124.0.150 5 8 200.0.206.73 1 1.131 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
16999300 acc-gh 23.0.0.1 2 2 196.49.14.1 1 0.349 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 False 5.605190 -0.166786 Kotoka International Airport ACC 6.403097

760988 rows × 29 columns

In [109]:
# how many are within 3ms of the target
combined_df[combined_df['is_neighbor']]
Out[109]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
16381107 bed-us 203.30.19.12 37 43 203.34.30.181 36 259.773 232 203.34.30.181 xe-2-579-rt1.perma.zettagrid.com ... 7604 Perth -31.952240 115.861397 True -31.940300 115.967003 Perth International Airport PER 10.052402
16367629 bed-us 103.79.5.1 37 41 103.197.232.191 36 232.999 233 NaN NaN ... 64098 Sydney -33.867851 151.207321 True -33.924400 150.988007 Sydney Bankstown Airport BWU 21.196300
14059800 mem-us 91.237.67.0 37 39 87.44.56.138 36 279.795 244 NaN NaN ... 1213 Dublin 53.333061 -6.248890 True 53.421299 -6.270070 Dublin Airport DUB 9.911669
16381076 bed-us 203.24.54.4 37 43 203.34.30.181 36 259.226 232 203.34.30.181 xe-2-579-rt1.perma.zettagrid.com ... 7604 Perth -31.952240 115.861397 True -31.940300 115.967003 Perth International Airport PER 10.052402
13329052 lej-de 103.79.5.1 36 47 103.197.232.191 35 311.270 217 NaN NaN ... 64098 Sydney -33.867851 151.207321 True -33.924400 150.988007 Sydney Bankstown Airport BWU 21.196300
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17064518 acc-gh 72.42.123.1 3 4 196.49.14.1 1 0.342 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064520 acc-gh 72.42.124.1 3 4 196.49.14.1 1 0.341 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064522 acc-gh 72.42.125.1 3 4 196.49.14.1 1 0.335 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064524 acc-gh 72.42.126.1 3 4 196.49.14.1 1 0.330 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064526 acc-gh 72.42.127.1 3 4 196.49.14.1 1 0.326 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097

2589031 rows × 29 columns

In [110]:
# how many are valid latency neighbor hops (within 3 ms of the target, with VP-to-PoP RTT under 12 ms)
combined_df[combined_df['is_neighbor'] & (combined_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY)]
Out[110]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
24242602 agb-de 23.11.40.7 23 31 23.210.52.235 22 11.539 234 23.210.52.235 ae3.decix-fra.netarch.akamai.com ... 20940 Frankfurt am Main 50.033333 8.570556 True 50.026402 8.543130 Frankfurt am Main International Airport FRA 2.105217
2947846 cjj-kr 207.189.149.0 22 27 141.101.82.12 21 4.745 244 NaN NaN ... 13335 Seoul 37.566002 126.978401 True 37.558300 126.791000 Gimpo International Airport SEL 16.540326
27203320 oak5-us 66.174.0.3 22 29 63.59.164.212 21 9.621 233 NaN NaN ... 22394 San Jose 37.339390 -121.894958 True 37.362598 -121.929001 Norman Y. Mineta San Jose International Airport SJC 3.964158
11857756 fra-de 23.11.40.7 25 34 23.210.54.63 21 1.110 44 23.210.54.63 ae34.r03.border101.fra03.fab.netarch.akamai.com ... 20940 Frankfurt am Main 50.033333 8.570556 True 50.026402 8.543130 Frankfurt am Main International Airport FRA 2.105217
3005931 cjj-kr 43.159.115.11 22 29 30.37.127.233 20 4.694 244 NaN NaN ... 749 <NA> NaN NaN True NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17064518 acc-gh 72.42.123.1 3 4 196.49.14.1 1 0.342 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064520 acc-gh 72.42.124.1 3 4 196.49.14.1 1 0.341 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064522 acc-gh 72.42.125.1 3 4 196.49.14.1 1 0.335 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064524 acc-gh 72.42.126.1 3 4 196.49.14.1 1 0.330 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097
17064526 acc-gh 72.42.127.1 3 4 196.49.14.1 1 0.326 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 True 5.605190 -0.166786 Kotoka International Airport ACC 6.403097

1472998 rows × 29 columns

In [111]:
# how many are invalid latency neighbors

invalids = combined_df[~combined_df['is_neighbor'] | (combined_df['pop_rtt'] > TRACEROUTE_MAX_LATENCY)]
invalids
Out[111]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... hop_asn hop_city hop_lat hop_lon is_neighbor nearest_airport_lat nearest_airport_lon nearest_airport_name nearest_airport_iata distance_to_nearest_airport_km
19208202 puw-us 23.11.40.7 47 60 23.203.145.249 43 289.470 219 23.203.145.249 ae34.r03.border101.sea01.fab.netarch.akamai.com ... 20940 Seattle 47.449001 -122.308998 False 47.529999 -122.302002 Boeing Field King County International Airport BFI 9.021869
3886303 hkg5-cn 23.11.40.7 44 56 23.40.189.1 43 453.065 28 23.40.189.1 vlan100.r01.tor01.hkg01.fab.netarch.akamai.com ... 20940 Hong Kong 22.320304 114.198074 False 22.308889 113.914722 Chek Lap Kok International Airport HKG 29.175437
16388104 bed-us 23.11.38.17 39 52 23.198.9.65 38 340.269 31 23.198.9.65 vlan100.r05.tor01.bos01.fab.netarch.akamai.com ... 20940 Boston 42.364300 -71.005203 False 42.364300 -71.005203 General Edward Lawrence Logan International Airport BOS 0.000033
2510835 ind-us 23.11.38.17 38 49 154.24.97.54 37 462.866 223 NaN NaN ... 174 Indianapolis 39.768379 -86.158043 False 39.717300 -86.294403 Indianapolis International Airport IND 12.968696
19661446 cpv-br 23.11.38.17 40 52 23.32.63.57 37 590.487 225 23.32.63.57 ae4.r01.gru01.icn.netarch.akamai.com ... 20940 Sao Paulo -23.435556 -46.473057 False -23.509100 -46.637798 Campo de Marte Airport SAO 18.686953
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14509083 ams5-nl 199.232.205.0 3 4 213.46.228.254 1 7.781 255 213.46.228.254 nl-ams05a-re1-fe-2-0.aorta.net ... 6830 Amsterdam 52.308601 4.763890 False 52.308601 4.763890 Amsterdam Airport Schiphol AMX 0.000045
19007616 scl3-cl 34.34.193.1 5 8 200.0.206.73 1 1.199 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
19059509 scl3-cl 172.217.60.0 4 6 200.0.206.73 1 1.648 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
19045998 scl3-cl 136.124.0.150 5 8 200.0.206.73 1 1.131 255 200.0.206.73 dnsp.redclara.net ... 27750 <NA> NaN NaN False NaN NaN NaN NaN NaN
16999300 acc-gh 23.0.0.1 2 2 196.49.14.1 1 0.349 255 NaN NaN ... 30997 Accra 5.556020 -0.196900 False 5.605190 -0.166786 Kotoka International Airport ACC 6.403097

1876960 rows × 29 columns

In [112]:
# plot for invalids

rtt_data = invalids['pop_rtt'].dropna()
distance_data = invalids['distance_to_nearest_airport_km'].dropna()

def calculate_cdf(series):
    """Calculates sorted data and cumulative probabilities for a CDF."""
    data_sorted = np.sort(series)
    n = data_sorted.size
    y_cdf = np.arange(1, n + 1) / n
    return data_sorted, y_cdf

fig, axes = plt.subplots(figsize=(14, 6))

x_dist, y_dist_cdf = calculate_cdf(distance_data)
ax = axes
ax.plot(x_dist, y_dist_cdf, drawstyle='steps-post', linestyle='-')

ax.set_xlabel('Distance (km)')
ax.set_ylabel('Cumulative Probability')
ax.grid(True, linestyle='--', alpha=0.6)

ax.set_ylim(0, 1)
ax.set_xlim(1, 4_000)
ax.set_xscale('log')

plt.tight_layout()

plt.savefig("./plots/phop_distances_log.pdf", bbox_inches='tight', pad_inches=0)

plt.show()
[Figure: CDF (log-scale x-axis) of the distance from invalid p-hops to the nearest inferred airport]

Investigate probing cost¶

In [113]:
# infer hops adjacent to the VP with bogon addresses to be in the same ASN as the VP

df = traceroutes_df

MISSING_ASN_VALUE = '-'

df.sort_values(['tx_hostname', 'dst', 'hop_probe_ttl'], inplace=True)

# find valid AS hops
df['asn_is_valid'] = df['hop_asn'] != MISSING_ASN_VALUE



# Identify the initial segment before the first valid ASN within each group
df['seen_valid_asn'] = df.groupby(['tx_hostname', 'dst'])['asn_is_valid'].cummax()

# determine which rows need filling
rows_to_fill_mask = (df['hop_asn'] == MISSING_ASN_VALUE) & (~df['seen_valid_asn'])

# fill rows
df.loc[rows_to_fill_mask, 'hop_asn'] = df.loc[rows_to_fill_mask, 'vp_asn']

df.drop(columns=['asn_is_valid', 'seen_valid_asn'], inplace=True)
In [114]:
# fill in hop ASNs (hops in-between two hops with the same AS are inferred to be in that same AS, if they have no AS data)

df['hop_asn_temp'] = df['hop_asn'].replace(MISSING_ASN_VALUE, np.nan)


df['asn_ffill'] = df.groupby(['tx_hostname', 'dst'])['hop_asn_temp'].ffill()


df['asn_bfill'] = df.groupby(['tx_hostname', 'dst'])['hop_asn_temp'].bfill()

fill_condition = (
    (df['hop_asn'] == MISSING_ASN_VALUE) &  # Original was missing
    (df['asn_ffill'] == df['asn_bfill']) &  # Surrounding valid ASNs match
    (df['asn_ffill'].notna())               # The matching ASN is a valid number
)

df.loc[fill_condition, 'hop_asn'] = df.loc[fill_condition, 'asn_ffill']

df.drop(columns=['hop_asn_temp', 'asn_ffill', 'asn_bfill'], inplace=True)

df.head()
Out[114]:
tx_hostname dst hop_count probe_count hop_addr hop_probe_ttl hop_rtt hop_reply_ttl ip4_address hop_name ... vp_asn tx_airport pop_rtt ark_to_hop ark_to_pop hop_asn hop_city hop_lat hop_lon is_neighbor
1324098 abz2-uk 1.0.0.0 11 11 137.50.19.1 1 0.318 255 137.50.19.1 milliways2-sbx-one-ipv4.erg.abdn.ac.uk ... 786 abz 13.506 32.492793 1380.024115 786 Aberdeen 57.143688 -2.09814 False
1324537 abz2-uk 1.0.0.0 11 12 137.50.19.1 1 0.326 255 137.50.19.1 milliways2-sbx-one-ipv4.erg.abdn.ac.uk ... 786 abz 13.506 33.310222 1380.024115 786 Aberdeen 57.143688 -2.09814 False
1325067 abz2-uk 1.0.0.0 11 11 137.50.19.1 1 0.348 255 137.50.19.1 milliways2-sbx-one-ipv4.erg.abdn.ac.uk ... 786 abz 13.506 35.558151 1380.024115 786 Aberdeen 57.143688 -2.09814 False
1325895 abz2-uk 1.0.0.0 11 11 137.50.19.1 1 0.336 255 137.50.19.1 milliways2-sbx-one-ipv4.erg.abdn.ac.uk ... 786 abz 13.506 34.332008 1380.024115 786 Aberdeen 57.143688 -2.09814 False
1327020 abz2-uk 1.0.0.0 11 11 137.50.19.1 1 0.314 255 137.50.19.1 milliways2-sbx-one-ipv4.erg.abdn.ac.uk ... 786 abz 13.506 32.084079 1380.024115 786 Aberdeen 57.143688 -2.09814 False

5 rows × 24 columns

In [115]:
# what is the average hop count?

df.groupby(['tx_hostname', 'dst'])['hop_count'].first().mean()
Out[115]:
9.828091099985004
In [116]:
# what is the average hop count for traceroutes with PoP RTT < 12 ms?

df[df['pop_rtt'] < TRACEROUTE_MAX_LATENCY].groupby(['tx_hostname', 'dst'])['hop_count'].first().mean()
Out[116]:
8.40202352968762
In [117]:
# average number of hops within the same AS as the VP

df[df['hop_asn'] == df['vp_asn']].groupby(['tx_hostname', 'dst']).size().mean()
Out[117]:
2.86832625940674
In [118]:
# average number of VPs per target

traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
Out[118]:
244.19898070622497
In [119]:
# average number of VPs within 12 ms per target

traceroutes_df[traceroutes_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY].groupby('dst')['tx_hostname'].nunique().mean()
Out[119]:
124.46592733109587
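These per-target averages can be turned into a rough total probing cost; a minimal sketch, under the assumption that probe_count records the number of probes sent per traceroute:

# one probe_count per (VP, target) traceroute, summed over all traceroutes
total_probes = df.groupby(['tx_hostname', 'dst'])['probe_count'].first().sum()
print(f"Approximate probes sent across all traceroutes: {total_probes:,.0f}")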
In [120]:
# plot number of hops against number of PoPs

traceroute_lengths = df.groupby(['tx_hostname', 'prefix'])['hop_count'].first()

avg_hop_counts_per_prefix = traceroute_lengths.groupby(level='prefix').mean().rename('avg_hop_count')

site_counts_per_prefix = merged_df[['prefix', 'TR']].drop_duplicates(subset=['prefix']).set_index('prefix')['TR']

plot_data = pd.concat([avg_hop_counts_per_prefix, site_counts_per_prefix], axis=1, join='inner')

plt.figure(figsize=(10, 6))
plt.scatter(plot_data['TR'], plot_data['avg_hop_count'], alpha=0.1)

plt.xlabel("PoPs detected")
plt.ylabel("Average hop count")

plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()

plt.savefig("./plots/hop_count_scatter.pdf", bbox_inches='tight', pad_inches=0)

plt.show()
[Figure: scatter plot of average hop count against the number of PoPs detected per prefix]
In [121]:
# plot latency against hop count

# get traceroute lengths of each VP, target pair
traceroute_lengths = df.groupby(['tx_hostname', 'prefix'])['hop_count'].first()
avg_hop_counts_per_prefix = traceroute_lengths.groupby(level='prefix').mean().rename('avg_hop_count')

# get RTT towards each PoP
pop_rtts = df.groupby(['tx_hostname', 'prefix'])['pop_rtt'].first()
avg_rtts_per_prefix = pop_rtts.groupby(level='prefix').mean().rename('avg_pop_rtt') # Calculate average RTT

# combine ttl lengths and average rtt
plot_data = pd.concat([avg_hop_counts_per_prefix, avg_rtts_per_prefix], axis=1, join='inner')
print(plot_data)
                avg_hop_count  avg_pop_rtt
prefix                                    
1.0.0.0/24           7.802920     9.607292
1.1.1.0/24           7.703846    10.929742
1.10.10.0/24        14.062500   208.615852
1.12.0.0/24         14.458333   102.723392
1.12.12.0/24        13.632979    90.069745
...                       ...          ...
99.83.254.0/24      11.747368    15.629442
99.83.255.0/24      11.773196    15.709402
99.99.97.0/24       12.940171    75.167333
99.99.98.0/24       13.879310    85.667448
99.99.99.0/24       13.923077    82.285115

[13735 rows x 2 columns]
In [122]:
plt.figure(figsize=(10, 6))
plt.scatter(plot_data['avg_pop_rtt'], plot_data['avg_hop_count'], alpha=0.1)

plt.xlabel("Latency to PoP (ms)")
plt.xscale('log', base=2)
plt.xlim(1, 1000)

plt.ylabel("Average hop count")

plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()

plt.savefig("./plots/latency_hop_count.pdf", bbox_inches='tight', pad_inches=0)

plt.show()
[Figure: scatter plot of average hop count against latency to the PoP (log-scale x-axis)]

Geolocation¶

In [12]:
# Helper Functions
def i_greedy_location_to_df(file_path):
    # Load the iGreedy location JSON and flatten each PoP instance into one row
    with open(file_path, 'r') as f:
        data = list(json.load(f))

    rows = []
    for element in data:
        for entry in element['instances']:
            rows.append([
                element['prefix'],
                entry['marker']['latitude'],
                entry['marker']['longitude'],
                entry['marker']['city'],
                entry['marker']['code_country'],
            ])

    return pd.DataFrame(rows, columns=['Prefix', 'Latitude', 'Longitude', 'City', 'Country']).drop_duplicates()

def root_server_location_to_df(file_path):
    # Load a root-servers.org site list and flatten each site into one row
    with open(file_path, 'r') as f:
        data = dict(json.load(f))

    rows = []
    for entry in data['Sites']:
        rows.append([
            data['IPv4'],
            entry['Latitude'],
            entry['Longitude'],
            entry['Town'],
            entry['Country'],
        ])

    return pd.DataFrame(rows, columns=['IPv4', 'Latitude', 'Longitude', 'City', 'Country']).drop_duplicates()
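As a quick illustration of the JSON shape `i_greedy_location_to_df` expects, one can run it on a tiny made-up record; the values below are hypothetical and not census data.

In [ ]:
# Hypothetical single-PoP record mimicking the iGreedy location JSON layout.
import json
import os
import tempfile

toy = [{
    "prefix": "192.0.2.0/24",
    "instances": [{"marker": {"latitude": 52.37, "longitude": 4.90,
                              "city": "Amsterdam", "code_country": "NL"}}],
}]

# Write the toy record to a temporary file and parse it with the helper above
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(toy, f)
    tmp_path = f.name

print(i_greedy_location_to_df(tmp_path))
os.remove(tmp_path)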
In [19]:
# Import the iGreedy Locations and Ground Truth Locations
i_greedy_locations =  i_greedy_location_to_df('./data/2025-04-19_v4_locations.json') # Can be found here: https://github.com/anycast-census/anycast-census/tree/main/2025/04/03

root_server_letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M',]

root_server_location_dict  = {}

for letter in root_server_letters:
    root_server_location_dict[letter] = root_server_location_to_df(f"./data/root_server_locations/{letter}.json") # Can be found here: https://root-servers.org/

# List of responsive root-server addresses to loop over

addresses = [
    '198.41.0.4',
    '170.247.170.2',
    '192.33.4.12',
    '199.7.91.13',
    '192.203.230.10',
    '192.5.5.0',
    # '192.112.36.4', # g-root unresponsive to ICMP
    '198.97.190.53',
    '192.36.148.17',
    '192.58.128.30',
    '193.0.14.129',
    '199.7.83.42',
    '202.12.27.33',
]
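Before the comparison below, an optional sanity check (assuming `root_server_letters` and `root_server_location_dict` from the cell above) confirms that every ground-truth file loaded and shows how many sites each root operator publishes.

In [ ]:
# Optional sanity check: number of ground-truth sites per root-server letter.
for letter in root_server_letters:
    print(f"{letter}-root: {len(root_server_location_dict[letter])} sites")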
In [30]:
traceroute_means = []
traceroute_medians = []
traceroute_enumeration = []
traceroute_full_list = []

igreedy_means = []
igreedy_medians = []
igreedy_enumeration = []
igreedy_full_list = []

# Map each responsive root-server address to its letter for ground-truth lookup
address_to_letter = {
    '198.41.0.4': 'A', '170.247.170.2': 'B', '192.33.4.12': 'C',
    '199.7.91.13': 'D', '192.203.230.10': 'E', '192.5.5.0': 'F',
    '192.112.36.4': 'G', '198.97.190.53': 'H', '192.36.148.17': 'I',
    '192.58.128.30': 'J', '193.0.14.129': 'K', '199.7.83.42': 'L',
    '202.12.27.33': 'M',
}

# Loop over all responsive root servers and pick the matching ground truth
for address in addresses:
    letter = address_to_letter.get(address)
    if letter is None:
        break
    groundtruth_df = root_server_location_dict[letter]

    prefix = ".".join(address.split(".")[:3]) + ".0/24"

    # Get the right locations from the respective datasets

    traceroute_df = final_df[final_df['dst'] == address].dropna().copy()
    traceroute_df['location'] = traceroute_df.apply(lambda row: (row['inferred_airport_lat'], row['inferred_airport_lon'], row['inferred_airport_name']), axis=1)
    traceroute_locs = traceroute_df['location'].unique()

    i_greedy_df = i_greedy_locations[i_greedy_locations['Prefix'] == prefix].dropna().copy()
    i_greedy_df['location'] = i_greedy_df.apply(lambda row: (row['Latitude'], row['Longitude'], row['City']), axis=1)
    i_greedy_locs = i_greedy_df['location'].unique()
    

    groundtruth_df['location'] = groundtruth_df.apply(lambda row: (row['Latitude'], row['Longitude'], row['City']), axis=1)
    gt_locs = groundtruth_df['location'].unique()

    # Calculate the distance for all found locations to the closest ground truth locations for Traceroute and iGreedy
    
    tcd_list = []
    for loc in traceroute_locs:
        closest_distance = 1_000_000
        adr = None
        for gt_loc in gt_locs:
            distance_between = distance.geodesic((loc[0], loc[1]), (gt_loc[0], gt_loc[1])).km
            if distance_between < closest_distance:
                closest_distance = distance_between
                adr = (loc[2], gt_loc[2])
        tcd_list.append(closest_distance)

    traceroute_means.append(statistics.mean(tcd_list))
    traceroute_medians.append(statistics.median(tcd_list))
    traceroute_enumeration.append(len(tcd_list) / len(gt_locs))

    icd_list = []
    for loc in i_greedy_locs:
        closest_distance = 1_000_000
        adr = None
        for gt_loc in gt_locs:
            distance_between = distance.geodesic((loc[0], loc[1]), (gt_loc[0], gt_loc[1])).km
            if distance_between < closest_distance:
                closest_distance = distance_between
                adr = (loc[2], gt_loc[2])
        icd_list.append(closest_distance)

    igreedy_means.append(statistics.mean(icd_list))
    igreedy_medians.append(statistics.median(icd_list))
    igreedy_enumeration.append(len(icd_list) / len(gt_locs))

# Plot the results

fig, (ax1, ax2) = plt.subplots(1,2)

ax1.boxplot([traceroute_enumeration, igreedy_enumeration])
ax1.set_xticklabels(["TR", "GCD"])
ax1.set_title('Enumeration')

ax2.boxplot([traceroute_means, traceroute_medians, igreedy_means, igreedy_medians])
ax2.set_title('Distance to GT in km')
ax2.set_xticklabels(["TR \n Mean", "TR \n Median", "GCD \n Mean", "GCD \n Median"])

print(f"Plot for Traceroute Radius with {DISK_DISTANCE_KM}km")
plt.show()   
Plot for Traceroute Radius with 100km
[Figure: boxplots of enumeration (TR vs. GCD) and distance to ground truth in km (TR/GCD mean and median)]
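Beyond the boxplots, the per-root-server metrics collected above can be laid out side by side for inspection; the sketch below assumes the lists from the previous cell, and the column names are only illustrative.

In [ ]:
# Optional: tabulate the per-root-server results collected in the loop above.
import pandas as pd

results = pd.DataFrame({
    'address': addresses,
    'tr_enumeration': traceroute_enumeration,
    'gcd_enumeration': igreedy_enumeration,
    'tr_median_km': traceroute_medians,
    'gcd_median_km': igreedy_medians,
})
print(results.round(2))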