In [ ]:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
imports and global variables (see requirements.txt for dependencies)¶
In [24]:
# !pip install -r requirements.txt
# --- Standard Library Imports ---
import gzip
import io
import ipaddress
import json
import os
import subprocess
import statistics
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
# --- Third-Party Library Imports ---
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import radix
import requests
from bs4 import BeautifulSoup
from sklearn.neighbors import BallTree
import duckdb
from geopy import distance
# constants
SPEED_IN_FIBRE = 299_792 / 1.467 # 1.467 refraction index fibre optics https://www.thorlabs.com/newgrouppage9.cfm?objectgroup_id=949
ALPHA = 1
EARTH_RADIUS_KM = 6371 # Radius of Earth
# parameters
LATENCY_THRESHOLD = 3 # RTT maximum between latency neighbor hop and PoP
TRACEROUTE_MAX_LATENCY = 12 # RTT maximum between VP and PoP
DISK_DISTANCE_KM = 100 # airport grouping radius
DISK_DISTANCE_RAD = DISK_DISTANCE_KM / EARTH_RADIUS_KM # convert distance to radians
Read in data and combine with external datasets¶
In [3]:
# for reproducability we share a single combined parquet containing all traceroute results
traceroutes_df = pd.read_parquet('./data/combined.parquet', engine='pyarrow') # engine='fastparquet'
traceroutes_df.head()
Out[3]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | |
|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar.ark.caida.org | 1.0.0.0 | 9 | 13 | 192.168.1.1 | 1 | 1.281 | 64 |
| 1 | aep3-ar.ark.caida.org | 1.0.0.0 | 9 | 13 | 192.168.0.1 | 2 | 1.606 | 63 |
| 2 | aep3-ar.ark.caida.org | 1.0.0.0 | 9 | 13 | 181.96.113.217 | 7 | 13.811 | 249 |
| 3 | aep3-ar.ark.caida.org | 1.0.0.0 | 9 | 13 | 181.13.127.254 | 8 | 9.127 | 248 |
| 4 | aep3-ar.ark.caida.org | 1.0.0.0 | 9 | 13 | 1.0.0.0 | 9 | 12.355 | 56 |
In [3]:
# # extract hop_name column for retrieving OI data
#
# unique_hop_addr = traceroutes_df['hop_addr'].dropna().unique()
# pd.DataFrame(unique_hop_addr).to_csv('./data/unique_hops.csv', index=False, header=False)
In [ ]:
# add OpenINTEL PTR recrods to traceroute data
hop_names = pd.read_csv('./data/ptr_lookup.csv')
traceroutes_df = pd.merge(traceroutes_df, hop_names, how='left', left_on='hop_addr', right_on='ip4_address')
# rename ptr_name to hop_name
traceroutes_df = traceroutes_df.rename(columns={'ptr_name': 'hop_name'})
In [9]:
# load in MAnycastR census data
census = pd.read_csv('./data/IPv4_full_19_04.csv.gz', compression='gzip') # data from 2025 April 19
census = census[census['iGreedyICMPv4'] > 1] # filter on iGreedy confirmed anycast
census.head()
Out[9]:
| prefix | MAnycastICMPv4 | MAnycastTCPv4 | MAnycastUDPv4 | iGreedyICMPv4 | iGreedyTCPv4 | |
|---|---|---|---|---|---|---|
| 1 | 1.0.0.0/24 | 27 | 27 | 26 | 71 | 28 |
| 136 | 1.1.1.0/24 | 28 | 28 | 27 | 71 | 28 |
| 229 | 1.10.10.0/24 | 1 | 0 | 1 | 3 | 0 |
| 2081 | 1.12.0.0/24 | 5 | 0 | 4 | 5 | 0 |
| 2082 | 1.12.12.0/24 | 3 | 1 | 0 | 3 | 1 |
In [6]:
# add prefix column
traceroutes_df['prefix'] = traceroutes_df['dst'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.0/24')
In [7]:
# filter traceroutes by those confirmed using iGreedy (we filter out MAncyast2 candidate anycast targets as we do not consider traceroute as an anycast detection technique)
traceroutes_df = traceroutes_df[traceroutes_df['prefix'].isin(census['prefix'])]
In [8]:
# read in locations of Ark VPs
ark_locs = pd.read_csv('./data/arklocs.txt', header=None, names=['tx_hostname', 'vp_lat', 'vp_lon', 'tx_city', 'vp_asn'])
ark_locs['tx_hostname'] = ark_locs['tx_hostname'].str.split(".ark.caida.org").str[0]
ark_locs['vp_asn'] = ark_locs['vp_asn'].apply(lambda x: '-' if pd.isna(x) else str(int(float(x))))
ark_locs['tx_airport'] = ark_locs['tx_hostname'].str[:3]
ark_locs.head()
Out[8]:
| tx_hostname | vp_lat | vp_lon | tx_city | vp_asn | tx_airport | |
|---|---|---|---|---|---|---|
| 0 | hlz2-nz | -37.79 | 175.28 | Hamilton | 9500 | hlz |
| 1 | hkg4-cn | 22.36 | 114.12 | Hong Kong | 212238 | hkg |
| 2 | lax4-us | 33.92 | -118.39 | Los Angeles | 212238 | lax |
| 3 | bfi-us | 47.61 | -122.33 | Seattle | 209 | bfi |
| 4 | drs-de | 51.03 | 13.73 | Dresden | 680 | drs |
In [9]:
# how many distinct ASes?
ark_locs['vp_asn'].nunique()
Out[9]:
178
In [10]:
# how many distinct cities?
ark_locs['tx_city'].nunique(0)
Out[10]:
205
In [11]:
ark_locs['country_code'] = ark_locs['tx_hostname'].str.split("-").str[1]
ark_locs['country_code'].nunique()
Out[11]:
66
In [12]:
# drop country code
ark_locs.drop(columns='country_code', inplace=True)
In [13]:
# add ark locations
traceroutes_df['tx_hostname'] = traceroutes_df['tx_hostname'].str.split(".ark.caida.org").str[0]
traceroutes_df = pd.merge(traceroutes_df, ark_locs, on='tx_hostname', how='left')
traceroutes_df.head()
Out[13]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | prefix | vp_lat | vp_lon | tx_city | vp_asn | tx_airport | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 1.0.0.0 | 9 | 13 | 192.168.1.1 | 1 | 1.281 | 64 | NaN | NaN | 1.0.0.0/24 | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 1 | aep3-ar | 1.0.0.0 | 9 | 13 | 192.168.0.1 | 2 | 1.606 | 63 | NaN | NaN | 1.0.0.0/24 | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 2 | aep3-ar | 1.0.0.0 | 9 | 13 | 181.96.113.217 | 7 | 13.811 | 249 | 181.96.113.217 | host217.181-96-113.telecom.net.ar. | 1.0.0.0/24 | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 3 | aep3-ar | 1.0.0.0 | 9 | 13 | 181.13.127.254 | 8 | 9.127 | 248 | 181.13.127.254 | host254.181-13-127.telecom.net.ar. | 1.0.0.0/24 | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 4 | aep3-ar | 1.0.0.0 | 9 | 13 | 1.0.0.0 | 9 | 12.355 | 56 | NaN | NaN | 1.0.0.0/24 | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
exploratory data analysis¶
In [14]:
# how many VPs participated?
traceroutes_df['tx_hostname'].nunique()
Out[14]:
274
In [15]:
# how many ASes are these in?
traceroutes_df['vp_asn'].nunique()
Out[15]:
178
In [16]:
# how many targets did we measure?
traceroutes_df['dst'].nunique()
Out[16]:
13735
In [17]:
# how many tx_hostname, dst pairs (i.e., how many traceroutes?)
traceroutes_df[['tx_hostname', 'dst']].drop_duplicates().shape[0]
Out[17]:
3354073
In [18]:
# what is the average number of VPs that completed a traceroute towards a target
traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
Out[18]:
244.19898070622497
In [19]:
# what is the average number of hops captured?
traceroutes_df.groupby(['tx_hostname', 'dst']).size().mean()
Out[19]:
8.123025944873591
In [20]:
# how many unique hop addresses?
traceroutes_df['hop_addr'].nunique()
Out[20]:
99393
In [21]:
# how many unique hop addresses have PTR records?
traceroutes_df[~traceroutes_df['hop_name'].isna()]['hop_addr'].nunique()
Out[21]:
45106
In [22]:
# how many hops have PTR records?
(~traceroutes_df['hop_name'].isna()).sum()
Out[22]:
14336720
In [23]:
# how many hops do not have PTR records?
traceroutes_df['hop_name'].isna().sum()
Out[23]:
12908502
In [24]:
# how many unique PTR records did we observe?
traceroutes_df['hop_name'].nunique()
Out[24]:
41758
Calculate distances and RTTS¶
In [25]:
# get RTT to the PoP for each traceroute
dst_hops_df = traceroutes_df[traceroutes_df['dst'] == traceroutes_df['hop_addr']]
dst_hops_df = dst_hops_df[['tx_hostname', 'dst', 'hop_rtt']].rename(
columns={'hop_rtt': 'pop_rtt'}
)
dst_hops_df = dst_hops_df.sort_values(by='pop_rtt').drop_duplicates(subset=['tx_hostname', 'dst']) # remove duplicate RTTs observed between an (Ark, PoP) pair -> keep the lowest RTT
traceroutes_df = pd.merge(traceroutes_df, dst_hops_df, on=['tx_hostname', 'dst'], how='left')
In [26]:
def rtt_to_distance(rtt_ms):
one_way_time_s = (rtt_ms / 10_00) / 2 # to seconds and divide by 2 to get one-way latency
distance_km = one_way_time_s * SPEED_IN_FIBRE
return distance_km
In [27]:
# calculate distance between VP and hop
traceroutes_df['ark_to_hop'] = traceroutes_df['hop_rtt'].apply(rtt_to_distance)
# calculate distance between VP and PoP
traceroutes_df['ark_to_pop'] = traceroutes_df['pop_rtt'].apply(rtt_to_distance)
# calculate distance between hop and PoP
#traceroutes_df['hop_to_pop'] = np.maximum(0, result_df['ark_to_pop'] - result_df['ark_to_hop']) # this value is meaningless as the delta RTT between the two is unreliable
load in ipinfo data¶
In [28]:
# load in ipinfo data (weekly snapshots)
# our methodology requires an IP to location database with city-level data
# this example uses IPInfo's geolocation dataset
# they provide free access for academic, education, and non-commercial projects under certain terms (https://ipinfo.io/use-cases/ip-data-for-academic-research)
json_file = './data/2025-03-31_standard_location.json.gz' # monthly snapshot
def ipv4_to_uint32(ip_str):
"""Converts an IPv4 string to an unsigned 32-bit integer."""
if ip_str is None:
return None
try:
# Convert to IPv4Address object first to validate
ip_obj = ipaddress.ip_address(ip_str)
if ip_obj.version == 4:
return int(ip_obj) # Returns the integer representation
else:
return None # Not an IPv4 address
except ValueError:
# Handle cases where the string is not a valid IP address at all
return None
# load data in duck DB
db_file = 'ip_locations_ipv4.duckdb'
table_name = 'locations_ipv4'
# if db_file exists, we are done
if os.path.exists(db_file):
print(f"Database file {db_file} already exists. Skipping loading.")
else:
print(f"Loading IPv4 ipinfo data from {json_file} into DuckDB table {table_name}")
start_time = time.time()
# create the database file
con = duckdb.connect(database=db_file, read_only=False)
# add ipv4 translation function to duckdb
con.create_function('ipv4_to_uint32_udf', ipv4_to_uint32, [duckdb.typing.VARCHAR], duckdb.typing.UINTEGER)
# create table schema
con.execute(f"""
DROP TABLE IF EXISTS {table_name};
CREATE TABLE {table_name} (
start_ip_str VARCHAR,
end_ip_str VARCHAR,
join_key VARCHAR,
city VARCHAR,
region VARCHAR,
country VARCHAR,
latitude DOUBLE,
longitude DOUBLE,
postal_code VARCHAR,
timezone VARCHAR,
start_ip_uint32 UINTEGER,
end_ip_uint32 UINTEGER
);
""")
# load in data (excluding ipv6 addresses)
try:
con.execute(f"""
INSERT INTO {table_name} (
start_ip_str, end_ip_str, join_key, city, region, country,
latitude, longitude, postal_code, timezone,
start_ip_uint32, end_ip_uint32
)
SELECT
start_ip,
end_ip,
join_key,
city,
region,
country,
CAST(latitude AS DOUBLE),
CAST(longitude AS DOUBLE),
postal_code,
timezone,
ipv4_to_uint32_udf(start_ip) AS start_ip_u32,
ipv4_to_uint32_udf(end_ip) AS end_ip_u32
FROM read_json_auto('{json_file}', filename=true)
WHERE
-- filter out non-ipv4 addresses
start_ip LIKE '%.%.%.%' AND start_ip NOT LIKE '%:%' AND
end_ip LIKE '%.%.%.%' AND end_ip NOT LIKE '%:%'
""")
except Exception as e:
print(f"An error occurred during INSERT: {e}")
con.close()
raise
print("Creating index on IP ranges...")
con.execute(f"CREATE INDEX ip_range_idx_u32 ON {table_name} (start_ip_uint32, end_ip_uint32);")
con.close()
end_time = time.time()
print(f"IPInfo (IPV4) Database creation took: {end_time - start_time:.2f} seconds")
Database file ip_locations_ipv4.duckdb already exists. Skipping loading.
In [29]:
# get unique addresses
unique_addr = traceroutes_df['hop_addr'].dropna().unique()
print(f"Found {len(unique_addr)} unique hop addresses out of {len(traceroutes_df)} total hops.")
# get u32 values for lookup
addr_data = []
for hop in unique_addr:
ip_uint32 = ipv4_to_uint32(hop)
if ip_uint32 is not None: # Keep only valid IPv4 conversions
addr_data.append({'hop_addr': hop, 'hop_addr_uint32': ip_uint32})
unique_addr_df = pd.DataFrame(addr_data)
unique_addr_df['hop_addr_uint32'] = unique_addr_df['hop_addr_uint32'].astype('uint32')
del unique_addr
del addr_data
Found 99393 unique hop addresses out of 27245222 total hops.
In [30]:
columns_to_load = [
'city',
'latitude',
'longitude',
'start_ip_uint32',
'end_ip_uint32'
]
select_cols_str = ", ".join(columns_to_load)
dtypes_to_load = {
'city': 'string',
'latitude': 'float32',
'longitude': 'float32',
'start_ip_uint32': 'uint32',
'end_ip_uint32': 'uint32'
}
print(f"Loading selected columns from {table_name} into Pandas...")
start_load_time = time.time()
con = None
try:
con = duckdb.connect(database=db_file, read_only=True)
print(" Connected to DuckDB.")
# cast df
select_expressions = []
for col in columns_to_load:
dtype = dtypes_to_load.get(col)
if dtype:
# data types
sql_type = dtype.replace('string', 'VARCHAR') \
.replace('float32', 'FLOAT') \
.replace('float64', 'DOUBLE') \
.replace('uint32', 'UINTEGER') \
.replace('Int64', 'BIGINT')
select_expressions.append(f"CAST({col} AS {sql_type}) AS {col}")
else:
select_expressions.append(col)
select_query_cols_str = ",\n ".join(select_expressions)
query = f"""
SELECT
{select_query_cols_str}
FROM {table_name}
WHERE start_ip_uint32 IS NOT NULL -- exclude NaN values
ORDER BY start_ip_uint32 ASC; -- sort by start_ip_uint32
"""
ipinfo_df = con.execute(query).fetchdf()
print(f"Successfully loaded {len(ipinfo_df)} rows.")
except Exception as e:
print(f"An error occurred loading data from DuckDB: {e}")
finally:
if con:
con.close()
print(" DuckDB connection closed.")
end_load_time = time.time()
print(f"\nLoading and initial sorting took: {end_load_time - start_load_time:.2f} seconds")
print("\nLoaded ip2location DataFrame Info:")
ipinfo_df.info(memory_usage='deep') # Check memory usage
Loading selected columns from locations_ipv4 into Pandas... Connected to DuckDB.
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
Successfully loaded 27274159 rows. DuckDB connection closed. Loading and initial sorting took: 5.72 seconds Loaded ip2location DataFrame Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 27274159 entries, 0 to 27274158 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 city object 1 latitude float32 2 longitude float32 3 start_ip_uint32 uint32 4 end_ip_uint32 uint32 dtypes: float32(2), object(1), uint32(2) memory usage: 2.1 GB
In [31]:
# filter out bogons
BOGON_RANGES_CIDR = [
"0.0.0.0/8", # Current network (RFC 6890) - Often treated as bogon
"10.0.0.0/8", # Private Use (RFC 1918)
"100.64.0.0/10", # Shared Address Space (RFC 6598)
"127.0.0.0/8", # Loopback (RFC 6890)
"169.254.0.0/16", # Link-Local (RFC 3927)
"172.16.0.0/12", # Private Use (RFC 1918)
"192.0.0.0/24", # IETF Protocol Assignments (RFC 6890)
"192.0.2.0/24", # TEST-NET-1 (RFC 5737)
"192.88.99.0/24", # 6to4 Relay Anycast (Deprecated but often blocked, RFC 7526)
"192.168.0.0/16", # Private Use (RFC 1918)
"198.18.0.0/15", # Network Interconnect Device Benchmark Testing (RFC 2544)
"198.51.100.0/24", # TEST-NET-2 (RFC 5737)
"203.0.113.0/24", # TEST-NET-3 (RFC 5737)
"224.0.0.0/4", # Multicast (RFC 5771)
"240.0.0.0/4", # Reserved for Future Use (RFC 1112)
]
# get u32 ranges of bogons
BOGON_INT_RANGES = []
for cidr in BOGON_RANGES_CIDR:
network = ipaddress.ip_network(cidr, strict=False)
start_int = int(network.network_address)
end_int = int(network.broadcast_address)
BOGON_INT_RANGES.append((start_int, end_int))
print(f"Defined {len(BOGON_INT_RANGES)} bogon integer ranges.\n")
def filter_bogons_uint32(df: pd.DataFrame, ip_column_name: str) -> pd.DataFrame:
"""
Filters a DataFrame to remove rows where the uint32 IP address falls
within defined bogon ranges.
"""
if ip_column_name not in df.columns:
raise ValueError(f"Column '{ip_column_name}' not found in DataFrame.")
if not pd.api.types.is_numeric_dtype(df[ip_column_name]):
raise TypeError(f"Column '{ip_column_name}' must be a numeric type (ideally uint32).")
print(f"Filtering bogons in column '{ip_column_name}'...")
initial_rows = len(df)
bogon_mask = pd.Series(False, index=df.index)
ip_series = df[ip_column_name]
# if the address is in any bogon range, return true
for start_int, end_int in BOGON_INT_RANGES:
in_range_mask = ip_series.between(start_int, end_int)
bogon_mask |= in_range_mask
num_bogons = bogon_mask.sum()
num_valid = initial_rows - num_bogons
print(f"Initial rows: {initial_rows}")
print(f"Bogon addresses identified: {num_bogons}")
print(f"Non-bogon addresses remaining: {num_valid}")
# return non bogon addresses
df_filtered = df[~bogon_mask]
return df_filtered
unique_addr_df = filter_bogons_uint32(unique_addr_df, 'hop_addr_uint32')
Defined 15 bogon integer ranges. Filtering bogons in column 'hop_addr_uint32'... Initial rows: 99393 Bogon addresses identified: 6282 Non-bogon addresses remaining: 93111
In [32]:
# unique hop addresses df
unique_addr_df = unique_addr_df.sort_values('hop_addr_uint32').reset_index(drop=True)
# add location information
unique_addr_df = pd.merge_asof(
unique_addr_df,
ipinfo_df,
left_on='hop_addr_uint32',
right_on='start_ip_uint32',
direction='backward'
)
In [33]:
cols_to_merge = ['hop_addr', 'city', 'latitude', 'longitude']
# add location data to full dataset
traceroutes_df = pd.merge(
traceroutes_df,
unique_addr_df[cols_to_merge],
on='hop_addr',
how='left'
)
rename_map = {
"city": "ipinfo_city",
"latitude": "ipinfo_lat",
"longitude": "ipinfo_lon"
}
traceroutes_df = traceroutes_df.rename(columns=rename_map)
In [34]:
# how many hops have IPInfo locations?
(~traceroutes_df['ipinfo_city'].isna()).sum()
Out[34]:
24412016
In [35]:
# how many hops have no IPInfo locations (i.e., bogons)?
traceroutes_df['ipinfo_city'].isna().sum()
Out[35]:
2833206
Get ASN data¶
In [36]:
def find_day_url_v4():
#https://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250418-1200.pfx2as.gz
base_url = 'http://data.caida.org/datasets/routing/routeviews-prefix2as/{}/{:02d}'.format(2025, 4)
daymask = '{}{:02d}{:02d}'.format(2025, 4, 19)
dayurl = None
try:
page = requests.get(base_url, timeout=10).text
soup = BeautifulSoup(page, 'html.parser')
for node in soup.find_all('a'):
href = node.get('href')
if href and href.endswith('pfx2as.gz'):
find_dayurl = f"{base_url}/{href}" # Construct full URL
if daymask in find_dayurl:
dayurl = find_dayurl
break
except requests.exceptions.RequestException as e:
print(f"Error fetching CAIDA directory page {base_url}: {e}")
sys.exit(99)
if dayurl is None:
print('Unable to find day-specific URL for {}'.format(day.isoformat()))
sys.exit(99)
return dayurl
In [37]:
# adds ASN and BGP prefix from CAIDA data to a csv with ipaddresses
def fetch_caida_data_as_radix():
caida_url = find_day_url_v4()
caida_radix = radix.Radix()
try:
sys.stdout.write('Retrieving {} ... '.format(caida_url))
sys.stdout.flush()
#
# # Retrieve gzipped file
# Read the response as bytes
response = urllib.request.urlopen(caida_url)
gzFile = io.BytesIO()
gzFile.write(response.read())
print('OK')
except os.error as err:
print('Failed to download or parse {}'.format(caida_url))
print(err)
sys.exit(99)
gzFile.seek(0, os.SEEK_END)
print('Read {} bytes of compressed data'.format(gzFile.tell()))
gzFile.seek(0, os.SEEK_SET)
# Iterate over decompressed content
pfx2as = gzip.GzipFile(fileobj=gzFile, mode='rb')
pfx_count = 0
for line in pfx2as:
line = line.decode('utf-8')
line = line.rstrip('\n')
caida_vals = line.split('\t')
if len(caida_vals) != 3:
print('Invalid line in data retrieved from CAIDA ({})'.format(line))
prefix = '{}/{}'.format(caida_vals[0], caida_vals[1])
asn = caida_vals[2]
rnode = caida_radix.add(prefix)
rnode.data["AS"] = asn
pfx_count += 1
if (pfx_count % 100000) == 0:
sys.stdout.write('+')
sys.stdout.flush()
elif (pfx_count % 10000) == 0:
sys.stdout.write('-')
sys.stdout.flush()
pfx2as.close()
print()
print('Processed {} prefixes'.format(pfx_count))
return caida_radix
In [38]:
print(f"\nFound {len(unique_addr_df)} unique non-null IP strings to look up.") # unique hop addresses, excluding bogons
if 'lookup_radix4' not in locals():
lookup_radix4 = fetch_caida_data_as_radix()
# perform lookup
print(f"\nPerforming {len(unique_addr_df)} lookups in Radix tree...")
start_lookup_time = time.time()
lookup_results = []
successful_lookups = 0
failed_lookups = 0
for ip in unique_addr_df['hop_addr']:
try:
# search_best finds the longest matching prefix
rnode = lookup_radix4.search_best(ip)
if rnode: # Check if a node was found
lookup_results.append({
'hop_addr': ip, # The IP we looked up
'hop_asn': rnode.data.get('AS', '-'), # Get AS, default to '-'
})
successful_lookups += 1
else:
# No matching prefix found in the tree
lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
failed_lookups += 1
except ValueError: # Handle invalid IP format errors from radix lookup
lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
if failed_lookups < 10: print(f"Radix lookup failed for invalid IP format: {ip}")
failed_lookups += 1
except Exception as lookup_err: # Catch other potential errors
lookup_results.append({'hop_addr': ip, 'hop_asn': '-'})
if failed_lookups < 10: print(f"Radix lookup failed for {ip}: {lookup_err}")
failed_lookups += 1
end_lookup_time = time.time()
print(f"Lookups completed in {end_lookup_time - start_lookup_time:.2f} seconds.")
print(f" Successful: {successful_lookups}")
print(f" Failed/Not Found: {failed_lookups}")
asn_df = pd.DataFrame(lookup_results)
print("\nMerging ASN/Prefix data back into the original DataFrame...")
# drop hop_asn if it already exists
if 'hop_asn' in traceroutes_df.columns:
traceroutes_df = traceroutes_df.drop(columns=['hop_asn'])
traceroutes_df = pd.merge(
traceroutes_df,
asn_df[['hop_addr', 'hop_asn']],
on='hop_addr',
how='left'
)
# unknown ASes have value '-'
traceroutes_df['hop_asn'].fillna('-', inplace=True)
print("Merge complete.")
Found 93111 unique non-null IP strings to look up. Retrieving http://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250419-0800.pfx2as.gz ... OK Read 3585254 bytes of compressed data ---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--- Processed 1035774 prefixes Performing 93111 lookups in Radix tree... Lookups completed in 9.77 seconds. Successful: 84677 Failed/Not Found: 8434 Merging ASN/Prefix data back into the original DataFrame...
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/2055499213.py:58: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
traceroutes_df['hop_asn'].fillna('-', inplace=True)
Merge complete.
In [39]:
# how many ASes in the dataset?
traceroutes_df['hop_asn'].nunique()
Out[39]:
1827
In [40]:
# how many source ASNs?
ark_locs['vp_asn'].nunique()
Out[40]:
178
PTR record translation (Hoiho)¶
In [ ]:
perl = False # flip this to True to run the perl script (requires Perl and the JSON::XS module)
In [41]:
if perl:
# configuration files
ptr_records_file = './data/ptr_records.csv'
hoiho_script = './data/hoiho/hoiho-apply.pl'
hoiho_db = './data/hoiho/2024-08.midar-iff.geo-re.jsonl'
# store unique PTR records
print(f"\nSaving unique PTR records to {ptr_records_file}...")
traceroutes_df['hop_name'] = traceroutes_df['hop_name'].str.rstrip('.') # OI data has a trailing dot
unique_hop_names = traceroutes_df['hop_name'].unique()
pd.DataFrame(unique_hop_names).to_csv(ptr_records_file, index=False, header=False)
print(f"Saved {len(unique_hop_names)} unique names.")
# Check if required files/scripts exist before running
if not os.path.exists(hoiho_script):
print(f"ERROR: Hoiho script not found at '{hoiho_script}'")
elif not os.path.exists(hoiho_db):
print(f"ERROR: Hoiho database not found at '{hoiho_db}'")
elif not os.path.exists(ptr_records_file):
print(f"ERROR: Input PTR file not found at '{ptr_records_file}'")
else:
print(f"\nExecuting Hoiho Perl script: {hoiho_script}...")
command = f"cat {ptr_records_file} | perl {hoiho_script} {hoiho_db}"
try:
# run perl command
process_result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
check=True
)
# Get the standard output from the completed process
ptr_translations_content = process_result.stdout
print(f"Perl script executed successfully. Output length: {len(ptr_translations_content)} chars.")
# Check for errors printed to stderr
if process_result.stderr:
print("\n--- Warnings/Errors from Perl script (stderr) ---")
print(process_result.stderr)
print("-------------------------------------------------")
print("\nLoading PTR translations from script output...")
if ptr_translations_content:
ptr_map = pd.read_csv(
io.StringIO(ptr_translations_content),
sep=' ',
on_bad_lines='skip',
names=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'],
skipinitialspace=True
)
print(f"Loaded {len(ptr_map)} translation entries.")
else:
print("Warning: Perl script produced no output. PTR map will be empty.")
ptr_map = pd.DataFrame(columns=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'])
except FileNotFoundError:
print(f"ERROR: 'perl' command not found. Is Perl installed and in your system's PATH?")
except subprocess.CalledProcessError as e:
print(f"ERROR: Perl script execution failed with exit code {e.returncode}.")
print("--- Perl script STDOUT ---")
print(e.stdout)
print("--- Perl script STDERR ---")
print(e.stderr)
except Exception as e:
print(f"An unexpected error occurred during subprocess execution: {e}")
else:
traceroutes_df['hop_name'] = traceroutes_df['hop_name'].str.rstrip('.') # OI data has a trailing dot
ptr_map = pd.read_csv('./data/hoiho/hoiho_translations.csv')
print("\nMerging PTR translations into final DataFrame...")
cols_to_merge = ['hop_name', 'ptr_lat', 'ptr_lon', 'ptr_city']
traceroutes_df = pd.merge(
traceroutes_df,
ptr_map[cols_to_merge],
on='hop_name',
how='left'
)
print("Merge complete.")
Saving unique PTR records to ./data/ptr_records.csv... Saved 41759 unique names. Executing Hoiho Perl script: ./data/hoiho/hoiho-apply.pl... Perl script executed successfully. Output length: 1452294 chars. --- Warnings/Errors from Perl script (stderr) --- Use of uninitialized value in printf at ./data/hoiho/hoiho-apply.pl line 165, <STDIN> line 18677. Use of uninitialized value in printf at ./data/hoiho/hoiho-apply.pl line 165, <STDIN> line 18849. ------------------------------------------------- Loading PTR translations from script output... Loaded 19613 translation entries. Merging PTR translations into final DataFrame... Merge complete.
In [42]:
# how many PTR records have a city location?
(~ptr_map['ptr_city'].isna()).sum()
Out[42]:
16565
In [43]:
# how many PTR records have no city location?
ptr_map['ptr_city'].isna().sum()
Out[43]:
3048
In [44]:
# how many hops have PTR city locations?
(~traceroutes_df['ptr_city'].isna()).sum()
Out[44]:
3340097
In [45]:
# how many hops have NO PTR city locations?
(traceroutes_df['ptr_city'].isna()).sum()
Out[45]:
23905125
Validate hop locations¶
In [46]:
# loc_valid
# calculate distance between 'vp_lat', 'vp_lon' and 'latitude', 'longitude'
# verify distance is smaller than 'ark_to_hop' (which is in km)
def haversine_vectorized(lat1, lon1, lat2, lon2, earth_radius_km=6371):
"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees), returning distance in kilometers.
Vectorized version for Pandas Series.
"""
# Convert decimal degrees to radians
lat1 = pd.to_numeric(lat1, errors='coerce')
lon1 = pd.to_numeric(lon1, errors='coerce')
lat2 = pd.to_numeric(lat2, errors='coerce')
lon2 = pd.to_numeric(lon2, errors='coerce')
lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
# Haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
# Use arctan2 for numerical stability
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
distance_km = earth_radius_km * c
return distance_km
### verify IPinfo locations
# calculate distance
traceroutes_df['calculated_distance_km'] = haversine_vectorized(
traceroutes_df['vp_lat'],
traceroutes_df['vp_lon'],
traceroutes_df['ipinfo_lat'],
traceroutes_df['ipinfo_lon']
)
# distance between points must be less than the maximum distance travelled given the latency difference
traceroutes_df['ipinfo_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']
### verify PTR record locations
traceroutes_df['calculated_distance_km'] = haversine_vectorized(
traceroutes_df['vp_lat'],
traceroutes_df['vp_lon'],
traceroutes_df['ptr_lat'],
traceroutes_df['ptr_lon']
)
traceroutes_df['ptr_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']
traceroutes_df.drop(columns=['calculated_distance_km'], inplace=True)
# ptr valids
valid_count = traceroutes_df['ptr_loc_valid'].sum()
total_count = (~traceroutes_df['ptr_city'].isna()).sum()
print(f"PTR locations {valid_count} valid out of {total_count}")
# ipinfo valids
valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
total_count = (~traceroutes_df['ipinfo_city'].isna()).sum()
print(f"IPInfo locations {valid_count} valid out of {total_count}")
either_valid = (traceroutes_df['ptr_loc_valid'] | traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with either PTR or IPInfo valid locations: {either_valid}")
PTR locations 3281087 valid out of 3340097 IPInfo locations 20713439 valid out of 24412016 Locations with either PTR or IPInfo valid locations: 20795606
In [47]:
# how many unique hop addresses have a valid ipinfo location?
traceroutes_df[traceroutes_df['ipinfo_loc_valid']]['hop_addr'].nunique()
Out[47]:
91890
In [48]:
# how many unique hop addresses have a valid PTR location?
traceroutes_df[traceroutes_df['ptr_loc_valid']]['hop_addr'].nunique()
Out[48]:
16929
In [49]:
# how many unique hop address for ipinfo excluding those with ptr location
unique_ptr_loc_valid = traceroutes_df[traceroutes_df['ptr_loc_valid']]['hop_addr'].unique()
unique_ip_info_valid = traceroutes_df[traceroutes_df['ipinfo_loc_valid']]['hop_addr'].unique()
only_in_ipinfo = np.setdiff1d(unique_ip_info_valid, unique_ptr_loc_valid)
len(only_in_ipinfo)
Out[49]:
75143
In [50]:
in_both = np.intersect1d(unique_ptr_loc_valid, unique_ip_info_valid)
len(in_both)
Out[50]:
16747
In [51]:
union = np.union1d(unique_ptr_loc_valid, unique_ip_info_valid)
len(union)
Out[51]:
92072
In [52]:
# get preferred locations (ptr if valid, else ipinfo if valid, else NaN)
conditions = [
traceroutes_df['ptr_loc_valid'] == True,
traceroutes_df['ipinfo_loc_valid'] == True
]
# For hop_city
city_choices = [
traceroutes_df['ptr_city'],
traceroutes_df['ipinfo_city']
]
# For hop_lat
lat_choices = [
traceroutes_df['ptr_lat'],
traceroutes_df['ipinfo_lat']
]
lon_choices = [
traceroutes_df['ptr_lon'],
traceroutes_df['ipinfo_lon']
]
# get hop location info
traceroutes_df['hop_city'] = np.select(conditions, city_choices, default=pd.NA)
traceroutes_df['hop_lat'] = np.select(conditions, lat_choices, default=np.nan)
traceroutes_df['hop_lon'] = np.select(conditions, lon_choices, default=np.nan)
traceroutes_df = traceroutes_df.drop(columns=
[
'ipinfo_city', 'ipinfo_lat', 'ipinfo_lon',
'ptr_city', 'ptr_lat', 'ptr_lon',
'ptr_loc_valid', 'ipinfo_loc_valid'
]
)
print(f"no locations available {(traceroutes_df['hop_city'].isna()).sum()}")
print(f"locations available {(~traceroutes_df['hop_city'].isna()).sum()}")
no locations available 6449628 locations available 20795594
Obtain results using 'latency neighbors'¶
In [53]:
# detect neighbors
print(f"Latency threshold for latency neighbors: {LATENCY_THRESHOLD} ms")
traceroutes_df['is_neighbor'] = abs(traceroutes_df['pop_rtt'] - traceroutes_df['hop_rtt']) < LATENCY_THRESHOLD
Latency threshold for latency neighbors: 3 ms
In [54]:
# for geolocating PoPs we only use VPs within 12ms of the anycast PoP
traceroutes_df_f = traceroutes_df[traceroutes_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY]
traceroutes_df_f
Out[54]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 1.0.0.0 | 9 | 13 | 192.168.1.1 | 1 | 1.281 | 64 | NaN | NaN | ... | 7303 | aep | 5.976 | 130.890781 | 610.619288 | - | <NA> | NaN | NaN | False |
| 1 | aep3-ar | 1.0.0.0 | 9 | 13 | 192.168.0.1 | 2 | 1.606 | 63 | NaN | NaN | ... | 7303 | aep | 5.976 | 164.098825 | 610.619288 | - | <NA> | NaN | NaN | False |
| 2 | aep3-ar | 1.0.0.0 | 9 | 13 | 181.96.113.217 | 7 | 13.811 | 249 | 181.96.113.217 | host217.181-96-113.telecom.net.ar | ... | 7303 | aep | 5.976 | 1411.188586 | 610.619288 | 7303 | Buenos Aires | -34.613152 | -58.377232 | False |
| 3 | aep3-ar | 1.0.0.0 | 9 | 13 | 181.13.127.254 | 8 | 9.127 | 248 | 181.13.127.254 | host254.181-13-127.telecom.net.ar | ... | 7303 | aep | 5.976 | 932.584044 | 610.619288 | 7303 | Ciudad del Libertador General San Martín | -34.577888 | -58.538639 | False |
| 4 | aep3-ar | 1.0.0.0 | 9 | 13 | 1.0.0.0 | 9 | 12.355 | 56 | NaN | NaN | ... | 7303 | aep | 5.976 | 1262.416551 | 610.619288 | 13335 | <NA> | NaN | NaN | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27245190 | oak5-us | 192.188.22.6 | 14 | 20 | 75.101.33.185 | 7 | 2.236 | 249 | 75.101.33.185 | 100.ae1.nrd1.equinix-sj.sonic.net | ... | 46375 | oak | 4.146 | 228.471340 | 423.632458 | 46375 | San Jose | 37.339390 | -121.894958 | True |
| 27245191 | oak5-us | 192.188.22.6 | 14 | 20 | 206.223.116.137 | 8 | 2.320 | 57 | 206.223.116.137 | eqx-sj.es.net | ... | 46375 | oak | 4.146 | 237.054342 | 423.632458 | - | San Jose | 37.339390 | -121.894958 | True |
| 27245192 | oak5-us | 192.188.22.6 | 14 | 20 | 198.129.100.30 | 12 | 4.476 | 53 | 198.129.100.30 | lbnl59-cr6--esnetwest-se425.ip.es.net | ... | 46375 | oak | 4.146 | 457.351395 | 423.632458 | 292 | San Jose | 37.339390 | -121.894958 | True |
| 27245193 | oak5-us | 192.188.22.6 | 14 | 20 | 198.129.100.29 | 13 | 4.431 | 243 | 198.129.100.29 | site--esnetwest-se425.ip.es.net | ... | 46375 | oak | 4.146 | 452.753358 | 423.632458 | 292 | San Jose | 37.339390 | -121.894958 | True |
| 27245194 | oak5-us | 192.188.22.6 | 14 | 20 | 192.188.22.6 | 14 | 4.146 | 51 | 192.188.22.6 | nmx1.es.net | ... | 46375 | oak | 4.146 | 423.632458 | 423.632458 | 3443 | San Jose | 37.339390 | -121.894958 | True |
12040885 rows × 24 columns
In [55]:
# how many valid tx_hostname,dst pairs remain?
traceroutes_df_f[['tx_hostname', 'dst']].drop_duplicates().shape[0]
Out[55]:
1705930
In [56]:
# how many valid dsts?
traceroutes_df_f['dst'].nunique()
Out[56]:
13706
In [57]:
# find cases where the VP itself is a latency neighbor
print("Finding Ark VP latency neighbors")
is_dest_hop = traceroutes_df_f['dst'] == traceroutes_df_f['hop_addr']
is_fast_dest_hop = is_dest_hop & (traceroutes_df_f['hop_rtt'] < LATENCY_THRESHOLD)
fast_dest_df = traceroutes_df_f[is_fast_dest_hop]
fast_dest_results_list = fast_dest_df.apply(
lambda row: pd.Series({
'tx_hostname': row['tx_hostname'],
'dst': row['dst'],
'inferred_city': row['tx_city'],
'vp_lat': row['vp_lat'],
'vp_lon': row['vp_lon'],
'hop_lat': row['vp_lat'], # use vp coords as hop coords
'hop_lon': row['vp_lon']
}), axis=1
).drop_duplicates().to_dict('records')
print(f"Found {len(fast_dest_results_list)} routes resolved by latency neighbor VPs")
fast_dest_results_df = pd.DataFrame(fast_dest_results_list)
Finding Ark VP latency neighbors Found 939187 routes resolved by latency neighbor VPs
In [58]:
# Filter out hop == dst, to identify latency neighbor hops
hops_df_f = traceroutes_df_f[traceroutes_df_f['dst'] != traceroutes_df_f['hop_addr']]
print(f"DataFrame shape after removing all destination hops: {hops_df_f.shape}")
def find_closest_neighbor_data(group):
"""
Finds the data (city, lat, lon, latitude, longitude) of the latency neighbor
closest to the destination (highest TTL), ensuring the location is valid.
Returns a Series with the required data or NAs.
"""
# Filter on latency neighbors of the anycast address
neighbors = group[group['is_neighbor'] == True]
# Filter on neighbors with city geolocation data (already validated)
valid_city_neighbors = neighbors.dropna(subset=['hop_city'])
if not valid_city_neighbors.empty:
# Get the neighbor location closest to the VP (lowest TTL)
closest_neighbor_to_vp = valid_city_neighbors.sort_values('hop_probe_ttl', ascending=True).iloc[0]
# we only use latency neighbors if there exists one within 9ms of the VP (to avoid false geolocations due to RTT noise)
# no neighbor within 9ms
if closest_neighbor_to_vp['hop_rtt'] > 9:
return pd.Series({
'inferred_city': pd.NA,
'vp_lat': pd.NA,
'vp_lon': pd.NA,
'hop_lat': pd.NA,
'hop_lon': pd.NA
}, index=['inferred_city', 'vp_lat', 'vp_lon', 'hop_lat', 'hop_lon'])
closest_neighbor_to_pop = valid_city_neighbors.sort_values('hop_probe_ttl', ascending=False).iloc[0]
# return location
return pd.Series({
'inferred_city': closest_neighbor_to_pop['hop_city'],
'vp_lat': closest_neighbor_to_pop['vp_lat'],
'vp_lon': closest_neighbor_to_pop['vp_lon'],
'hop_lat': closest_neighbor_to_pop['hop_lat'],
'hop_lon': closest_neighbor_to_pop['hop_lon']
})
else:
#nNo valid location found
return pd.Series({
'inferred_city': pd.NA,
'vp_lat': pd.NA,
'vp_lon': pd.NA,
'hop_lat': pd.NA,
'hop_lon': pd.NA
}, index=['inferred_city', 'vp_lat', 'vp_lon', 'hop_lat', 'hop_lon'])
# Group by tx_hostname, dst pairs
print("\nApplying neighbor logic to remaining routes...")
neighbor_results_df = hops_df_f.groupby(
['tx_hostname', 'dst'], observed=True, dropna=False
).apply(find_closest_neighbor_data).reset_index()
# combine results
print("\nCombining results from VP latency neighbors and hop latency neighbor logic...")
# Concatenate the two sets of results
inferred_df = pd.concat([fast_dest_results_df, neighbor_results_df], ignore_index=True)
# Prioritize fast_dest_results_df over neighbor_results_df
inferred_df = inferred_df.drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
# remove NaNs
inferred_df = inferred_df[~inferred_df['inferred_city'].isna()]
inferred_df # may take a while
DataFrame shape after removing all destination hops: (10274463, 24) Applying neighbor logic to remaining routes...
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/1495268981.py:54: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. neighbor_results_df = hops_df_f.groupby(
Combining results from VP latency neighbors and hop latency neighbor logic...
Out[58]:
| tx_hostname | dst | inferred_city | vp_lat | vp_lon | hop_lat | hop_lon | |
|---|---|---|---|---|---|---|---|
| 0 | dar-tz | 192.31.196.1 | Dar es Salaam | -6.79 | 39.21 | -6.79 | 39.21 |
| 1 | dar-tz | 192.33.14.30 | Dar es Salaam | -6.79 | 39.21 | -6.79 | 39.21 |
| 2 | dar-tz | 192.58.128.30 | Dar es Salaam | -6.79 | 39.21 | -6.79 | 39.21 |
| 3 | dar-tz | 194.0.17.1 | Dar es Salaam | -6.79 | 39.21 | -6.79 | 39.21 |
| 4 | dar-tz | 194.0.27.12 | Dar es Salaam | -6.79 | 39.21 | -6.79 | 39.21 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2641136 | zrh4-ch | 97.74.111.51 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 |
| 2641137 | zrh4-ch | 97.74.96.0 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 |
| 2641138 | zrh4-ch | 97.74.97.35 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 |
| 2641139 | zrh4-ch | 97.74.98.0 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 |
| 2641140 | zrh4-ch | 97.74.99.57 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 |
1436047 rows × 7 columns
Airport clustering¶
In [4]:
airport_df = pd.read_csv('./data/airports.csv', sep='\t')
print("--- Pre-processing Airport Data ---")
start_time = time.time()
# split coords
coords = airport_df['lat long'].str.split(' ', expand=True)
airport_df['a_lat'] = pd.to_numeric(coords[0], errors='coerce')
airport_df['a_lon'] = pd.to_numeric(coords[1], errors='coerce')
airport_df['a_heuristic'] = pd.to_numeric(airport_df['pop'].str.split(' ', expand=True)[0], errors='coerce')
# convert airport coords to radians
airport_df['a_lat_rad'] = np.radians(airport_df['a_lat'])
airport_df['a_lon_rad'] = np.radians(airport_df['a_lon'])
# numpy arrays for fast lookups
airport_coords_rad = airport_df[['a_lat_rad', 'a_lon_rad']].values
airport_heuristics = airport_df['a_heuristic'].values
airport_lat = airport_df['a_lat'].values
airport_lon = airport_df['a_lon'].values
airport_iata = airport_df['#IATA'].values
airport_name = airport_df['name'].values
print(f"Airport pre-processing took {time.time() - start_time:.2f}s")
airport_df
--- Pre-processing Airport Data --- Airport pre-processing took 0.01s
Out[4]:
| #IATA | size | name | lat long | country_code | city | pop | heuristic | 1 | 2 | 3 | a_lat | a_lon | a_heuristic | a_lat_rad | a_lon_rad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ASH | large_airport | Ashburn Airport | 39.043611 -77.4875 | US | Ashburn | 43511 0 0 0 | NaN | NaN | NaN | NaN | 39.043611 | -77.487500 | 43511 | 0.681440 | -1.352412 |
| 1 | AMX | large_airport | Amsterdam Airport Schiphol | 52.3086013794 4.7638897896 | NL | Amsterdam | 741636 0 0 0 | NaN | NaN | NaN | NaN | 52.308601 | 4.763890 | 741636 | 0.912957 | 0.083146 |
| 2 | AMS | large_airport | Amsterdam Airport Schiphol | 52.3086013794 4.7638897896 | NL | Amsterdam | 741636 0 0 0 | NaN | NaN | NaN | NaN | 52.308601 | 4.763890 | 741636 | 0.912957 | 0.083146 |
| 3 | AMZ | large_airport | Amsterdam Airport Schiphol | 52.3086013794 4.7638897896 | NL | Amsterdam | 741636 0 0 0 | NaN | NaN | NaN | NaN | 52.308601 | 4.763890 | 741636 | 0.912957 | 0.083146 |
| 4 | PIX | large_airport | San Francisco International Airport | 37.6189994812 -122.375 | US | San Francisco | 805235 0 -122.389979 37.615223 | NaN | NaN | NaN | NaN | 37.618999 | -122.375000 | 805235 | 0.656575 | -2.135847 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3023 | CPM | large_airport | Los Angeles International Airport | 33.94250107 -118.4079971 | US | Los Angeles | 3792621 0 0 0 | NaN | NaN | NaN | NaN | 33.942501 | -118.407997 | 3792621 | 0.592408 | -2.066609 |
| 3024 | OXR | large_airport | Los Angeles International Airport | 33.94250107 -118.4079971 | US | Los Angeles | 3792621 0 0 0 | NaN | NaN | NaN | NaN | 33.942501 | -118.407997 | 3792621 | 0.592408 | -2.066609 |
| 3025 | RHV | medium_airport | Evelio Javier Airport | 10.765999794 121.932998657 | PH | San Jose | 118807 2 0 0 | NaN | NaN | NaN | NaN | 10.766000 | 121.932999 | 118807 | 0.187902 | 2.128132 |
| 3026 | STO | large_airport | Stockholm-Arlanda Airport | 59.6519012451 17.9186000824 | SE | Stockholm | 1253309 0 0 0 | NaN | NaN | NaN | NaN | 59.651901 | 17.918600 | 1253309 | 1.041122 | 0.312739 |
| 3027 | CNJ | large_airport | Newark Liberty International Airport | 40.6925010681 -74.1687011719 | US | Newark | 277140 0 0 0 | NaN | NaN | NaN | NaN | 40.692501 | -74.168701 | 277140 | 0.710218 | -1.294488 |
3028 rows × 16 columns
In [5]:
# get inferred locations radian coordinates
inferred_df['hop_lat'] = pd.to_numeric(inferred_df['hop_lat'], errors='coerce')
inferred_df['hop_lon'] = pd.to_numeric(inferred_df['hop_lon'], errors='coerce')
inferred_df['hop_lat_rad'] = np.radians(inferred_df['hop_lat'])
inferred_df['hop_lon_rad'] = np.radians(inferred_df['hop_lon'])
inferred_coords_rad = inferred_df[['hop_lat_rad', 'hop_lon_rad']].values
inferred_coords_rad
Out[5]:
array([[-0.11850786, 0.6843436 ],
[-0.11850786, 0.6843436 ],
[-0.11850786, 0.6843436 ],
...,
[ 0.82670432, 0.14922565],
[ 0.82670432, 0.14922565],
[ 0.82670432, 0.14922565]], shape=(1477348, 2))
In [6]:
# create ball tree
print("\n--- Building BallTree ---")
tree = BallTree(airport_coords_rad, metric='haversine')
print("\n--- Querying nearby airports using BallTree ---")
start_time = time.time()
# find distances (and indices) of airports within the radius
nearby_indices_list, nearby_distances_list_rad = tree.query_radius(
inferred_coords_rad, r=DISK_DISTANCE_RAD, return_distance=True
)
nearby_distances_list_km = [dist_rad * EARTH_RADIUS_KM for dist_rad in nearby_distances_list_rad]
print(f"BallTree query_radius took {time.time() - start_time:.2f}s")
--- Building BallTree --- --- Querying nearby airports using BallTree --- BallTree query_radius took 9.05s
In [7]:
# fallback airports (closest airport)
print("\n--- Finding the absolute closest airport (fallback) ---")
start_time = time.time()
# query returns distances (in radians) and indices for the k=1 nearest neighbor
closest_dist_rad, closest_idx = tree.query(inferred_coords_rad, k=1)
# closest_dist_km = closest_dist_rad[:, 0] * EARTH_RADIUS_KM # Distances in km
closest_indices = closest_idx[:, 0] # Get the index array
print(f"BallTree query (k=1) took {time.time() - start_time:.2f}s")
--- Finding the absolute closest airport (fallback) --- BallTree query (k=1) took 15.09s
In [8]:
results = []
print("\n--- Calculating heuristic for nearby airports ---")
start_time = time.time()
for i in range(len(inferred_coords_rad)): # iterate through all final latency neighbors
# retrieve the airports within the radius, and their distance from the latency neighbor
nearby_indices = nearby_indices_list[i]
nearby_distances_km = nearby_distances_list_km[i]
# retrieve fallback airport for this point
fallback_idx = closest_indices[i]
fallback_airport_details = {
'lat': airport_lat[fallback_idx],
'lon': airport_lon[fallback_idx],
'name': airport_name[fallback_idx],
'iata': airport_iata[fallback_idx]
}
# heuristic calculation
best_p = -1.0
best_idx_in_disk = -1 # -1 -> no suitable airport found with heuristic
if len(nearby_indices) > 0:
# Filter again based on exact distance (though query_radius should be accurate)
# This ensures we strictly adhere to DISK_DISTANCE_KM
within_disk_mask = nearby_distances_km < DISK_DISTANCE_KM
candidate_indices = nearby_indices[within_disk_mask]
if len(candidate_indices) > 0:
candidate_distances = nearby_distances_km[within_disk_mask]
candidate_heuristics = airport_heuristics[candidate_indices]
# Calculate totals for the denominator using ONLY candidates within the disk
total_heuristic = np.sum(candidate_heuristics)
# Sum of (Disk Distance - Actual Distance) for candidates
total_inverse_distance_weighted = np.sum(DISK_DISTANCE_KM - candidate_distances)
# Check denominators are valid AND non-zero
if total_heuristic > 0 and total_inverse_distance_weighted > 1e-9: # Use small epsilon for float comparison
# Calculate probability p for each candidate
p_heuristic = ALPHA * (candidate_heuristics / total_heuristic)
p_distance = (1 - ALPHA) * ((DISK_DISTANCE_KM - candidate_distances) / total_inverse_distance_weighted)
p = p_heuristic + p_distance
# Find the index *among candidates* with the highest p
best_candidate_local_idx = np.argmax(p)
# Map back to the original airport_df index
best_idx_in_disk = candidate_indices[best_candidate_local_idx]
if best_idx_in_disk != -1:
# return best airport based on heuristic
results.append({
'lat': airport_lat[best_idx_in_disk],
'lon': airport_lon[best_idx_in_disk],
'name': airport_name[best_idx_in_disk],
'iata': airport_iata[best_idx_in_disk]
})
else:
# no airport in disk, use the closest airport
results.append(fallback_airport_details)
print(f"Heuristic calculation loop took {time.time() - start_time:.2f}s")
print("\n--- Assembling final results ---")
# join results with inferred_df
results_df = pd.DataFrame(results, index=inferred_df.index)
final_df = inferred_df.join(results_df)
final_df.rename(columns={'lat': 'inferred_airport_lat',
'lon': 'inferred_airport_lon',
'name': 'inferred_airport_name',
'iata': 'inferred_airport_iata'}, inplace=True)
final_df
--- Calculating heuristic for nearby airports --- Heuristic calculation loop took 19.81s --- Assembling final results ---
Out[8]:
| tx_hostname | dst | inferred_city | vp_lat | vp_lon | hop_lat | hop_lon | clustered_inferred_city | clustered_lat | clustered_lon | hop_lat_rad | hop_lon_rad | inferred_airport_lat | inferred_airport_lon | inferred_airport_name | inferred_airport_iata | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | dar-tz | 192.31.196.1 | Dar es Salaam | -6.79 | 39.21 | -6.790000 | 39.21 | Dar es Salaam | -6.790000 | 39.21 | -0.118508 | 0.684344 | -6.878056 | 39.203333 | Dar es Salaam | TIX |
| 1 | dar-tz | 192.33.14.30 | Dar es Salaam | -6.79 | 39.21 | -6.790000 | 39.21 | Dar es Salaam | -6.790000 | 39.21 | -0.118508 | 0.684344 | -6.878056 | 39.203333 | Dar es Salaam | TIX |
| 2 | dar-tz | 192.58.128.30 | Dar es Salaam | -6.79 | 39.21 | -6.790000 | 39.21 | Dar es Salaam | -6.790000 | 39.21 | -0.118508 | 0.684344 | -6.878056 | 39.203333 | Dar es Salaam | TIX |
| 3 | dar-tz | 194.0.17.1 | Dar es Salaam | -6.79 | 39.21 | -6.790000 | 39.21 | Dar es Salaam | -6.790000 | 39.21 | -0.118508 | 0.684344 | -6.878056 | 39.203333 | Dar es Salaam | TIX |
| 4 | dar-tz | 194.0.27.12 | Dar es Salaam | -6.79 | 39.21 | -6.790000 | 39.21 | Dar es Salaam | -6.790000 | 39.21 | -0.118508 | 0.684344 | -6.878056 | 39.203333 | Dar es Salaam | TIX |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1477343 | zrh4-ch | 97.74.111.51 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 | Zürich | 47.366669 | 8.55 | 0.826704 | 0.149226 | 47.464699 | 8.549170 | Zürich Airport | ZRH |
| 1477344 | zrh4-ch | 97.74.96.0 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 | Zürich | 47.366669 | 8.55 | 0.826704 | 0.149226 | 47.464699 | 8.549170 | Zürich Airport | ZRH |
| 1477345 | zrh4-ch | 97.74.97.35 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 | Zürich | 47.366669 | 8.55 | 0.826704 | 0.149226 | 47.464699 | 8.549170 | Zürich Airport | ZRH |
| 1477346 | zrh4-ch | 97.74.98.0 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 | Zürich | 47.366669 | 8.55 | 0.826704 | 0.149226 | 47.464699 | 8.549170 | Zürich Airport | ZRH |
| 1477347 | zrh4-ch | 97.74.99.57 | Zürich | 47.38 | 8.54 | 47.366669 | 8.55 | Zürich | 47.366669 | 8.55 | 0.826704 | 0.149226 | 47.464699 | 8.549170 | Zürich Airport | ZRH |
1477348 rows × 16 columns
merge and compare with MAnycastR census¶
In [66]:
final_df['prefix'] = final_df['dst'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.0/24') # prefix for merging with census results
In [67]:
# get TR results
city_counts_per_dst = final_df.groupby('prefix')['inferred_airport_iata'].apply(lambda x: x.dropna().nunique())
In [68]:
# Merge census and traceroute enumeration results
print("Merging the two DataFrames...")
merged_df = pd.merge(
census[['prefix', 'iGreedyICMPv4', 'MAnycastICMPv4']],
city_counts_per_dst,
on='prefix',
how='outer'
)
print(f"Merged DataFrame created with {len(merged_df)} rows.")
# fill NaNs with 0
columns_to_fill = ['iGreedyICMPv4', 'inferred_airport_iata']
for col in columns_to_fill:
# Check if NaN filling is needed before doing it
nan_count = merged_df[col].isna().sum()
merged_df[col] = merged_df[col].fillna(0)
merged_df[col] = merged_df[col].astype(int)
# Reorder columns for better readability
final_columns = ['prefix', 'inferred_airport_iata', 'iGreedyICMPv4', 'MAnycastICMPv4']
merged_df = merged_df[final_columns]
merged_df = merged_df.rename(columns={'inferred_airport_iata': 'TR', 'iGreedyICMPv4': 'GCD'})
merged_df
Merging the two DataFrames... Merged DataFrame created with 13765 rows.
Out[68]:
| prefix | TR | GCD | MAnycastICMPv4 | |
|---|---|---|---|---|
| 0 | 1.0.0.0/24 | 86 | 71 | 27 |
| 1 | 1.1.1.0/24 | 82 | 71 | 28 |
| 2 | 1.10.10.0/24 | 2 | 3 | 1 |
| 3 | 1.12.0.0/24 | 6 | 5 | 5 |
| 4 | 1.12.12.0/24 | 3 | 3 | 3 |
| ... | ... | ... | ... | ... |
| 13760 | 99.83.254.0/24 | 37 | 49 | 25 |
| 13761 | 99.83.255.0/24 | 34 | 49 | 25 |
| 13762 | 99.99.97.0/24 | 2 | 2 | 2 |
| 13763 | 99.99.98.0/24 | 2 | 2 | 2 |
| 13764 | 99.99.99.0/24 | 4 | 4 | 4 |
13765 rows × 4 columns
In [69]:
# add ASN data to merged_df
# Perform lookups for upstream IPs
upstream_lookups = 0
upstream_asns = []
for ip in merged_df['prefix']:
if type(ip) is not str or len(ip) == 0:
upstream_asns.append('-')
continue
try:
rnode = lookup_radix4.search_best(ip)
upstream_asns.append(rnode.data['AS'])
upstream_lookups += 1
except:
upstream_asns.append('-')
print('Radix tree lookup of {} failed'.format(ip))
print('{} successful lookups for {} entries'.format(upstream_lookups, len(merged_df)))
# Add additional columns to the output
merged_df['anycast_asn'] = upstream_asns
merged_df.head()
Radix tree lookup of 84.205.69.0/24 failed Radix tree lookup of 84.205.75.0/24 failed 13763 successful lookups for 13765 entries
Out[69]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | |
|---|---|---|---|---|---|
| 0 | 1.0.0.0/24 | 86 | 71 | 27 | 13335 |
| 1 | 1.1.1.0/24 | 82 | 71 | 28 | 13335 |
| 2 | 1.10.10.0/24 | 2 | 3 | 1 | 148000 |
| 3 | 1.12.0.0/24 | 6 | 5 | 5 | 132203_45090 |
| 4 | 1.12.12.0/24 | 3 | 3 | 3 | 132203_45090 |
In [70]:
# determine which targets have completed traceroutes
merged_df['has_traceroute'] = merged_df['prefix'].isin(traceroutes_df['prefix'])
merged_df.head()
Out[70]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 0 | 1.0.0.0/24 | 86 | 71 | 27 | 13335 | True |
| 1 | 1.1.1.0/24 | 82 | 71 | 28 | 13335 | True |
| 2 | 1.10.10.0/24 | 2 | 3 | 1 | 148000 | True |
| 3 | 1.12.0.0/24 | 6 | 5 | 5 | 132203_45090 | True |
| 4 | 1.12.12.0/24 | 3 | 3 | 3 | 132203_45090 | True |
In [71]:
# GCD confirmed prefixes with no traceroute data
merged_df[~merged_df['has_traceroute'] & (merged_df['GCD'] > 1)]
Out[71]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 2573 | 109.122.206.0/24 | 0 | 9 | 7 | 49287 | False |
| 2930 | 134.231.255.0/24 | 0 | 43 | 16 | 396982 | False |
| 3912 | 151.242.40.0/24 | 0 | 2 | 2 | 36530_214025 | False |
| 4326 | 163.181.59.0/24 | 0 | 10 | 7 | 24429 | False |
| 4505 | 167.86.43.0/24 | 0 | 3 | 2 | 25773 | False |
| 5573 | 185.18.184.0/24 | 0 | 67 | 27 | 13335 | False |
| 5625 | 185.211.51.0/24 | 0 | 25 | 24 | 20473 | False |
| 5839 | 192.229.156.0/24 | 0 | 2 | 2 | 15133 | False |
| 5840 | 192.229.168.0/24 | 0 | 2 | 2 | 15133 | False |
| 6169 | 198.7.16.0/24 | 0 | 2 | 2 | 15133 | False |
| 6220 | 199.212.93.0/24 | 0 | 65 | 27 | 9327 | False |
| 6688 | 203.31.251.0/24 | 0 | 2 | 2 | 149512 | False |
| 6747 | 204.74.102.0/24 | 0 | 8 | 7 | 19905 | False |
| 6976 | 209.85.152.0/24 | 0 | 23 | 13 | 15169 | False |
| 6977 | 209.85.153.0/24 | 0 | 5 | 4 | 15169 | False |
| 6978 | 209.85.155.0/24 | 0 | 4 | 3 | 15169 | False |
| 6979 | 209.85.204.0/24 | 0 | 22 | 13 | 15169 | False |
| 6980 | 209.85.205.0/24 | 0 | 5 | 4 | 15169 | False |
| 6981 | 209.85.207.0/24 | 0 | 4 | 3 | 15169 | False |
| 7224 | 23.141.168.0/24 | 0 | 48 | 25 | 16509 | False |
| 9365 | 34.34.34.0/24 | 0 | 2 | 2 | 396982 | False |
| 11610 | 43.152.97.0/24 | 0 | 3 | 2 | 132203 | False |
| 11779 | 45.192.222.0/24 | 0 | 65 | 27 | 13335 | False |
| 12526 | 66.173.0.0/24 | 0 | 6 | 3 | 4181 | False |
| 13194 | 80.78.134.0/24 | 0 | 4 | 0 | 209241 | False |
| 13396 | 88.216.69.0/24 | 0 | 65 | 27 | 13335 | False |
| 13449 | 91.202.63.0/24 | 0 | 2 | 2 | 44571 | False |
| 13477 | 92.223.15.0/24 | 0 | 8 | 5 | 199524 | False |
| 13576 | 95.135.109.0/24 | 0 | 2 | 2 | 214025 | False |
| 13577 | 95.135.83.0/24 | 0 | 2 | 2 | 214025 | False |
In [72]:
# both anycast
merged_df[(merged_df['TR'] > 1) & (merged_df['GCD'] > 1)]
Out[72]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 0 | 1.0.0.0/24 | 86 | 71 | 27 | 13335 | True |
| 1 | 1.1.1.0/24 | 82 | 71 | 28 | 13335 | True |
| 2 | 1.10.10.0/24 | 2 | 3 | 1 | 148000 | True |
| 3 | 1.12.0.0/24 | 6 | 5 | 5 | 132203_45090 | True |
| 4 | 1.12.12.0/24 | 3 | 3 | 3 | 132203_45090 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 13760 | 99.83.254.0/24 | 37 | 49 | 25 | 16509 | True |
| 13761 | 99.83.255.0/24 | 34 | 49 | 25 | 16509 | True |
| 13762 | 99.99.97.0/24 | 2 | 2 | 2 | 7018 | True |
| 13763 | 99.99.98.0/24 | 2 | 2 | 2 | 7018 | True |
| 13764 | 99.99.99.0/24 | 4 | 4 | 4 | 7018 | True |
13478 rows × 6 columns
In [73]:
# igreedy and not traceroute (includes 30 with no traceroute data)
merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2)]
Out[73]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 9 | 100.0.0.0/24 | 1 | 2 | 1 | 701 | True |
| 28 | 103.138.240.0/24 | 1 | 2 | 1 | 45177 | True |
| 35 | 103.144.246.0/24 | 1 | 2 | 1 | 138152 | True |
| 39 | 103.156.106.0/24 | 1 | 2 | 2 | 141202 | True |
| 42 | 103.158.186.0/24 | 1 | 2 | 2 | 18041 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 13580 | 95.181.180.0/24 | 1 | 3 | 4 | 210756 | True |
| 13581 | 95.181.181.0/24 | 1 | 2 | 4 | 210756 | True |
| 13582 | 95.181.182.0/24 | 1 | 3 | 4 | 210756 | True |
| 13622 | 98.143.88.0/24 | 1 | 2 | 2 | 19171 | True |
| 13625 | 98.96.233.0/24 | 0 | 3 | 3 | 62610 | True |
287 rows × 6 columns
In [74]:
# igreedy only with traceroute
merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2) & merged_df['has_traceroute']]
Out[74]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 9 | 100.0.0.0/24 | 1 | 2 | 1 | 701 | True |
| 28 | 103.138.240.0/24 | 1 | 2 | 1 | 45177 | True |
| 35 | 103.144.246.0/24 | 1 | 2 | 1 | 138152 | True |
| 39 | 103.156.106.0/24 | 1 | 2 | 2 | 141202 | True |
| 42 | 103.158.186.0/24 | 1 | 2 | 2 | 18041 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 13580 | 95.181.180.0/24 | 1 | 3 | 4 | 210756 | True |
| 13581 | 95.181.181.0/24 | 1 | 2 | 4 | 210756 | True |
| 13582 | 95.181.182.0/24 | 1 | 3 | 4 | 210756 | True |
| 13622 | 98.143.88.0/24 | 1 | 2 | 2 | 19171 | True |
| 13625 | 98.96.233.0/24 | 0 | 3 | 3 | 62610 | True |
257 rows × 6 columns
In [75]:
# group GCD only (not detected with traceroute) by number of sites found -> most found only by GCD are in 2,3 locations
merged_df[(merged_df['GCD'] > 1) & (merged_df['TR'] < 2) & merged_df['has_traceroute']].groupby('GCD')['prefix'].nunique(0)
Out[75]:
GCD 2 195 3 29 4 4 5 3 6 1 7 1 8 2 9 2 10 1 12 1 13 3 14 2 15 1 16 1 17 1 18 6 19 1 29 2 32 1 Name: prefix, dtype: int64
In [76]:
# how many target ASes for traceroute?
merged_df[merged_df['has_traceroute']]['anycast_asn'].nunique()
Out[76]:
906
In [77]:
# targets without traceroute data
merged_df[~merged_df['has_traceroute'] & (merged_df['MAnycastICMPv4'] > 1)]
Out[77]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 2573 | 109.122.206.0/24 | 0 | 9 | 7 | 49287 | False |
| 2930 | 134.231.255.0/24 | 0 | 43 | 16 | 396982 | False |
| 3912 | 151.242.40.0/24 | 0 | 2 | 2 | 36530_214025 | False |
| 4326 | 163.181.59.0/24 | 0 | 10 | 7 | 24429 | False |
| 4505 | 167.86.43.0/24 | 0 | 3 | 2 | 25773 | False |
| 5573 | 185.18.184.0/24 | 0 | 67 | 27 | 13335 | False |
| 5625 | 185.211.51.0/24 | 0 | 25 | 24 | 20473 | False |
| 5839 | 192.229.156.0/24 | 0 | 2 | 2 | 15133 | False |
| 5840 | 192.229.168.0/24 | 0 | 2 | 2 | 15133 | False |
| 6169 | 198.7.16.0/24 | 0 | 2 | 2 | 15133 | False |
| 6220 | 199.212.93.0/24 | 0 | 65 | 27 | 9327 | False |
| 6688 | 203.31.251.0/24 | 0 | 2 | 2 | 149512 | False |
| 6747 | 204.74.102.0/24 | 0 | 8 | 7 | 19905 | False |
| 6976 | 209.85.152.0/24 | 0 | 23 | 13 | 15169 | False |
| 6977 | 209.85.153.0/24 | 0 | 5 | 4 | 15169 | False |
| 6978 | 209.85.155.0/24 | 0 | 4 | 3 | 15169 | False |
| 6979 | 209.85.204.0/24 | 0 | 22 | 13 | 15169 | False |
| 6980 | 209.85.205.0/24 | 0 | 5 | 4 | 15169 | False |
| 6981 | 209.85.207.0/24 | 0 | 4 | 3 | 15169 | False |
| 7224 | 23.141.168.0/24 | 0 | 48 | 25 | 16509 | False |
| 9365 | 34.34.34.0/24 | 0 | 2 | 2 | 396982 | False |
| 11610 | 43.152.97.0/24 | 0 | 3 | 2 | 132203 | False |
| 11779 | 45.192.222.0/24 | 0 | 65 | 27 | 13335 | False |
| 12526 | 66.173.0.0/24 | 0 | 6 | 3 | 4181 | False |
| 13396 | 88.216.69.0/24 | 0 | 65 | 27 | 13335 | False |
| 13449 | 91.202.63.0/24 | 0 | 2 | 2 | 44571 | False |
| 13477 | 92.223.15.0/24 | 0 | 8 | 5 | 199524 | False |
| 13576 | 95.135.109.0/24 | 0 | 2 | 2 | 214025 | False |
| 13577 | 95.135.83.0/24 | 0 | 2 | 2 | 214025 | False |
In [79]:
# which ASes had the most failed traceroutes?
merged_df[~merged_df['has_traceroute'] & (merged_df['MAnycastICMPv4'] > 1)].groupby('anycast_asn')['prefix'].nunique().sort_values(ascending=False)
Out[79]:
anycast_asn 15169 6 15133 3 13335 3 214025 2 396982 2 25773 1 49287 1 44571 1 4181 1 36530_214025 1 132203 1 24429 1 20473 1 199524 1 19905 1 16509 1 149512 1 9327 1 Name: prefix, dtype: int64
In [80]:
# only iGreedy
merged_df[(merged_df['TR'] < 2) & (merged_df['GCD'] > 1)]#['has_traceroute'].sum()
Out[80]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 9 | 100.0.0.0/24 | 1 | 2 | 1 | 701 | True |
| 28 | 103.138.240.0/24 | 1 | 2 | 1 | 45177 | True |
| 35 | 103.144.246.0/24 | 1 | 2 | 1 | 138152 | True |
| 39 | 103.156.106.0/24 | 1 | 2 | 2 | 141202 | True |
| 42 | 103.158.186.0/24 | 1 | 2 | 2 | 18041 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 13580 | 95.181.180.0/24 | 1 | 3 | 4 | 210756 | True |
| 13581 | 95.181.181.0/24 | 1 | 2 | 4 | 210756 | True |
| 13582 | 95.181.182.0/24 | 1 | 3 | 4 | 210756 | True |
| 13622 | 98.143.88.0/24 | 1 | 2 | 2 | 19171 | True |
| 13625 | 98.96.233.0/24 | 0 | 3 | 3 | 62610 | True |
287 rows × 6 columns
In [81]:
# unraveling MOAS infrastructures -> 4812 is a peer of 45090, but not 132203 [future work]
traceroutes_df[(traceroutes_df['dst'] == '1.12.14.16') & (traceroutes_df['tx_hostname'] == 'abz2-uk')][['tx_hostname', 'dst', 'hop_probe_ttl', 'hop_rtt', 'hop_addr', 'hop_asn', 'hop_city']]
Out[81]:
| tx_hostname | dst | hop_probe_ttl | hop_rtt | hop_addr | hop_asn | hop_city | |
|---|---|---|---|---|---|---|---|
| 1460517 | abz2-uk | 1.12.14.16 | 1 | 0.297 | 137.50.19.1 | 786 | Aberdeen |
| 1460518 | abz2-uk | 1.12.14.16 | 2 | 0.499 | 137.50.0.33 | 786 | Aberdeen |
| 1460519 | abz2-uk | 1.12.14.16 | 3 | 0.508 | 137.50.255.225 | 786 | Aberdeen |
| 1460520 | abz2-uk | 1.12.14.16 | 4 | 0.375 | 137.50.255.150 | 786 | Aberdeen |
| 1460521 | abz2-uk | 1.12.14.16 | 5 | 0.537 | 146.97.128.9 | 786 | <NA> |
| 1460522 | abz2-uk | 1.12.14.16 | 6 | 3.936 | 146.97.37.173 | 786 | Glasgow |
| 1460523 | abz2-uk | 1.12.14.16 | 7 | 8.112 | 146.97.33.53 | 786 | Manchester |
| 1460524 | abz2-uk | 1.12.14.16 | 8 | 26.298 | 146.97.33.41 | 786 | Luton |
| 1460525 | abz2-uk | 1.12.14.16 | 9 | 13.954 | 146.97.33.21 | 786 | London |
| 1460526 | abz2-uk | 1.12.14.16 | 10 | 14.496 | 146.97.33.1 | 786 | London |
| 1460527 | abz2-uk | 1.12.14.16 | 11 | 14.643 | 83.97.89.1 | 21320 | London |
| 1460528 | abz2-uk | 1.12.14.16 | 13 | 217.812 | 202.97.97.5 | 4134 | Shanghai |
| 1460529 | abz2-uk | 1.12.14.16 | 14 | 219.227 | 202.97.50.194 | 4134 | Shanghai |
| 1460530 | abz2-uk | 1.12.14.16 | 15 | 191.071 | 202.97.94.238 | 4134 | Shanghai |
| 1460531 | abz2-uk | 1.12.14.16 | 16 | 203.739 | 101.95.224.226 | 4812 | Shanghai |
| 1460532 | abz2-uk | 1.12.14.16 | 18 | 226.057 | 101.226.210.70 | 4812 | Shanghai |
| 1460533 | abz2-uk | 1.12.14.16 | 23 | 227.985 | 1.12.14.16 | 132203_45090 | Guangzhou |
In [82]:
# GCD anycast, TR unicast
merged_df[(merged_df['TR'] == 1) & (merged_df['GCD'] > 1)]
Out[82]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute | |
|---|---|---|---|---|---|---|
| 9 | 100.0.0.0/24 | 1 | 2 | 1 | 701 | True |
| 28 | 103.138.240.0/24 | 1 | 2 | 1 | 45177 | True |
| 35 | 103.144.246.0/24 | 1 | 2 | 1 | 138152 | True |
| 39 | 103.156.106.0/24 | 1 | 2 | 2 | 141202 | True |
| 42 | 103.158.186.0/24 | 1 | 2 | 2 | 18041 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 13579 | 95.181.177.0/24 | 1 | 2 | 4 | 210756 | True |
| 13580 | 95.181.180.0/24 | 1 | 3 | 4 | 210756 | True |
| 13581 | 95.181.181.0/24 | 1 | 2 | 4 | 210756 | True |
| 13582 | 95.181.182.0/24 | 1 | 3 | 4 | 210756 | True |
| 13622 | 98.143.88.0/24 | 1 | 2 | 2 | 19171 | True |
204 rows × 6 columns
In [83]:
# TR anycast, GCD unicast
merged_df[(merged_df['TR'] > 1) & (merged_df['GCD'] == 1)]
Out[83]:
| prefix | TR | GCD | MAnycastICMPv4 | anycast_asn | has_traceroute |
|---|
analyze 'hypergiants'¶
In [84]:
# get largest ASes originating prefixes and the mean TR and GCD counts
merged_df.groupby('anycast_asn').agg(
prefix_count = ('prefix', 'nunique'),
TR_mean = ('TR', 'mean'),
GCD_mean = ('GCD', 'mean')
).reset_index()
Out[84]:
| anycast_asn | prefix_count | TR_mean | GCD_mean | |
|---|---|---|---|---|
| 0 | - | 2 | 0.5 | 4.0 |
| 1 | 10029 | 1 | 2.0 | 2.0 |
| 2 | 1008 | 1 | 3.0 | 3.0 |
| 3 | 10122 | 2 | 3.5 | 3.5 |
| 4 | 10143 | 1 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... |
| 906 | 9507 | 1 | 2.0 | 2.0 |
| 907 | 9559 | 1 | 2.0 | 2.0 |
| 908 | 9678 | 1 | 3.0 | 3.0 |
| 909 | 9790 | 4 | 2.0 | 2.0 |
| 910 | 9829 | 1 | 2.0 | 2.0 |
911 rows × 4 columns
In [85]:
# why does traceroute perform worse for Amazon?
amazon = merged_df[merged_df['anycast_asn'] == '16509']
amazon_traceroutes = traceroutes_df[traceroutes_df['prefix'].isin(amazon['prefix'])]
# what is the average number of VPs that completed a traceroute towards amazon targets
amazon_traceroutes.groupby('dst')['tx_hostname'].nunique().mean() # A: far below average number of VPs with completed traceroutes
Out[85]:
132.76661631419938
In [86]:
# figure out clusters
merged_df[(merged_df['TR'] > 47) & (merged_df['TR'] < 51)].groupby('anycast_asn')['prefix'].nunique() # TR google cloud
Out[86]:
anycast_asn 13335 8 139070 5 15169 122 16509 2 19281 1 29997 2 394089 4 396982 3265 54113 17 8068 2 8075 2 Name: prefix, dtype: int64
In [87]:
merged_df[(merged_df['TR'] > 25) & (merged_df['TR'] < 45)].groupby('anycast_asn')['prefix'].nunique() # TR Amazon
Out[87]:
anycast_asn
10880 3
112 1
1149 2
13335 8
15169 5
..
6939 1
714 5
7280 5
8075 11
8674 2
Name: prefix, Length: 62, dtype: int64
In [88]:
merged_df[(merged_df['GCD'] > 44) & (merged_df['GCD'] < 51)].groupby('anycast_asn')['prefix'].nunique() # GCD Amazon
Out[88]:
anycast_asn 13335 13 16509 1302 19281 1 29997 4 396982 27 54113 51 8068 6 8075 4 Name: prefix, dtype: int64
In [89]:
merged_df[(merged_df['GCD'] > 40) & (merged_df['GCD'] < 45)].groupby('anycast_asn')['prefix'].nunique() # GCD Google
Out[89]:
anycast_asn 13335 18 139070 9 15169 284 19551 1 199524 3 21342 22 34939 1 394089 5 396982 3871 54113 13 8068 2 8075 7 Name: prefix, dtype: int64
In [90]:
merged_df[(merged_df['GCD'] > 60)].groupby('anycast_asn')['prefix'].nunique() # GCD CF
Out[90]:
anycast_asn 10886 1 11045 1 13335 2722 14197 4 147008 1 152594 1 152899 1 19254 8 19281 2 1958 1 1975 1 199535 1 201188 1 201839 1 2047 2 206998 1 208140 1 209242 301 209299 1 214618 1 21556 1 21787 1 23145 1 23467 2 2559 1 273584 1 27515 1 27647 5 29085 1 29729 1 30219 1 30811 1 328486 1 328608 1 33191 1 3557 1 397273 1 398291 1 398787 60 399077 4 399358 2 399566 2 399678 1 400797 1 40680 1 42 13 4749 6 4892 1 50756 1 53595 1 55532 3 62659 2 9327 3 Name: prefix, dtype: int64
In [91]:
merged_df[(merged_df['TR'] > 65)].groupby('anycast_asn')['prefix'].nunique() # TR CF
Out[91]:
anycast_asn 10886 1 11045 1 13335 2956 14197 4 147008 1 152594 1 152899 1 19254 8 19281 3 1958 1 1975 1 199535 1 201188 1 201839 1 2047 2 206998 1 208140 1 209242 301 209299 1 214618 1 21556 1 21787 1 23145 1 23467 2 2559 1 273584 1 27515 1 27647 5 29085 1 29729 1 30219 1 30811 1 328486 1 328608 1 33191 1 3557 1 397273 1 398291 1 398787 60 399077 4 399358 2 399566 2 399678 1 400797 1 40680 1 42 22 4749 6 4892 1 50756 1 53595 1 55532 3 62659 2 9327 2 Name: prefix, dtype: int64
In [123]:
igreedy_values = census['iGreedyICMPv4'].values
igreedy_values_sorted = np.sort(igreedy_values)
# Calculate CDF y-values
n_igreedy = len(igreedy_values_sorted)
y_cdf_igreedy = np.arange(1, n_igreedy + 1) / n_igreedy
print(f"Prepared {n_igreedy} iGreedyICMPv4 values (>1).")
city_counts_filtered = city_counts_per_dst[city_counts_per_dst > 1]
# Extract, sort enumeration values
city_count_values = city_counts_filtered.values
city_counts_sorted = np.sort(city_count_values)
# Calculate CDF values
n_city = len(city_counts_sorted)
y_cdf_city = np.arange(1, n_city + 1) / n_city
print(f"Prepared {n_city} city count values for {len(city_counts_per_dst)} destinations.")
print("\nPlotting combined CDFs...")
plt.figure(figsize=(12, 7))
# Plot GCD CDF
plt.plot(igreedy_values_sorted, y_cdf_igreedy,
marker='.', linestyle='none', label='iGreedy', markersize=4)
# Plot TR CDF
plt.plot(city_counts_sorted, y_cdf_city,
marker='x', linestyle='none', label='Traceroute', markersize=5)
plt.ylabel('Cumulative Fraction')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.ylim(0, 1.05)
all_x_values = np.concatenate((igreedy_values_sorted, city_counts_sorted))
max_x = np.max(all_x_values)
min_x_positive = all_x_values[all_x_values > 0].min() if np.any(all_x_values > 0) else 1
plt.rcParams.update({'font.size': 20})
plt.xlabel('Number of PoPs detected per prefix')
plt.xlim(left=0)
plt.xticks(np.arange(0, 90 + 1, 10))
plt.legend(markerscale=3)
plt.tight_layout()
plt.grid(True)
ellipse = patches.Ellipse((49, 0.55), width=5, height=0.4, # TR Google Cloud
angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)
ellipse = patches.Ellipse((35, 0.35), width=8, height=0.2, # TR Amazon
angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)
ellipse = patches.Ellipse((49, 0.7), width=4, height=0.2, # GCD Amazon
angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)
ellipse = patches.Ellipse((43, 0.5), width=4, height=0.4, # GCD Google
angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)
ellipse = patches.Ellipse((67, 0.9), width=18, height=0.3, # GCD Cloudflare
angle=0, color='black', fill=False)
plt.gca().add_patch(ellipse)
ellipse = patches.Ellipse((77, 0.9), width=18, height=0.3, # TR Cloudflare
angle=0, color='red', fill=False)
plt.gca().add_patch(ellipse)
plt.annotate('Google Cloud (GCD)', xy=(43, 0.5), xytext=(20, 0.8),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate('Amazon (GCD)', xy=(49, 0.7), xytext=(40, 0.9),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate('Google Cloud (TR)', xy=(49, 0.55), xytext=(44, 0.2),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate('Amazon (TR)', xy=(35, 0.35), xytext=(30, 0.1),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate('Cloudflare (TR)', xy=(77, 0.9), xytext=(70, 0.4),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate('Cloudflare (GCD)', xy=(67, 0.9), xytext=(55, 0.6),
arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.savefig("./plots/enumeration_annotated.pdf", bbox_inches='tight', pad_inches=0)
plt.show()
print("\nDescriptive Statistics for iGreedy:")
print(pd.Series(igreedy_values_sorted).describe())
print("\nDescriptive Statistics for traceroute:")
print(pd.Series(city_counts_sorted).describe())
Prepared 13765 iGreedyICMPv4 values (>1). Prepared 13478 city count values for 13682 destinations. Plotting combined CDFs...
Descriptive Statistics for iGreedy: count 13765.000000 mean 38.650999 std 22.739557 min 2.000000 25% 15.000000 50% 43.000000 75% 58.000000 max 75.000000 dtype: float64 Descriptive Statistics for traceroute: count 13478.000000 mean 42.829797 std 25.399726 min 2.000000 25% 20.000000 50% 48.000000 75% 66.000000 max 87.000000 dtype: float64
run analysis w/o latency neighbor constraints toward root letters¶
In [93]:
# get results for root-letters, without using latency neighbors
roots = [
'198.41.0.4',
'170.247.170.2',
'192.33.4.12',
'199.7.91.13',
'192.203.230.10',
'192.5.5.0',
#'192.112.36.4', # g-root unresponsive to ICMP
'198.97.190.53',
'192.36.148.17',
'192.58.128.30',
'193.0.14.129',
'199.7.83.42',
'202.12.27.33']
In [94]:
traceroutes_roots = traceroutes_df[traceroutes_df['dst'].isin(roots)]
traceroutes_roots
Out[94]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1095 | aep3-ar | 199.7.91.13 | 10 | 16 | 192.168.1.1 | 1 | 0.497 | 64 | NaN | NaN | ... | 7303 | aep | 13.934 | 50.782762 | 1423.756554 | - | <NA> | NaN | NaN | False |
| 1096 | aep3-ar | 199.7.91.13 | 10 | 16 | 192.168.0.1 | 2 | 1.201 | 63 | NaN | NaN | ... | 7303 | aep | 13.934 | 122.716494 | 1423.756554 | - | <NA> | NaN | NaN | False |
| 1097 | aep3-ar | 199.7.91.13 | 10 | 16 | 181.96.62.70 | 7 | 13.194 | 249 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | ... | 7303 | aep | 13.934 | 1348.144393 | 1423.756554 | 7303 | Buenos Aires | -34.613152 | -58.377232 | True |
| 1098 | aep3-ar | 199.7.91.13 | 10 | 16 | 45.68.8.12 | 8 | 8.703 | 248 | NaN | NaN | ... | 7303 | aep | 13.934 | 889.260319 | 1423.756554 | - | Buenos Aires | -34.613152 | -58.377232 | False |
| 1099 | aep3-ar | 199.7.91.13 | 10 | 16 | 200.115.95.118 | 9 | 11.412 | 56 | NaN | NaN | ... | 7303 | aep | 13.934 | 1166.062135 | 1423.756554 | 52376 | Buenos Aires | -34.613152 | -58.377232 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27244876 | oak5-us | 192.203.230.10 | 9 | 12 | 142.254.59.146 | 3 | 8.355 | 253 | 142.254.59.146 | 142-254-59-146.static.sonic.net | ... | 46375 | oak | 3.077 | 853.702168 | 314.403539 | 46375 | San Jose | 37.339390 | -121.894958 | False |
| 27244877 | oak5-us | 192.203.230.10 | 9 | 12 | 157.131.211.153 | 4 | 19.666 | 252 | 157.131.211.153 | ae6.cr2.hywrca01.sonic.net | ... | 46375 | oak | 3.077 | 2009.444264 | 314.403539 | 46375 | San Carlos | 37.507160 | -122.260521 | False |
| 27244878 | oak5-us | 192.203.230.10 | 9 | 12 | 75.101.33.185 | 7 | 2.885 | 249 | 75.101.33.185 | 100.ae1.nrd1.equinix-sj.sonic.net | ... | 46375 | oak | 3.077 | 294.785249 | 314.403539 | 46375 | San Jose | 37.339390 | -121.894958 | True |
| 27244879 | oak5-us | 192.203.230.10 | 9 | 12 | 206.223.117.75 | 8 | 2.830 | 57 | 206.223.117.75 | equinix-sjc.woodynet.net | ... | 46375 | oak | 3.077 | 289.165426 | 314.403539 | - | San Jose | 37.362598 | -121.929001 | True |
| 27244880 | oak5-us | 192.203.230.10 | 9 | 12 | 192.203.230.10 | 9 | 3.077 | 56 | 192.203.230.10 | e.root-servers.net | ... | 46375 | oak | 3.077 | 314.403539 | 314.403539 | 21556 | <NA> | NaN | NaN | True |
24759 rows × 24 columns
In [95]:
traceroute_roots_phops = traceroutes_roots[traceroutes_roots['dst'] != traceroutes_roots['hop_addr']].sort_values(by='hop_probe_ttl', ascending=False)\
.drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
traceroute_roots_phops = traceroute_roots_phops[~traceroute_roots_phops['hop_city'].isna()] # remove NaN phops (bogons)
traceroute_roots_phops
Out[95]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 812326 | hnl4-us | 192.58.128.30 | 25 | 30 | 178.172.80.1 | 24 | 222.748 | 227 | NaN | NaN | ... | 20473 | hnl | 222.398 | 22760.077851 | 22724.315343 | 2107 | Ljubljana | 46.051079 | 14.505130 | True |
| 19314852 | puw-us | 170.247.170.2 | 24 | 26 | 62.115.139.35 | 22 | 111.478 | 232 | 62.115.139.35 | rest-bb1-link.ip.twelve99.net | ... | 20055 | puw | 88.403 | 11390.665500 | 9032.894402 | 1299 | Washington | 38.895111 | -77.036369 | False |
| 17926032 | gye-ec | 192.58.128.30 | 23 | 28 | 178.172.80.1 | 22 | 201.306 | 251 | NaN | NaN | ... | 61468 | gye | 201.245 | 20569.164401 | 20562.931506 | 2107 | Ljubljana | 46.051079 | 14.505130 | True |
| 16375275 | bed-us | 198.97.190.53 | 25 | 31 | 96.110.36.90 | 21 | 48.696 | 247 | 96.110.36.90 | be-3301-pe01.nota.fl.ibone.comcast.net | ... | 7922 | bed | 45.818 | 4975.688900 | 4681.618901 | 7922 | Miami | 25.774269 | -80.193657 | True |
| 12194477 | ens-nl | 192.58.128.30 | 22 | 23 | 140.189.8.9 | 21 | 125.316 | 238 | 140.189.8.9 | r-chippewatc-hub-2-ae3.ip4.wiscnet.net | ... | 1133 | ens | 119.394 | 12804.612908 | 12199.511264 | 2381 | Plum City | 44.629131 | -92.192398 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25260722 | pry-za | 193.0.14.129 | 2 | 2 | 196.216.3.3 | 1 | 0.617 | 255 | 196.216.3.3 | vl250.ar01.jnb.afrinic.net | ... | 33764 | pry | 1.052 | 63.044194 | 107.491883 | 33764 | Johannesburg | -26.202271 | 28.043631 | True |
| 1576510 | beg-rs | 193.0.14.129 | 2 | 2 | 193.105.163.133 | 1 | 0.567 | 255 | 193.105.163.133 | 133.sox.rs | ... | 13004 | beg | 0.403 | 57.935264 | 41.177974 | 13004 | Belgrade | 44.804008 | 20.465130 | True |
| 1581976 | beg-rs | 199.7.83.42 | 4 | 6 | 193.105.163.133 | 1 | 0.586 | 255 | 193.105.163.133 | 133.sox.rs | ... | 13004 | beg | 24.948 | 59.876657 | 2549.151607 | 13004 | Belgrade | 44.804008 | 20.465130 | False |
| 18800163 | dac-bd | 192.58.128.30 | 2 | 2 | 202.59.208.1 | 1 | 0.057 | 64 | NaN | NaN | ... | 24122 | dac | 0.107 | 5.824180 | 10.933110 | 9825 | Dhaka | 23.710400 | 90.407440 | True |
| 20089856 | sql-us | 170.247.170.2 | 4 | 6 | 192.158.252.3 | 1 | 0.222 | 255 | 192.158.252.3 | r2.pao2-guests.isc.org | ... | 1280 | sql | 56.526 | 22.683648 | 5775.747305 | 1280 | Palo Alto | 37.441879 | -122.143021 | False |
2990 rows × 24 columns
In [96]:
# get inferred locations radian coordinates
traceroute_roots_phops['hop_lat'] = pd.to_numeric(traceroute_roots_phops['hop_lat'], errors='coerce')
traceroute_roots_phops['hop_lon'] = pd.to_numeric(traceroute_roots_phops['hop_lon'], errors='coerce')
traceroute_roots_phops['hop_lat_rad'] = np.radians(traceroute_roots_phops['hop_lat'])
traceroute_roots_phops['hop_lon_rad'] = np.radians(traceroute_roots_phops['hop_lon'])
inferred_coords_rad_roots = traceroute_roots_phops[['hop_lat_rad', 'hop_lon_rad']].values
In [97]:
np.isnan(inferred_coords_rad_roots).any(axis=1)
Out[97]:
array([False, False, False, ..., False, False, False])
In [98]:
# querry tree for root letters
nearby_indices_list, nearby_distances_list_rad = tree.query_radius(
inferred_coords_rad_roots, r=DISK_DISTANCE_RAD, return_distance=True
)
nearby_distances_list_km = [dist_rad * EARTH_RADIUS_KM for dist_rad in nearby_distances_list_rad]
# fallback airport (closest airport)
start_time = time.time()
closest_dist_rad, closest_idx = tree.query(inferred_coords_rad_roots, k=1)
closest_indices = closest_idx[:, 0]
print(f"BallTree query (k=1) took {time.time() - start_time:.2f}s")
BallTree query (k=1) took 0.03s
In [99]:
# get inferred locations
results = []
print("\n--- Calculating heuristic for nearby airports ---")
start_time = time.time()
for i in range(len(inferred_coords_rad_roots)): # iterate through all final latency neighbors
# retrieve the airports within the radius, and their distance from the latency neighbor
nearby_indices = nearby_indices_list[i]
nearby_distances_km = nearby_distances_list_km[i]
# retrieve fallback airport for this point
fallback_idx = closest_indices[i]
fallback_airport_details = {
'lat': airport_lat[fallback_idx],
'lon': airport_lon[fallback_idx],
'name': airport_name[fallback_idx],
'iata': airport_iata[fallback_idx]
}
# heuristic calculation
best_p = -1.0
best_idx_in_disk = -1 # -1 -> no suitable airport found with heuristic
if len(nearby_indices) > 0:
# Filter again based on exact distance (though query_radius should be accurate)
# This ensures we strictly adhere to DISK_DISTANCE_KM
within_disk_mask = nearby_distances_km < DISK_DISTANCE_KM
candidate_indices = nearby_indices[within_disk_mask]
if len(candidate_indices) > 0:
candidate_distances = nearby_distances_km[within_disk_mask]
candidate_heuristics = airport_heuristics[candidate_indices]
# Calculate totals for the denominator using ONLY candidates within the disk
total_heuristic = np.sum(candidate_heuristics)
# Sum of (Disk Distance - Actual Distance) for candidates
total_inverse_distance_weighted = np.sum(DISK_DISTANCE_KM - candidate_distances)
# Check denominators are valid AND non-zero
if total_heuristic > 0 and total_inverse_distance_weighted > 1e-9: # Use small epsilon for float comparison
# Calculate probability p for each candidate
p_heuristic = ALPHA * (candidate_heuristics / total_heuristic)
p_distance = (1 - ALPHA) * ((DISK_DISTANCE_KM - candidate_distances) / total_inverse_distance_weighted)
p = p_heuristic + p_distance
# Find the index *among candidates* with the highest p
best_candidate_local_idx = np.argmax(p)
# Map back to the original airport_df index
best_idx_in_disk = candidate_indices[best_candidate_local_idx]
if best_idx_in_disk != -1:
# return best airport based on heuristic
results.append({
'lat': airport_lat[best_idx_in_disk],
'lon': airport_lon[best_idx_in_disk],
'name': airport_name[best_idx_in_disk],
'iata': airport_iata[best_idx_in_disk]
})
else:
# no airport in disk, use the closest airport
results.append(fallback_airport_details)
print(f"Heuristic calculation loop took {time.time() - start_time:.2f}s")
print("\n--- Assembling final results ---")
# join results with inferred_df
final_roots = traceroute_roots_phops.join(pd.DataFrame(results, index=traceroute_roots_phops.index))
final_roots.rename(columns={'lat': 'inferred_airport_lat',
'lon': 'inferred_airport_lon',
'name': 'inferred_airport_name',
'iata': 'inferred_airport_iata'}, inplace=True)
final_roots
--- Calculating heuristic for nearby airports --- Heuristic calculation loop took 0.06s --- Assembling final results ---
Out[99]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_city | hop_lat | hop_lon | is_neighbor | hop_lat_rad | hop_lon_rad | inferred_airport_lat | inferred_airport_lon | inferred_airport_name | inferred_airport_iata | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 812326 | hnl4-us | 192.58.128.30 | 25 | 30 | 178.172.80.1 | 24 | 222.748 | 227 | NaN | NaN | ... | Ljubljana | 46.051079 | 14.505130 | True | 0.803743 | 0.253162 | 46.223701 | 14.457600 | Ljubljana Jože Pučnik Airport | LJU |
| 19314852 | puw-us | 170.247.170.2 | 24 | 26 | 62.115.139.35 | 22 | 111.478 | 232 | 62.115.139.35 | rest-bb1-link.ip.twelve99.net | ... | Washington | 38.895111 | -77.036369 | False | 0.678848 | -1.344538 | 39.175400 | -76.668297 | Baltimore/Washington International Thurgood Marshall Airport | BWI |
| 17926032 | gye-ec | 192.58.128.30 | 23 | 28 | 178.172.80.1 | 22 | 201.306 | 251 | NaN | NaN | ... | Ljubljana | 46.051079 | 14.505130 | True | 0.803743 | 0.253162 | 46.223701 | 14.457600 | Ljubljana Jože Pučnik Airport | LJU |
| 16375275 | bed-us | 198.97.190.53 | 25 | 31 | 96.110.36.90 | 21 | 48.696 | 247 | 96.110.36.90 | be-3301-pe01.nota.fl.ibone.comcast.net | ... | Miami | 25.774269 | -80.193657 | True | 0.449846 | -1.399643 | 25.793200 | -80.290604 | Miami | NAP |
| 12194477 | ens-nl | 192.58.128.30 | 22 | 23 | 140.189.8.9 | 21 | 125.316 | 238 | 140.189.8.9 | r-chippewatc-hub-2-ae3.ip4.wiscnet.net | ... | Plum City | 44.629131 | -92.192398 | False | 0.778925 | -1.609061 | 44.882000 | -93.221802 | Minneapolis-St Paul International/Wold-Chamberlain Airport | MSP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25260722 | pry-za | 193.0.14.129 | 2 | 2 | 196.216.3.3 | 1 | 0.617 | 255 | 196.216.3.3 | vl250.ar01.jnb.afrinic.net | ... | Johannesburg | -26.202271 | 28.043631 | True | -0.457316 | 0.489454 | -25.938499 | 27.926100 | Lanseria Airport | HLA |
| 1576510 | beg-rs | 193.0.14.129 | 2 | 2 | 193.105.163.133 | 1 | 0.567 | 255 | 193.105.163.133 | 133.sox.rs | ... | Belgrade | 44.804008 | 20.465130 | True | 0.781977 | 0.357184 | 44.818401 | 20.309099 | Belgrade Nikola Tesla Airport | BEG |
| 1581976 | beg-rs | 199.7.83.42 | 4 | 6 | 193.105.163.133 | 1 | 0.586 | 255 | 193.105.163.133 | 133.sox.rs | ... | Belgrade | 44.804008 | 20.465130 | False | 0.781977 | 0.357184 | 44.818401 | 20.309099 | Belgrade Nikola Tesla Airport | BEG |
| 18800163 | dac-bd | 192.58.128.30 | 2 | 2 | 202.59.208.1 | 1 | 0.057 | 64 | NaN | NaN | ... | Dhaka | 23.710400 | 90.407440 | True | 0.413825 | 1.577907 | 23.843347 | 90.397783 | Dhaka / Hazrat Shahjalal International Airport | DAC |
| 20089856 | sql-us | 170.247.170.2 | 4 | 6 | 192.158.252.3 | 1 | 0.222 | 255 | 192.158.252.3 | r2.pao2-guests.isc.org | ... | Palo Alto | 37.441879 | -122.143021 | False | 0.653484 | -2.131798 | 37.362598 | -121.929001 | Norman Y. Mineta San Jose International Airport | SJC |
2990 rows × 30 columns
In [100]:
final_roots.groupby('dst')['inferred_airport_iata'].nunique() # number of sites found per root-letter (without latency constraints)
Out[100]:
dst 170.247.170.2 13 192.203.230.10 95 192.33.4.12 13 192.36.148.17 47 192.5.5.0 82 192.58.128.30 63 193.0.14.129 51 198.41.0.4 17 198.97.190.53 30 199.7.83.42 52 199.7.91.13 75 202.12.27.33 17 Name: inferred_airport_iata, dtype: int64
In [101]:
# get average enumeration increase
count_differences = []
print("--- Calculating Counts per Destination ---")
# --- Loop and Calculate ---
for letter in roots:
# Calculate p-hop count for the current dst
p_hop_count = final_roots[final_roots['dst'] == letter]['inferred_airport_iata'].nunique()
# Calculate latency neighbor count for the current dst
latency_count = final_df[final_df['dst'] == letter]['inferred_airport_iata'].nunique()
# Calculate the difference (increase = p-hop - latency)
difference = p_hop_count - latency_count
count_differences.append(difference)
print("\n--- Average Calculation ---")
average_increase = sum(count_differences) / len(count_differences)
print(f"Total number of destinations processed: {len(count_differences)}")
print(f"Average increase (p-hop unique airports - latency unique airports): {average_increase:.4f}")
--- Calculating Counts per Destination --- --- Average Calculation --- Total number of destinations processed: 12 Average increase (p-hop unique airports - latency unique airports): 11.9167
In [102]:
# compare number of sites found for both TR techniques, per root-letter
for letter in roots:
print(letter)
print("p-hop count")
print(final_roots[final_roots['dst'] == letter]['inferred_airport_iata'].nunique())
print("latency neighbor count")
print(final_df[final_df['dst'] == letter]['inferred_airport_iata'].nunique())
print('\n')
198.41.0.4 p-hop count 17 latency neighbor count 15 170.247.170.2 p-hop count 13 latency neighbor count 7 192.33.4.12 p-hop count 13 latency neighbor count 9 199.7.91.13 p-hop count 75 latency neighbor count 69 192.203.230.10 p-hop count 95 latency neighbor count 85 192.5.5.0 p-hop count 82 latency neighbor count 75 198.97.190.53 p-hop count 30 latency neighbor count 13 192.36.148.17 p-hop count 47 latency neighbor count 31 192.58.128.30 p-hop count 63 latency neighbor count 41 193.0.14.129 p-hop count 51 latency neighbor count 23 199.7.83.42 p-hop count 52 latency neighbor count 32 202.12.27.33 p-hop count 17 latency neighbor count 12
run analysis with all phops¶
In [104]:
# get p-hops
p_hops = traceroutes_df[traceroutes_df['dst'] != traceroutes_df['hop_addr']].sort_values(by='hop_probe_ttl', ascending=False)\
.drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
# get the highest TTL hop that is not the destination for all VP, PoP pairs
p_hops.head()
Out[104]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19208202 | puw-us | 23.11.40.7 | 47 | 60 | 23.203.145.249 | 43 | 289.470 | 219 | 23.203.145.249 | ae34.r03.border101.sea01.fab.netarch.akamai.com | ... | 20055 | puw | 151.139 | 29577.638119 | 15443.170787 | 20940 | Seattle | 47.449001 | -122.308998 | False |
| 3886303 | hkg5-cn | 23.11.40.7 | 44 | 56 | 23.40.189.1 | 43 | 453.065 | 28 | 23.40.189.1 | vlan100.r01.tor01.hkg01.fab.netarch.akamai.com | ... | 9304 | hkg | 230.783 | 46293.545494 | 23581.082868 | 20940 | Hong Kong | 22.320304 | 114.198074 | False |
| 16388104 | bed-us | 23.11.38.17 | 39 | 52 | 23.198.9.65 | 38 | 340.269 | 31 | 23.198.9.65 | vlan100.r05.tor01.bos01.fab.netarch.akamai.com | ... | 7922 | bed | 172.131 | 34768.208605 | 17588.103869 | 20940 | Boston | 42.364300 | -71.005203 | False |
| 2510835 | ind-us | 23.11.38.17 | 38 | 49 | 154.24.97.54 | 37 | 462.866 | 223 | NaN | NaN | ... | 14333 | ind | 229.942 | 47294.997911 | 23495.150669 | 174 | Indianapolis | 39.768379 | -86.158043 | False |
| 19661446 | cpv-br | 23.11.38.17 | 40 | 52 | 23.32.63.57 | 37 | 590.487 | 225 | 23.32.63.57 | ae4.r01.gru01.icn.netarch.akamai.com | ... | 1916 | cpv | 327.031 | 60335.132483 | 33415.568354 | 20940 | Sao Paulo | -23.435556 | -46.473057 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17064520 | acc-gh | 72.42.124.1 | 3 | 4 | 196.49.14.1 | 1 | 0.341 | 255 | NaN | NaN | ... | 30997 | acc | 0.406 | 34.842901 | 41.484510 | 30997 | Accra | 5.556020 | -0.196900 | True |
| 17064522 | acc-gh | 72.42.125.1 | 3 | 4 | 196.49.14.1 | 1 | 0.335 | 255 | NaN | NaN | ... | 30997 | acc | 0.320 | 34.229830 | 32.697151 | 30997 | Accra | 5.556020 | -0.196900 | True |
| 17064524 | acc-gh | 72.42.126.1 | 3 | 4 | 196.49.14.1 | 1 | 0.330 | 255 | NaN | NaN | ... | 30997 | acc | 0.368 | 33.718937 | 37.601723 | 30997 | Accra | 5.556020 | -0.196900 | True |
| 17064526 | acc-gh | 72.42.127.1 | 3 | 4 | 196.49.14.1 | 1 | 0.326 | 255 | NaN | NaN | ... | 30997 | acc | 0.331 | 33.310222 | 33.821115 | 30997 | Accra | 5.556020 | -0.196900 | True |
| 16999300 | acc-gh | 23.0.0.1 | 2 | 2 | 196.49.14.1 | 1 | 0.349 | 255 | NaN | NaN | ... | 30997 | acc | 16.481 | 35.660330 | 1684.005437 | 30997 | Accra | 5.556020 | -0.196900 | False |
3350019 rows × 24 columns
In [105]:
# find the nearest Airport for each inferred location
def merge_nearest_airport(inferred_locs_df, locs_df):
"""
Finds the nearest airport from final_df for each hop in inferred_df
and merges airport details and distance.
"""
# Drop airports with missing coordinates and reset index to keep track
locs_df['inferred_airport_lat'] = pd.to_numeric(locs_df['inferred_airport_lat'], errors='coerce')
locs_df['inferred_airport_lon'] = pd.to_numeric(locs_df['inferred_airport_lon'], errors='coerce')
airports_valid = locs_df.dropna(subset=['inferred_airport_lat', 'inferred_airport_lon']).copy()
# Convert valid airport coordinates to radians for BallTree
airports_valid['lat_rad'] = np.radians(airports_valid['inferred_airport_lat'])
airports_valid['lon_rad'] = np.radians(airports_valid['inferred_airport_lon'])
airport_coords_rad = airports_valid[['lat_rad', 'lon_rad']].values
# Identify hops with valid coordinates
valid_hop_mask = inferred_locs_df['hop_lat'].notna() & inferred_locs_df['hop_lon'].notna()
valid_hops = inferred_locs_df.loc[valid_hop_mask].copy()
# Convert valid hop coordinates to radians
valid_hops['hop_lat'] = pd.to_numeric(valid_hops['hop_lat'], errors='coerce')
valid_hops['hop_lon'] = pd.to_numeric(valid_hops['hop_lon'], errors='coerce')
valid_hops['lat_rad'] = np.radians(valid_hops['hop_lat'])
valid_hops['lon_rad'] = np.radians(valid_hops['hop_lon'])
hop_coords_rad = valid_hops[['lat_rad', 'lon_rad']].values
# Build tree on airport coordinates
tree = BallTree(airport_coords_rad, metric='haversine')
# Query the tree for the nearest airport (k=1) for each valid hop
# Returns distances in radians and indices *relative to airport_coords_rad*
distances_rad, indices_in_tree = tree.query(hop_coords_rad, k=1)
# Get the original indices from final_df corresponding to the nearest airports
nearest_airport_original_indices = airports_valid.index[indices_in_tree.flatten()]
# Calculate distances in kilometers
distances_km = distances_rad.flatten() * EARTH_RADIUS_KM
# Create a DataFrame with results, indexed by the original index of valid_hops
results_df = pd.DataFrame({
'nearest_airport_original_idx': nearest_airport_original_indices,
'distance_to_nearest_airport_km': distances_km
}, index=valid_hops.index) # Use the original index from inferred_df
# Create the new columns in the original inferred_df, initially filled with NaN
for col in ['nearest_airport_lat', 'nearest_airport_lon', 'nearest_airport_name',
'nearest_airport_iata', 'distance_to_nearest_airport_km']:
inferred_locs_df[col] = np.nan
# Fill distance for valid hops
inferred_locs_df.loc[results_df.index, 'distance_to_nearest_airport_km'] = results_df['distance_to_nearest_airport_km']
# Use the nearest_airport_original_idx to look up details in the original final_df
nearest_indices = results_df['nearest_airport_original_idx']
inferred_locs_df.loc[results_df.index, 'nearest_airport_lat'] = locs_df.loc[nearest_indices, 'inferred_airport_lat'].values
inferred_locs_df.loc[results_df.index, 'nearest_airport_lon'] = locs_df.loc[nearest_indices, 'inferred_airport_lon'].values
inferred_locs_df.loc[results_df.index, 'nearest_airport_name'] = locs_df.loc[nearest_indices, 'inferred_airport_name'].values
inferred_locs_df.loc[results_df.index, 'nearest_airport_iata'] = locs_df.loc[nearest_indices, 'inferred_airport_iata'].values
return inferred_locs_df
# takes some time
combined_df = merge_nearest_airport(p_hops, final_df)
combined_df.head()
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/3012844186.py:64: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['Boeing Field King County International Airport' 'Chek Lap Kok International Airport' 'General Edward Lawrence Logan International Airport' ... 'Kotoka International Airport' 'Kotoka International Airport' 'Kotoka International Airport']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first. inferred_locs_df.loc[results_df.index, 'nearest_airport_name'] = locs_df.loc[nearest_indices, 'inferred_airport_name'].values /var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_48014/3012844186.py:65: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['BFI' 'HKG' 'BOS' ... 'ACC' 'ACC' 'ACC']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first. inferred_locs_df.loc[results_df.index, 'nearest_airport_iata'] = locs_df.loc[nearest_indices, 'inferred_airport_iata'].values
Out[105]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19208202 | puw-us | 23.11.40.7 | 47 | 60 | 23.203.145.249 | 43 | 289.470 | 219 | 23.203.145.249 | ae34.r03.border101.sea01.fab.netarch.akamai.com | ... | 20940 | Seattle | 47.449001 | -122.308998 | False | 47.529999 | -122.302002 | Boeing Field King County International Airport | BFI | 9.021869 |
| 3886303 | hkg5-cn | 23.11.40.7 | 44 | 56 | 23.40.189.1 | 43 | 453.065 | 28 | 23.40.189.1 | vlan100.r01.tor01.hkg01.fab.netarch.akamai.com | ... | 20940 | Hong Kong | 22.320304 | 114.198074 | False | 22.308889 | 113.914722 | Chek Lap Kok International Airport | HKG | 29.175437 |
| 16388104 | bed-us | 23.11.38.17 | 39 | 52 | 23.198.9.65 | 38 | 340.269 | 31 | 23.198.9.65 | vlan100.r05.tor01.bos01.fab.netarch.akamai.com | ... | 20940 | Boston | 42.364300 | -71.005203 | False | 42.364300 | -71.005203 | General Edward Lawrence Logan International Airport | BOS | 0.000033 |
| 2510835 | ind-us | 23.11.38.17 | 38 | 49 | 154.24.97.54 | 37 | 462.866 | 223 | NaN | NaN | ... | 174 | Indianapolis | 39.768379 | -86.158043 | False | 39.717300 | -86.294403 | Indianapolis International Airport | IND | 12.968696 |
| 19661446 | cpv-br | 23.11.38.17 | 40 | 52 | 23.32.63.57 | 37 | 590.487 | 225 | 23.32.63.57 | ae4.r01.gru01.icn.netarch.akamai.com | ... | 20940 | Sao Paulo | -23.435556 | -46.473057 | False | -23.509100 | -46.637798 | Campo de Marte Airport | SAO | 18.686953 |
5 rows × 29 columns
In [106]:
combined_df['distance_to_nearest_airport_km'].mean() # 20km mean for valid latency neighbors, 26k mean for p-hops
Out[106]:
26.525433633860494
In [107]:
# bogon p-hops
combined_df[combined_df['hop_addr'].str.startswith('10.')]
Out[107]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10797560 | bkl-us | 44.31.110.0 | 34 | 37 | 10.64.1.186 | 32 | 72.655 | 52 | NaN | NaN | ... | - | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 343127 | bdl-us | 44.31.110.0 | 33 | 34 | 10.64.1.182 | 31 | 74.293 | 58 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| 3994211 | osl-no | 193.19.225.2 | 33 | 44 | 10.79.1.174 | 31 | 266.681 | 49 | NaN | NaN | ... | - | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 16824891 | lun-zm | 43.224.183.166 | 32 | 35 | 10.73.201.22 | 31 | 513.738 | 46 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| 6021659 | tlv3-il | 46.8.8.66 | 31 | 41 | 10.79.1.118 | 30 | 281.252 | 51 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16896268 | aep2-ar | 104.192.141.0 | 7 | 10 | 10.3.7.97 | 3 | 1.842 | 253 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| 6355310 | hyd-in | 209.131.101.100 | 6 | 10 | 10.130.32.1 | 2 | 0.761 | 63 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| 6401646 | hyd-in | 156.154.58.1 | 6 | 10 | 10.130.32.1 | 2 | 1.165 | 63 | NaN | NaN | ... | - | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 6407864 | hyd-in | 173.243.129.1 | 6 | 10 | 10.130.32.1 | 2 | 1.000 | 63 | NaN | NaN | ... | - | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 22004682 | sao-br | 194.0.14.1 | 3 | 3 | 10.210.200.250 | 2 | 0.384 | 63 | NaN | NaN | ... | - | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
32578 rows × 29 columns
In [108]:
# how many are NOT within 3 ms of the target
combined_df[~combined_df['is_neighbor']]
Out[108]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19208202 | puw-us | 23.11.40.7 | 47 | 60 | 23.203.145.249 | 43 | 289.470 | 219 | 23.203.145.249 | ae34.r03.border101.sea01.fab.netarch.akamai.com | ... | 20940 | Seattle | 47.449001 | -122.308998 | False | 47.529999 | -122.302002 | Boeing Field King County International Airport | BFI | 9.021869 |
| 3886303 | hkg5-cn | 23.11.40.7 | 44 | 56 | 23.40.189.1 | 43 | 453.065 | 28 | 23.40.189.1 | vlan100.r01.tor01.hkg01.fab.netarch.akamai.com | ... | 20940 | Hong Kong | 22.320304 | 114.198074 | False | 22.308889 | 113.914722 | Chek Lap Kok International Airport | HKG | 29.175437 |
| 16388104 | bed-us | 23.11.38.17 | 39 | 52 | 23.198.9.65 | 38 | 340.269 | 31 | 23.198.9.65 | vlan100.r05.tor01.bos01.fab.netarch.akamai.com | ... | 20940 | Boston | 42.364300 | -71.005203 | False | 42.364300 | -71.005203 | General Edward Lawrence Logan International Airport | BOS | 0.000033 |
| 2510835 | ind-us | 23.11.38.17 | 38 | 49 | 154.24.97.54 | 37 | 462.866 | 223 | NaN | NaN | ... | 174 | Indianapolis | 39.768379 | -86.158043 | False | 39.717300 | -86.294403 | Indianapolis International Airport | IND | 12.968696 |
| 19661446 | cpv-br | 23.11.38.17 | 40 | 52 | 23.32.63.57 | 37 | 590.487 | 225 | 23.32.63.57 | ae4.r01.gru01.icn.netarch.akamai.com | ... | 20940 | Sao Paulo | -23.435556 | -46.473057 | False | -23.509100 | -46.637798 | Campo de Marte Airport | SAO | 18.686953 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14509083 | ams5-nl | 199.232.205.0 | 3 | 4 | 213.46.228.254 | 1 | 7.781 | 255 | 213.46.228.254 | nl-ams05a-re1-fe-2-0.aorta.net | ... | 6830 | Amsterdam | 52.308601 | 4.763890 | False | 52.308601 | 4.763890 | Amsterdam Airport Schiphol | AMX | 0.000045 |
| 19007616 | scl3-cl | 34.34.193.1 | 5 | 8 | 200.0.206.73 | 1 | 1.199 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 19059509 | scl3-cl | 172.217.60.0 | 4 | 6 | 200.0.206.73 | 1 | 1.648 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 19045998 | scl3-cl | 136.124.0.150 | 5 | 8 | 200.0.206.73 | 1 | 1.131 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 16999300 | acc-gh | 23.0.0.1 | 2 | 2 | 196.49.14.1 | 1 | 0.349 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | False | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
760988 rows × 29 columns
In [109]:
# how many are within 3ms of the target
combined_df[combined_df['is_neighbor']]
Out[109]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16381107 | bed-us | 203.30.19.12 | 37 | 43 | 203.34.30.181 | 36 | 259.773 | 232 | 203.34.30.181 | xe-2-579-rt1.perma.zettagrid.com | ... | 7604 | Perth | -31.952240 | 115.861397 | True | -31.940300 | 115.967003 | Perth International Airport | PER | 10.052402 |
| 16367629 | bed-us | 103.79.5.1 | 37 | 41 | 103.197.232.191 | 36 | 232.999 | 233 | NaN | NaN | ... | 64098 | Sydney | -33.867851 | 151.207321 | True | -33.924400 | 150.988007 | Sydney Bankstown Airport | BWU | 21.196300 |
| 14059800 | mem-us | 91.237.67.0 | 37 | 39 | 87.44.56.138 | 36 | 279.795 | 244 | NaN | NaN | ... | 1213 | Dublin | 53.333061 | -6.248890 | True | 53.421299 | -6.270070 | Dublin Airport | DUB | 9.911669 |
| 16381076 | bed-us | 203.24.54.4 | 37 | 43 | 203.34.30.181 | 36 | 259.226 | 232 | 203.34.30.181 | xe-2-579-rt1.perma.zettagrid.com | ... | 7604 | Perth | -31.952240 | 115.861397 | True | -31.940300 | 115.967003 | Perth International Airport | PER | 10.052402 |
| 13329052 | lej-de | 103.79.5.1 | 36 | 47 | 103.197.232.191 | 35 | 311.270 | 217 | NaN | NaN | ... | 64098 | Sydney | -33.867851 | 151.207321 | True | -33.924400 | 150.988007 | Sydney Bankstown Airport | BWU | 21.196300 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17064518 | acc-gh | 72.42.123.1 | 3 | 4 | 196.49.14.1 | 1 | 0.342 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064520 | acc-gh | 72.42.124.1 | 3 | 4 | 196.49.14.1 | 1 | 0.341 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064522 | acc-gh | 72.42.125.1 | 3 | 4 | 196.49.14.1 | 1 | 0.335 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064524 | acc-gh | 72.42.126.1 | 3 | 4 | 196.49.14.1 | 1 | 0.330 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064526 | acc-gh | 72.42.127.1 | 3 | 4 | 196.49.14.1 | 1 | 0.326 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
2589031 rows × 29 columns
In [110]:
# how many are latency neighbor hops
combined_df[combined_df['is_neighbor'] & (combined_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY)]
Out[110]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24242602 | agb-de | 23.11.40.7 | 23 | 31 | 23.210.52.235 | 22 | 11.539 | 234 | 23.210.52.235 | ae3.decix-fra.netarch.akamai.com | ... | 20940 | Frankfurt am Main | 50.033333 | 8.570556 | True | 50.026402 | 8.543130 | Frankfurt am Main International Airport | FRA | 2.105217 |
| 2947846 | cjj-kr | 207.189.149.0 | 22 | 27 | 141.101.82.12 | 21 | 4.745 | 244 | NaN | NaN | ... | 13335 | Seoul | 37.566002 | 126.978401 | True | 37.558300 | 126.791000 | Gimpo International Airport | SEL | 16.540326 |
| 27203320 | oak5-us | 66.174.0.3 | 22 | 29 | 63.59.164.212 | 21 | 9.621 | 233 | NaN | NaN | ... | 22394 | San Jose | 37.339390 | -121.894958 | True | 37.362598 | -121.929001 | Norman Y. Mineta San Jose International Airport | SJC | 3.964158 |
| 11857756 | fra-de | 23.11.40.7 | 25 | 34 | 23.210.54.63 | 21 | 1.110 | 44 | 23.210.54.63 | ae34.r03.border101.fra03.fab.netarch.akamai.com | ... | 20940 | Frankfurt am Main | 50.033333 | 8.570556 | True | 50.026402 | 8.543130 | Frankfurt am Main International Airport | FRA | 2.105217 |
| 3005931 | cjj-kr | 43.159.115.11 | 22 | 29 | 30.37.127.233 | 20 | 4.694 | 244 | NaN | NaN | ... | 749 | <NA> | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17064518 | acc-gh | 72.42.123.1 | 3 | 4 | 196.49.14.1 | 1 | 0.342 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064520 | acc-gh | 72.42.124.1 | 3 | 4 | 196.49.14.1 | 1 | 0.341 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064522 | acc-gh | 72.42.125.1 | 3 | 4 | 196.49.14.1 | 1 | 0.335 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064524 | acc-gh | 72.42.126.1 | 3 | 4 | 196.49.14.1 | 1 | 0.330 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
| 17064526 | acc-gh | 72.42.127.1 | 3 | 4 | 196.49.14.1 | 1 | 0.326 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | True | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
1472998 rows × 29 columns
In [111]:
# how many are invalid latency neighbors
invalids = combined_df[~combined_df['is_neighbor'] | (combined_df['pop_rtt'] > TRACEROUTE_MAX_LATENCY)]
invalids
Out[111]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | nearest_airport_lat | nearest_airport_lon | nearest_airport_name | nearest_airport_iata | distance_to_nearest_airport_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19208202 | puw-us | 23.11.40.7 | 47 | 60 | 23.203.145.249 | 43 | 289.470 | 219 | 23.203.145.249 | ae34.r03.border101.sea01.fab.netarch.akamai.com | ... | 20940 | Seattle | 47.449001 | -122.308998 | False | 47.529999 | -122.302002 | Boeing Field King County International Airport | BFI | 9.021869 |
| 3886303 | hkg5-cn | 23.11.40.7 | 44 | 56 | 23.40.189.1 | 43 | 453.065 | 28 | 23.40.189.1 | vlan100.r01.tor01.hkg01.fab.netarch.akamai.com | ... | 20940 | Hong Kong | 22.320304 | 114.198074 | False | 22.308889 | 113.914722 | Chek Lap Kok International Airport | HKG | 29.175437 |
| 16388104 | bed-us | 23.11.38.17 | 39 | 52 | 23.198.9.65 | 38 | 340.269 | 31 | 23.198.9.65 | vlan100.r05.tor01.bos01.fab.netarch.akamai.com | ... | 20940 | Boston | 42.364300 | -71.005203 | False | 42.364300 | -71.005203 | General Edward Lawrence Logan International Airport | BOS | 0.000033 |
| 2510835 | ind-us | 23.11.38.17 | 38 | 49 | 154.24.97.54 | 37 | 462.866 | 223 | NaN | NaN | ... | 174 | Indianapolis | 39.768379 | -86.158043 | False | 39.717300 | -86.294403 | Indianapolis International Airport | IND | 12.968696 |
| 19661446 | cpv-br | 23.11.38.17 | 40 | 52 | 23.32.63.57 | 37 | 590.487 | 225 | 23.32.63.57 | ae4.r01.gru01.icn.netarch.akamai.com | ... | 20940 | Sao Paulo | -23.435556 | -46.473057 | False | -23.509100 | -46.637798 | Campo de Marte Airport | SAO | 18.686953 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14509083 | ams5-nl | 199.232.205.0 | 3 | 4 | 213.46.228.254 | 1 | 7.781 | 255 | 213.46.228.254 | nl-ams05a-re1-fe-2-0.aorta.net | ... | 6830 | Amsterdam | 52.308601 | 4.763890 | False | 52.308601 | 4.763890 | Amsterdam Airport Schiphol | AMX | 0.000045 |
| 19007616 | scl3-cl | 34.34.193.1 | 5 | 8 | 200.0.206.73 | 1 | 1.199 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 19059509 | scl3-cl | 172.217.60.0 | 4 | 6 | 200.0.206.73 | 1 | 1.648 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 19045998 | scl3-cl | 136.124.0.150 | 5 | 8 | 200.0.206.73 | 1 | 1.131 | 255 | 200.0.206.73 | dnsp.redclara.net | ... | 27750 | <NA> | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
| 16999300 | acc-gh | 23.0.0.1 | 2 | 2 | 196.49.14.1 | 1 | 0.349 | 255 | NaN | NaN | ... | 30997 | Accra | 5.556020 | -0.196900 | False | 5.605190 | -0.166786 | Kotoka International Airport | ACC | 6.403097 |
1876960 rows × 29 columns
In [112]:
# plot for invalids
rtt_data = invalids['pop_rtt'].dropna()
distance_data = invalids['distance_to_nearest_airport_km'].dropna()
def calculate_cdf(series):
"""Calculates sorted data and cumulative probabilities for a CDF."""
data_sorted = np.sort(series)
n = data_sorted.size
y_cdf = np.arange(1, n + 1) / n
return data_sorted, y_cdf
fig, axes = plt.subplots(figsize=(14, 6))
x_dist, y_dist_cdf = calculate_cdf(distance_data)
ax = axes
ax.plot(x_dist, y_dist_cdf, drawstyle='steps-post', linestyle='-')
ax.set_xlabel('Distance (km)')
ax.set_ylabel('Cumulative Probability')
ax.grid(True, linestyle='--', alpha=0.6)
ax.set_ylim(0, 1)
ax.set_xlim(1, 4_000)
ax.set_xscale('log')
plt.tight_layout()
plt.savefig("./plots/phop_distances_log.pdf", bbox_inches='tight', pad_inches=0)
plt.show()
Investigate probing cost¶
In [113]:
# infer hops adjacent to the VP with bogon addresses to be in the same ASN
df = traceroutes_df
MISSING_ASN_VALUE = '-'
df.sort_values(['tx_hostname', 'dst', 'hop_probe_ttl'], inplace=True)
# find valid AS hops
df['asn_is_valid'] = df['hop_asn'] != MISSING_ASN_VALUE
# Identify the initial segment before the first valid ASN within each group
df['seen_valid_asn'] = df.groupby(['tx_hostname', 'dst'])['asn_is_valid'].cummax()
# determine which rows need filling
rows_to_fill_mask = (df['hop_asn'] == MISSING_ASN_VALUE) & (~df['seen_valid_asn'])
# fill rows
df.loc[rows_to_fill_mask, 'hop_asn'] = df.loc[rows_to_fill_mask, 'vp_asn']
df.drop(columns=['asn_is_valid', 'seen_valid_asn'], inplace=True)
In [114]:
# fill in hop ASNs (hops in-between two hops with the same AS are inferred to be in that same AS, if they have no AS data)
df['hop_asn_temp'] = df['hop_asn'].replace(MISSING_ASN_VALUE, np.nan)
df['asn_ffill'] = df.groupby(['tx_hostname', 'dst'])['hop_asn_temp'].ffill()
df['asn_bfill'] = df.groupby(['tx_hostname', 'dst'])['hop_asn_temp'].bfill()
fill_condition = (
(df['hop_asn'] == MISSING_ASN_VALUE) & # Original was missing
(df['asn_ffill'] == df['asn_bfill']) & # Surrounding valid ASNs match
(df['asn_ffill'].notna()) # The matching ASN is a valid number
)
df.loc[fill_condition, 'hop_asn'] = df.loc[fill_condition, 'asn_ffill']
df.drop(columns=['hop_asn_temp', 'asn_ffill', 'asn_bfill'], inplace=True)
df.head()
Out[114]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | ip4_address | hop_name | ... | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_asn | hop_city | hop_lat | hop_lon | is_neighbor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1324098 | abz2-uk | 1.0.0.0 | 11 | 11 | 137.50.19.1 | 1 | 0.318 | 255 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | ... | 786 | abz | 13.506 | 32.492793 | 1380.024115 | 786 | Aberdeen | 57.143688 | -2.09814 | False |
| 1324537 | abz2-uk | 1.0.0.0 | 11 | 12 | 137.50.19.1 | 1 | 0.326 | 255 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | ... | 786 | abz | 13.506 | 33.310222 | 1380.024115 | 786 | Aberdeen | 57.143688 | -2.09814 | False |
| 1325067 | abz2-uk | 1.0.0.0 | 11 | 11 | 137.50.19.1 | 1 | 0.348 | 255 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | ... | 786 | abz | 13.506 | 35.558151 | 1380.024115 | 786 | Aberdeen | 57.143688 | -2.09814 | False |
| 1325895 | abz2-uk | 1.0.0.0 | 11 | 11 | 137.50.19.1 | 1 | 0.336 | 255 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | ... | 786 | abz | 13.506 | 34.332008 | 1380.024115 | 786 | Aberdeen | 57.143688 | -2.09814 | False |
| 1327020 | abz2-uk | 1.0.0.0 | 11 | 11 | 137.50.19.1 | 1 | 0.314 | 255 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | ... | 786 | abz | 13.506 | 32.084079 | 1380.024115 | 786 | Aberdeen | 57.143688 | -2.09814 | False |
5 rows × 24 columns
In [115]:
# what is the average hop count?
df.groupby(['tx_hostname', 'dst'])['hop_count'].first().mean()
Out[115]:
9.828091099985004
In [116]:
# what is the average hop count (< 12 ms)
df[df['pop_rtt'] < TRACEROUTE_MAX_LATENCY].groupby(['tx_hostname', 'dst'])['hop_count'].first().mean()
Out[116]:
8.40202352968762
In [117]:
# average number of hops within the same AS as the VP
df[df['hop_asn'] == df['vp_asn']].groupby(['tx_hostname', 'dst']).size().mean()
Out[117]:
2.86832625940674
In [118]:
# average number of VPs per target
traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
Out[118]:
244.19898070622497
In [119]:
# average number of VPs within 12 ms per target
traceroutes_df[traceroutes_df['pop_rtt'] < TRACEROUTE_MAX_LATENCY].groupby('dst')['tx_hostname'].nunique().mean()
Out[119]:
124.46592733109587
In [120]:
# plot number of hops against number of PoPs
traceroute_lengths = df.groupby(['tx_hostname', 'prefix'])['hop_count'].first()
avg_hop_counts_per_prefix = traceroute_lengths.groupby(level='prefix').mean().rename('avg_hop_count')
site_counts_per_prefix = merged_df[['prefix', 'TR']].drop_duplicates(subset=['prefix']).set_index('prefix')['TR']
plot_data = pd.concat([avg_hop_counts_per_prefix, site_counts_per_prefix], axis=1, join='inner')
plt.figure(figsize=(10, 6))
plt.scatter(plot_data['TR'], plot_data['avg_hop_count'], alpha=0.1)
plt.xlabel("PoPs detected")
plt.ylabel("Average hop count")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig("./plots/hop_count_scatter.pdf", bbox_inches='tight', pad_inches=0)
plt.show()
In [121]:
# plot latency against hop count
# get traceroute lengths of each VP, target pair
traceroute_lengths = df.groupby(['tx_hostname', 'prefix'])['hop_count'].first()
avg_hop_counts_per_prefix = traceroute_lengths.groupby(level='prefix').mean().rename('avg_hop_count')
# get RTT towards each PoP
pop_rtts = df.groupby(['tx_hostname', 'prefix'])['pop_rtt'].first()
avg_rtts_per_prefix = pop_rtts.groupby(level='prefix').mean().rename('avg_pop_rtt') # Calculate average RTT
# combine ttl lengths and average rtt
plot_data = pd.concat([avg_hop_counts_per_prefix, avg_rtts_per_prefix], axis=1, join='inner')
print(plot_data)
avg_hop_count avg_pop_rtt prefix 1.0.0.0/24 7.802920 9.607292 1.1.1.0/24 7.703846 10.929742 1.10.10.0/24 14.062500 208.615852 1.12.0.0/24 14.458333 102.723392 1.12.12.0/24 13.632979 90.069745 ... ... ... 99.83.254.0/24 11.747368 15.629442 99.83.255.0/24 11.773196 15.709402 99.99.97.0/24 12.940171 75.167333 99.99.98.0/24 13.879310 85.667448 99.99.99.0/24 13.923077 82.285115 [13735 rows x 2 columns]
In [122]:
plt.figure(figsize=(10, 6))
plt.scatter(plot_data['avg_pop_rtt'], plot_data['avg_hop_count'], alpha=0.1)
plt.xlabel("Latency to PoP (ms)")
plt.xscale('log', base=2)
plt.xlim(1, 1000)
plt.ylabel("Average hop count")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig("./plots/latency_hop_count.pdf", bbox_inches='tight', pad_inches=0)
plt.show()
Geolocation¶
In [12]:
# Helper Functions
def i_greedy_location_to_df(file_path):
# Load JSON data
with open(file_path, 'r') as f:
data = list(json.load(f))
rows = []
for element in data:
for entry in element['instances']:
row = []
row.append(element["prefix"])
row.append(entry['marker']['latitude'])
row.append(entry['marker']['longitude'])
row.append(entry['marker']['city'])
row.append(entry['marker']['code_country'])
rows.append(row)
return pd.DataFrame(rows, columns= ['Prefix', 'Latitude','Longitude','City', 'Country' ]).drop_duplicates()
def root_server_location_to_df(file_path):
# Load JSON data
with open(file_path, 'r') as f:
data = dict(json.load(f))
rows = []
for entry in data['Sites']:
row = []
row.append(data["IPv4"])
row.append(entry['Latitude'])
row.append(entry['Longitude'])
row.append(entry['Town'])
row.append(entry['Country'])
rows.append(row)
return pd.DataFrame(rows, columns= ['IPv4', 'Latitude','Longitude','City', 'Country' ]).drop_duplicates()
In [19]:
# Import the iGreedy Locations and Ground Truth Locations
i_greedy_locations = i_greedy_location_to_df('./data/2025-04-19_v4_locations.json') # Can be found here: https://github.com/anycast-census/anycast-census/tree/main/2025/04/03
root_server_letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M',]
root_server_location_dict = {}
for letter in root_server_letters:
root_server_location_dict[letter] = root_server_location_to_df(f"./data/root_server_locations/{letter}.json") # Can be found here: https://root-servers.org/
# And have a list of them to loop over
addresses = [
'198.41.0.4',
'170.247.170.2',
'192.33.4.12',
'199.7.91.13',
'192.203.230.10',
'192.5.5.0',
#'192.112.36.4', # g-root unresponsive to ICMP
'198.97.190.53',
'192.36.148.17',
'192.58.128.30',
'193.0.14.129',
'199.7.83.42',
'202.12.27.33']
In [30]:
traceroute_means = []
traceroute_medians = []
traceroute_enumeration = []
traceroute_full_list = []
igreedy_means = []
igreedy_medians = []
igreedy_enumeration = []
igreedy_full_list = []
#Loop over all responsive root servers and pick the right ground truth
for address in addresses:
if address == '198.41.0.4':
groundtruth_df = root_server_location_dict['A']
elif address == '170.247.170.2':
groundtruth_df = root_server_location_dict['B']
elif address == '192.33.4.12':
groundtruth_df = root_server_location_dict['C']
elif address == '199.7.91.13':
groundtruth_df = root_server_location_dict['D']
elif address == '192.203.230.10':
groundtruth_df = root_server_location_dict['E']
elif address == '192.5.5.0':
groundtruth_df = root_server_location_dict['F']
elif address == '192.112.36.4':
groundtruth_df = root_server_location_dict['G']
elif address == '198.97.190.53':
groundtruth_df = root_server_location_dict['H']
elif address == '192.36.148.17':
groundtruth_df = root_server_location_dict['I']
elif address == '192.58.128.30':
groundtruth_df = root_server_location_dict['J']
elif address == '193.0.14.129':
groundtruth_df = root_server_location_dict['K']
elif address == '199.7.83.42':
groundtruth_df = root_server_location_dict['L']
elif address == '202.12.27.33':
groundtruth_df = root_server_location_dict['M']
else:
break
prefix = ".".join(address.split(".")[:3]) + ".0/24"
# Get the right locations from the respective datasets
traceroute_df = final_df[final_df['dst']==address].dropna()
traceroute_df['location'] = traceroute_df.apply(lambda row: (row['inferred_airport_lat'], row['inferred_airport_lon'], row['inferred_airport_name']), axis=1)
traceroute_locs = traceroute_df['location'].unique()
i_greedy_df = i_greedy_locations[i_greedy_locations['Prefix']==prefix].dropna()
i_greedy_df['location'] = i_greedy_df.apply(lambda row: (row['Latitude'], row['Longitude'], row['City']), axis=1)
i_greedy_locs = i_greedy_df['location'].unique()
groundtruth_df['location'] = groundtruth_df.apply(lambda row: (row['Latitude'], row['Longitude'], row['City']), axis=1)
gt_locs = groundtruth_df['location'].unique()
# Calculate the distance for all found locations to the closest ground truth locations for Traceroute and iGreedy
tcd_list = []
for loc in traceroute_locs:
closest_distance =1000000
adr = None
for gt_loc in gt_locs:
distance_between = distance.geodesic((loc[0], loc[1]),(gt_loc[0], gt_loc[1])).km
if distance_between < closest_distance:
closest_distance = distance_between
adr = (loc[2], gt_loc[2])
tcd_list.append(closest_distance)
traceroute_means.append(statistics.mean(tcd_list))
traceroute_medians.append(statistics.median(tcd_list))
traceroute_enumeration.append(len(tcd_list)/len(gt_locs))
icd_list = []
for loc in i_greedy_locs:
closest_distance =1000000
adr = None
for gt_loc in gt_locs:
distance_between = distance.geodesic((loc[0], loc[1]),(gt_loc[0], gt_loc[1])).km
if distance_between < closest_distance:
closest_distance = distance_between
adr = (loc[2], gt_loc[2])
icd_list.append(closest_distance)
igreedy_means.append(statistics.mean(icd_list))
igreedy_medians.append(statistics.median(icd_list))
igreedy_enumeration.append(len(icd_list)/len(gt_locs))
# Plot the results
fig, (ax1, ax2) = plt.subplots(1,2)
ax1.boxplot([traceroute_enumeration, igreedy_enumeration])
ax1.set_xticklabels(["TR", "GCD"])
ax1.set_title('Enumeration')
ax2.boxplot([traceroute_means, traceroute_medians, igreedy_means, igreedy_medians])
ax2.set_title('Distance to GT in km')
ax2.set_xticklabels(["TR \n Mean", "TR \n Median", "GCD \n Mean", "GCD \n Median"])
print(f"Plot for Traceroute Radius with {DISK_DISTANCE_KM}km")
plt.show()
Plot for Traceroute Radius with 100km