noodle-flights / data_scripts /build_planes.py
wyattmarshall's picture
Upload folder using huggingface_hub
9034f75 verified
#!/usr/bin/env python3
"""
Build a lean planes.json for the flight generator.
Sources
-------
- OpenFlights planes.dat (curated list of ~173 passenger aircraft)
- OpenFlights routes.dat (equipment field gives 3-letter IATA codes)
- OpenFlights airlines.dat (to ignore defunct airlines if desired)
Output (records-oriented JSON)
-----------------------------
[
{
"code" : "738",
"icao" : "B738",
"name" : "Boeing 737-800",
"routeCount": 4219,
"airlines" : { "AA": 0.18, "DL": 0.14, "WN": 0.12, ... }
},
...
]
"""
from pathlib import Path
import json
import numpy as np
import pandas as pd
# ────────── paths & params ──────────
# All paths are relative, so they resolve against the current working directory.
PLANES_FILE = Path("data_scripts/planes.dat")
ROUTES_FILE = Path("data_scripts/routes.dat")
RETIRED_AIRLINES_FILE = Path("data_scripts/retired_airlines.csv")
OUT_FILE = Path("data/planes.json")
KEEP_CODES_WITH_MIN_ROUTES = 5  # discard ultra-rare types (set None to keep all)
ROUND_DIGITS = 4  # decimals for airline weights

# ────────── load raw files ──────────
# Only "\N" and empty fields are treated as missing. pandas' built-in NA
# markers are disabled because they include plain strings such as "NA",
# which would otherwise turn a literal code "NA" into NaN and silently drop
# every route carrying it.
_NA_MARKERS = ["\\N", ""]

# Aircraft codes are read as text so all-digit codes (e.g. "738") can never
# be inferred as numbers and always compare equal to the route equipment codes.
planes_cols = ["name", "iata", "icao"]
planes = pd.read_csv(
    PLANES_FILE,
    header=None,
    names=planes_cols,
    dtype=str,
    na_values=_NA_MARKERS,
    keep_default_na=False,
)

routes_cols = [
    "airline_code", "airline_id",
    "src_iata", "src_id",
    "dst_iata", "dst_id",
    "codeshare", "stops",
    "equipment",
]
routes = pd.read_csv(
    ROUTES_FILE,
    header=None,
    names=routes_cols,
    # Only the two columns used downstream are pinned to text; the rest keep
    # pandas' inferred dtypes.
    dtype={"airline_code": str, "equipment": str},
    na_values=_NA_MARKERS,
    keep_default_na=False,
)

# Load retired airlines (the file has a header row with a "code" column).
retired_airlines = pd.read_csv(
    RETIRED_AIRLINES_FILE,
    dtype={"code": str},
    na_values=_NA_MARKERS,
    keep_default_na=False,
)
# Missing codes are left out of the set so it only holds real airline codes.
retired_codes = set(retired_airlines["code"].dropna())

# Filter out routes from retired airlines
routes = routes[~routes["airline_code"].isin(retired_codes)]
# ────────── explode equipment codes ──────────
# Drop routes lacking an equipment string or an airline code, then split the
# whitespace-separated equipment field into a list of aircraft codes.
# `assign` returns a new frame instead of writing a column onto a filtered
# one, which avoids pandas' SettingWithCopyWarning on versions without
# copy-on-write.
routes = (
    routes
    .dropna(subset=["equipment", "airline_code"])
    .assign(equipment_list=lambda frame: frame["equipment"].str.split())
)
# One output row per (route, aircraft code) pair.
routes_exploded = routes.explode("equipment_list", ignore_index=True)

# ────────── popularity proxy ──────────
# Number of exploded route rows per aircraft code, as columns
# "code" / "routeCount".
popularity = (
    routes_exploded["equipment_list"]
    .value_counts()
    .rename_axis("code")
    .reset_index(name="routeCount")
)

# ────────── airlines per aircraft ──────────
# Number of exploded route rows per (aircraft code, airline) pair, as columns
# "equipment_list" / "airline_code" / "flight_count".
equip_airline_counts = (
    routes_exploded
    .groupby(["equipment_list", "airline_code"])
    .size()
    .reset_index(name="flight_count")
)
def airlines_dict_for(code: str) -> dict:
    """Return each airline's share of the routes flown with aircraft *code*.

    Reads the module-level ``equip_airline_counts`` table and
    ``ROUND_DIGITS`` setting.

    Args:
        code: Aircraft code to look up in the ``equipment_list`` column.

    Returns:
        A dict mapping airline code to its fraction of the aircraft's total
        ``flight_count``, rounded to ``ROUND_DIGITS`` decimals. Entries are
        ordered by descending count, with ties broken by airline code so the
        output is reproducible. Returns an empty dict when the code has no
        rows.
    """
    subset = equip_airline_counts[equip_airline_counts["equipment_list"] == code]
    if subset.empty:
        return {}

    # Plain Python int so the shares below come out as built-in floats.
    total = int(subset["flight_count"].sum())
    ranked = subset.sort_values(
        ["flight_count", "airline_code"], ascending=[False, True]
    )
    # Iterate the two columns directly rather than building a Series per row.
    return {
        airline: round(count / total, ROUND_DIGITS)
        for airline, count in zip(
            ranked["airline_code"].tolist(), ranked["flight_count"].tolist()
        )
    }
# ────────── merge & tidy ──────────
# Keep one row per aircraft code and ignore entries without one.
planes = planes[planes["iata"].notna()].drop_duplicates(subset="iata")
# The left join keeps aircraft that never appear in the routes; the duplicate
# join key is then dropped and "iata" takes over the output name "code".
planes = (
    planes
    .merge(popularity, how="left", left_on="iata", right_on="code")
    .drop(columns=["code"])
    .rename(columns={"iata": "code"})
)
# Aircraft with no matching routes get a count of 0 instead of NaN.
planes["routeCount"] = planes["routeCount"].fillna(0).astype(int)

# optional trimming of rare types
# (a falsy threshold — None or 0 — keeps every aircraft)
if KEEP_CODES_WITH_MIN_ROUTES:
    planes = planes[planes["routeCount"] >= KEEP_CODES_WITH_MIN_ROUTES]

# Airline shares are attached after trimming so they are only computed for
# rows that are actually written out; `assign` avoids writing a column onto
# the filtered frame.
planes = planes.assign(airlines=planes["code"].apply(airlines_dict_for))

# ────────── save ──────────
# Create the output directory first so the write does not fail when it is missing.
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
planes.to_json(OUT_FILE, orient="records", indent=2)
print(f"Saved {len(planes):,} plane types → {OUT_FILE.resolve()}")