Spaces:

bhanug2026
/

aviation-disruption-intelligence

Sleeping

File size: 21,701 Bytes

47c6cfd

"""
src/utils/generate_base_data.py
================================
Generates realistic synthetic historical base datasets for model training.
Simulates 18 months of aviation disruption data (Jan 2024 – Jun 2025)
with realistic correlations between conflict signals, disruptions, and prices.

Run: python -m src.utils.generate_base_data --bootstrap
"""

import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys

# Fixed seed so that a full run of this script is reproducible.
SEED = 42
# Single module-level generator shared by every function below. Because the
# stream is shared, the values a function draws depend on what was called
# before it (i.e. on call order).
rng = np.random.default_rng(SEED)

# Output directory: three directory levels above this file, then data/processed.
PROCESSED_DIR = Path(__file__).parent.parent.parent / "data" / "processed"
# NOTE: the directory is created at import time, not only when main() runs.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ── Helper ────────────────────────────────────────────────────────────────────

def _sin_wave(n, period, amplitude=1.0, phase=0.0):
    """Create a sinusoidal pattern."""
    return amplitude * np.sin(2 * np.pi * np.arange(n) / period + phase)


# ── Conflict Events ───────────────────────────────────────────────────────────

# Scripted conflict episodes that drive every generated dataset.
# Each entry is (start_day_offset, duration_days, intensity, label):
#   * start_day_offset is used as an index into the day-level arrays, whose
#     day 0 is 2024-01-01 in the generators below.
#   * intensity is the plateau value of the conflict signal for the episode.
#   * label is descriptive only; the code in this file unpacks it as `_`.
# NOTE(review): some label months do not match the offsets (e.g. offset 140
# falls on 2024-05-20, 180 on 2024-06-29, 220 on 2024-08-08) — confirm whether
# the offsets or the labels are the intended source of truth.
CONFLICT_EVENTS = [
    # (start_day_offset, duration_days, intensity, label)
    (0,   30,  0.8, "Iran-US escalation Jan 2024"),
    (45,  15,  0.6, "Gaza airspace closure Feb 2024"),
    (90,  20,  0.7, "Ukraine FIR disruptions Mar 2024"),
    (140, 10,  0.5, "Yemen Houthi attacks Apr 2024"),
    (180, 25,  0.9, "Iran-Israel direct exchange May 2024"),
    (220, 12,  0.5, "Pakistan-India tensions Jun 2024"),
    (270, 30,  0.7, "Middle East escalation Sep 2024"),
    (320, 20,  0.6, "Ukraine winter offensive Nov 2024"),
    (380, 15,  0.8, "Iran-US tensions Jan 2025"),
    (420, 40,  1.0, "Major conflict peak Feb-Mar 2025"),
    (470, 25,  0.7, "Post-conflict recovery Apr 2025"),
    (510, 20,  0.6, "Regional tensions May 2025"),
]

# Airports covered by the flight-disruption dataset.
# Each entry is (airport_code, airport_name, country, region); the region
# decides whether the airport gets the conflict-region multiplier.
# Codes here are 4-letter (ICAO-style), unlike the 3-letter codes in ROUTES.
AIRPORTS = [
    ("OIII", "Tehran", "IR", "Middle East"),
    ("OMDB", "Dubai", "AE", "Middle East"),
    ("LLBG", "Tel Aviv", "IL", "Middle East"),
    ("HECA", "Cairo", "EG", "Middle East"),
    ("OJAM", "Amman", "JO", "Middle East"),
    ("UKBB", "Kyiv", "UA", "Eastern Europe"),
    ("UUEE", "Moscow", "RU", "Eastern Europe"),
    ("EPWA", "Warsaw", "PL", "Eastern Europe"),
    ("LHBP", "Budapest", "HU", "Eastern Europe"),
    ("OPKC", "Karachi", "PK", "South Asia"),
    ("VIDP", "Delhi", "IN", "South Asia"),
    ("EGLL", "London", "GB", "Western Europe"),
    ("LFPG", "Paris", "FR", "Western Europe"),
    ("EDDF", "Frankfurt", "DE", "Western Europe"),
    ("EHAM", "Amsterdam", "NL", "Western Europe"),
]

# Airline reference table; each entry looks like
# (airline_code, airline_name, country, region).
# NOTE(review): not referenced by any generator visible in this file —
# confirm whether another module imports it before removing.
AIRLINES = [
    ("EK",  "Emirates",          "AE", "Middle East"),
    ("EY",  "Etihad",            "AE", "Middle East"),
    ("QR",  "Qatar Airways",     "QA", "Middle East"),
    ("TK",  "Turkish Airlines",  "TR", "Turkey"),
    ("LH",  "Lufthansa",         "DE", "Western Europe"),
    ("BA",  "British Airways",   "GB", "Western Europe"),
    ("AF",  "Air France",        "FR", "Western Europe"),
    ("PS",  "Ukraine Int'l",     "UA", "Eastern Europe"),
    ("PK",  "Pakistan Int'l",    "PK", "South Asia"),
    ("IR",  "Iran Air",          "IR", "Middle East"),
    ("AY",  "Finnair",           "FI", "Western Europe"),
    ("KL",  "KLM",               "NL", "Western Europe"),
]

# Routes priced by generate_flight_prices().
# Each entry is (origin, destination, region_type, price_base, price_max).
# A route counts as conflict-exposed when "ME" appears in region_type.
# price_base / price_max bound the simulated fare: the route price is clipped
# to [price_base * 0.7, price_max * 1.4] before the booking premium is added.
ROUTES = [
    ("LHR", "DXB", "LH-ME", 350, 650),
    ("CDG", "DXB", "LH-ME", 330, 620),
    ("FRA", "DXB", "LH-ME", 320, 610),
    ("JFK", "DXB", "NA-ME", 580, 950),
    ("LHR", "TLV", "LH-ME", 280, 550),
    ("CDG", "TLV", "LH-ME", 260, 530),
    ("LHR", "BKK", "LH-AS", 420, 780),
    ("LHR", "KHI", "LH-SA", 310, 600),
    ("DXB", "DEL", "ME-SA", 180, 380),
    ("IST", "DXB", "ME-ME", 150, 320),
]


def build_conflict_signal(n_days: int) -> np.ndarray:
    """Build a day-level conflict intensity signal in the range 0..1.

    Every entry of ``CONFLICT_EVENTS`` contributes a trapezoid: a linear
    ramp-up, a plateau at the event's intensity, and a linear ramp-down. The
    ramp length is ``min(5, duration // 3)`` days. Where events overlap, the
    strongest one wins (element-wise maximum), so a later event's ramp-up can
    no longer erase an earlier event's plateau. Gaussian noise (σ = 0.05) from
    the module-level ``rng`` is added and the result is clipped to [0, 1].

    Args:
        n_days: Length of the signal; events starting at or after this day
            are ignored and events running past it are truncated.

    Returns:
        Float array of shape ``(n_days,)``.
    """
    signal = np.zeros(n_days)
    for start, dur, intensity, _ in CONFLICT_EVENTS:
        if start >= n_days:
            continue
        end = min(start + dur, n_days)
        ramp = min(5, dur // 3)

        # Day offsets within the event (possibly truncated by n_days).
        offsets = np.arange(end - start)
        envelope = np.full(offsets.shape, float(intensity))
        if ramp > 0:  # very short events (dur < 3) have no ramps at all
            rising = offsets < ramp
            falling = offsets > dur - ramp
            envelope[rising] = intensity * offsets[rising] / ramp
            envelope[falling] = intensity * (dur - offsets[falling]) / ramp

        # Combine with any earlier, overlapping event instead of overwriting.
        signal[start:end] = np.maximum(signal[start:end], envelope)

    # Add noise
    signal += rng.normal(0, 0.05, n_days)
    return np.clip(signal, 0, 1)


# ── Generate Flight Disruptions ───────────────────────────────────────────────

def generate_flight_disruptions() -> pd.DataFrame:
    """
    Build the synthetic airport-level disruption dataset.

    One row per (airport, 6-hour period):
    15 airports × 4 periods/day × 548 days = 32,880 rows, starting 2024-01-01.

    Every metric is driven by the day-level conflict signal, scaled by a
    regional multiplier (2.5 for Middle East / Eastern Europe / South Asia
    airports, 0.8 otherwise), plus noise drawn from the module-level ``rng``.

    Notes:
        * ``cancellation_rate_24h`` and ``avg_delay_24h`` are copies of the
          per-period values, not true 24-hour aggregates.
        * ``disruption_index_lag6h`` is the current row's index scaled by a
          random factor in [0.85, 1.0), not the previous period's value.

    Returns:
        DataFrame with one record per (timestamp, airport), including the
        binary target ``is_high_disruption`` (``disruption_index > 50``).
        A one-line summary is printed as a side effect.
    """
    start_date = datetime(2024, 1, 1)
    n_days = 548  # Jan 2024 – Jun 2025 (last generated day is 2025-07-01)
    periods_per_day = 4  # 00:00, 06:00, 12:00, 18:00

    conflict_signal = build_conflict_signal(n_days)

    # Oil price simulation: Brent crude, trend + 60-day cycle + conflict shock.
    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)  # slight upward trend
    oil_cycle = _sin_wave(n_days, 60, amplitude=8)
    oil_shock = np.zeros(n_days)
    for start, dur, intensity, _ in CONFLICT_EVENTS:
        if start < n_days:
            end = min(start + dur, n_days)
            oil_shock[start:end] += intensity * 12  # conflict → oil spike
    oil_price_daily = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    oil_price_daily = np.clip(oil_price_daily, 60, 130)

    # Per-airport constants do not change over time, so resolve them once.
    conflict_regions = {"Middle East", "Eastern Europe", "South Asia"}
    airport_profiles = []
    for airport_code, airport_name, country, region in AIRPORTS:
        is_conflict_region = region in conflict_regions
        regional_mult = 2.5 if is_conflict_region else 0.8
        airport_profiles.append(
            (airport_code, airport_name, country, region,
             is_conflict_region, regional_mult)
        )

    records = []
    for day_idx in range(n_days):
        date = start_date + timedelta(days=day_idx)
        date_str = date.strftime("%Y-%m-%d")
        conflict = conflict_signal[day_idx]
        oil = oil_price_daily[day_idx]
        # Week-over-week oil change; the first 7 days compare against day 0.
        oil_prev7 = oil_price_daily[max(0, day_idx - 7)]
        oil_change_pct = (oil - oil_prev7) / oil_prev7 * 100

        # Fuel pressure depends only on the day, not on the airport.
        fuel_pressure = (oil_change_pct / 20 + conflict * 0.3) * 50
        fuel_pressure_indicator = np.clip(fuel_pressure, 0, 100)

        for period in range(periods_per_day):
            hour = period * 6
            ts_str = (date + timedelta(hours=hour)).isoformat()

            for (airport_code, airport_name, country, region,
                 is_conflict_region, regional_mult) in airport_profiles:
                # NOTE: the order of rng draws below is kept stable so the
                # random stream (and every later dataset) stays reproducible.

                # Cancellation rate (0..0.95)
                cancel_base = 0.05 + conflict * regional_mult * 0.35
                cancel_noise = rng.beta(1.5, 8) * 0.15
                cancellation_rate = np.clip(cancel_base + cancel_noise, 0, 0.95)

                # Delay minutes (never negative)
                delay_base = 15 + conflict * regional_mult * 80
                delay_minutes = max(0, rng.normal(delay_base, 10))

                # Airspace risk score (0..4). Clamp BOTH ends: with little
                # conflict the uniform jitter would otherwise push the score
                # below zero.
                if is_conflict_region:
                    risk_raw = conflict * 4 * regional_mult * 0.7
                    airspace_risk_score = np.clip(
                        risk_raw + rng.uniform(-0.3, 0.3), 0.0, 4.0
                    )
                else:
                    airspace_risk_score = rng.uniform(0, 0.8)

                # Sentiment score (higher = more negative news)
                sentiment_base = conflict * regional_mult * 80
                sentiment_score = np.clip(
                    rng.normal(sentiment_base, 10), -100, 100
                )

                # Number of conflict events in region
                conflict_event_count = int(
                    rng.poisson(conflict * regional_mult * 5)
                )

                # Disruption index (composite, 0..100)
                disruption_index = np.clip(
                    cancellation_rate * 40 + delay_minutes / 200 * 30 +
                    airspace_risk_score / 4 * 20 + conflict * regional_mult * 10,
                    0, 100
                )

                # Airport stress score (0..100)
                airport_stress_score = np.clip(
                    disruption_index * 0.7 + airspace_risk_score * 5 +
                    rng.normal(0, 3), 0, 100
                )

                # Binary target: is_high_disruption
                is_high_disruption = int(disruption_index > 50)

                records.append({
                    "timestamp": ts_str,
                    "date": date_str,
                    "hour": hour,
                    "airport_code": airport_code,
                    "airport_name": airport_name,
                    "country": country,
                    "region": region,
                    "conflict_active": int(conflict > 0.3),
                    "conflict_intensity": round(conflict, 4),
                    "conflict_event_count": conflict_event_count,
                    "cancellation_rate": round(cancellation_rate, 4),
                    "avg_delay_minutes": round(delay_minutes, 1),
                    "cancellation_rate_24h": round(cancellation_rate, 4),
                    "avg_delay_24h": round(delay_minutes, 1),
                    "airspace_risk_score": round(airspace_risk_score, 3),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(rng.normal(0, 5), 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "disruption_index": round(disruption_index, 2),
                    "disruption_index_lag6h": round(disruption_index * rng.uniform(0.85, 1.0), 2),
                    "airport_stress_score": round(airport_stress_score, 2),
                    "is_high_disruption": is_high_disruption,
                })

    df = pd.DataFrame(records)
    print(f"Flight disruptions: {len(df):,} rows | positive rate: {df.is_high_disruption.mean():.2%}")
    return df


# ── Generate Flight Prices ────────────────────────────────────────────────────

def generate_flight_prices() -> pd.DataFrame:
    """
    Build the synthetic route-level fare dataset.

    One row per (route, week, booking horizon):
    10 routes × 78 weeks × 4 days-to-departure values = 3,120 rows, with
    weekly snapshots starting 2024-01-01.

    The fare is ``price_base`` plus a 52-week seasonal wave, an oil premium
    (14-day oil price change), a conflict premium and random demand noise,
    clipped to ``[price_base * 0.7, price_max * 1.4]``. A late-booking premium
    and extra noise are then added per booking horizon, with a floor of 80.

    The oil series uses the same recipe as the other generators in this file
    (trend + 60-day cycle + conflict shock, clipped to 60..130) so that
    conflict episodes show up in ``oil_price`` here as well.

    Returns:
        DataFrame with one record per (week, route, days_to_departure).
        A one-line summary is printed as a side effect.
    """
    start_date = datetime(2024, 1, 1)
    n_weeks = 78
    n_days = n_weeks * 7

    conflict_signal = build_conflict_signal(n_days)

    # Conflict-driven oil spike, identical in form to the other generators.
    oil_shock = np.zeros(n_days)
    for start, dur, intensity, _ in CONFLICT_EVENTS:
        if start < n_days:
            end = min(start + dur, n_days)
            oil_shock[start:end] += intensity * 12

    oil_base = 82.0
    oil_price_daily = (
        oil_base
        + np.linspace(0, 10, n_days)
        + _sin_wave(n_days, 60, 8)
        + oil_shock
        + rng.normal(0, 1.5, n_days)
    )
    oil_price_daily = np.clip(oil_price_daily, 60, 130)

    # The seasonal wave is the same for every route: compute it once.
    seasonal_by_week = _sin_wave(n_weeks, 52, 40)

    records = []
    for week_idx in range(n_weeks):
        day_idx = week_idx * 7
        date = start_date + timedelta(days=day_idx)
        conflict = conflict_signal[day_idx]
        oil = oil_price_daily[day_idx]
        # Two-week oil change; the first two weeks compare against day 0.
        oil_prev = oil_price_daily[max(0, day_idx - 14)]
        oil_change_pct = (oil - oil_prev) / oil_prev * 100

        # Week-level sentiment, shared by every route in that week.
        sentiment_score = conflict * 70 + rng.normal(0, 8)
        sentiment_momentum = rng.normal(0, 5)

        seasonal = seasonal_by_week[week_idx]

        for origin, dest, region_type, price_base, price_max in ROUTES:
            # Routes touching the Middle East react more strongly to conflict.
            is_conflict_route = "ME" in region_type
            route_conflict_flag = int(is_conflict_route and conflict > 0.4)

            # Price model: base + oil spike + conflict premium + seasonality
            oil_premium = oil_change_pct * (2.5 if is_conflict_route else 1.2)
            conflict_premium = conflict * (120 if is_conflict_route else 40)
            demand_shock = rng.normal(0, 25)

            price = (price_base + seasonal + oil_premium + conflict_premium +
                     demand_shock)
            price = np.clip(price, price_base * 0.7, price_max * 1.4)

            disruption_index = conflict * (2.5 if is_conflict_route else 0.8) * 50
            disruption_index = np.clip(disruption_index + rng.normal(0, 5), 0, 100)

            fuel_pressure_indicator = np.clip(
                (oil_change_pct / 20 + conflict * 0.3) * 50 + rng.normal(0, 3), 0, 100
            )

            for days_to_dep in [7, 14, 30, 60]:
                # Bookings closer than 30 days out pay 2.5 USD per missing day.
                booking_premium = max(0, (30 - days_to_dep) * 2.5)
                final_price = price + booking_premium + rng.normal(0, 15)
                final_price = max(80, final_price)

                records.append({
                    "timestamp": date.isoformat(),
                    "week": date.strftime("%Y-W%U"),
                    "origin": origin,
                    "destination": dest,
                    "route": f"{origin}-{dest}",
                    "region_type": region_type,
                    "route_conflict_flag": route_conflict_flag,
                    "days_to_departure": days_to_dep,
                    "day_of_week": date.weekday(),
                    "price_usd": round(final_price, 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "disruption_index": round(disruption_index, 2),
                    "cancellation_rate_24h": round(conflict * 0.3 + rng.uniform(0, 0.1), 4),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(sentiment_momentum, 2),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "conflict_intensity": round(conflict, 4),
                })

    df = pd.DataFrame(records)
    print(f"Flight prices: {len(df):,} rows | price range: ${df.price_usd.min():.0f}–${df.price_usd.max():.0f}")
    return df


# ── Generate Oil Prices ───────────────────────────────────────────────────────

def generate_oil_prices() -> pd.DataFrame:
    """Build the synthetic daily oil price dataset (548 days from 2024-01-01).

    The Brent series is a base of 82 plus a linear trend, a 60-day cycle, a
    conflict shock (12 × intensity while an event is active) and Gaussian
    noise, clipped to 60..130. WTI is Brent minus a random 2–5 discount.

    The rolling averages are trailing windows that include the current day:
    exactly 7 and 30 observations once enough history exists, fewer at the
    start of the series.

    Returns:
        DataFrame with one record per day. A one-line summary is printed as a
        side effect.
    """
    n_days = 548
    start = datetime(2024, 1, 1)
    conflict_signal = build_conflict_signal(n_days)

    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)
    oil_cycle = _sin_wave(n_days, 60, 8)
    # Sum the shock of every event that is active on day d.
    oil_shock = np.array([
        sum(intensity * 12 for s, dur, intensity, _ in CONFLICT_EVENTS
            if s <= d < s + dur)
        for d in range(n_days)
    ])
    prices = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    prices = np.clip(prices, 60, 130)

    short_window = 7
    long_window = 30

    records = []
    for i, p in enumerate(prices):
        date = start + timedelta(days=i)
        prev = prices[max(0, i - 1)]  # day 0 compares against itself → 0 %
        pct = (p - prev) / prev * 100
        # A window of N days ending today starts at i - (N - 1), not i - N.
        week_slice = prices[max(0, i - short_window + 1):i + 1]
        month_slice = prices[max(0, i - long_window + 1):i + 1]
        records.append({
            "date": date.strftime("%Y-%m-%d"),
            "brent_usd": round(p, 2),
            "wti_usd": round(p - rng.uniform(2, 5), 2),
            "pct_change": round(pct, 3),
            "rolling_7d_avg": round(np.mean(week_slice), 2),
            "rolling_30d_avg": round(np.mean(month_slice), 2),
            "conflict_intensity": round(conflict_signal[i], 4),
        })

    df = pd.DataFrame(records)
    print(f"Oil prices: {len(df):,} days")
    return df


# ── Generate Airspace Risk ────────────────────────────────────────────────────

def generate_airspace_risk() -> pd.DataFrame:
    """Build weekly synthetic airspace risk snapshots.

    For each of 78 weeks starting 2024-01-01, emits one row per country:
    10 conflict-affected countries whose risk level follows the conflict
    signal on the first day of that week (plus uniform jitter), and 5 safe
    countries that are always "No Advisory" with score 0. That is 15 rows per
    week, 1,170 rows in total.

    Returns:
        DataFrame with one record per (week, country). A one-line summary is
        printed as a side effect.
    """
    # Index in this list is the numeric risk_score (0..4).
    risk_levels = ["No Advisory", "Exercise Caution", "Increased Caution",
                   "Avoid if Possible", "Do Not Fly"]
    top_level = len(risk_levels) - 1

    conflict_countries = [
        ("IR", "Iran", "Middle East"),
        ("IQ", "Iraq", "Middle East"),
        ("IL", "Israel", "Middle East"),
        ("YE", "Yemen", "Middle East"),
        ("SY", "Syria", "Middle East"),
        ("UA", "Ukraine", "Eastern Europe"),
        ("RU", "Russia", "Eastern Europe"),
        ("PK", "Pakistan", "South Asia"),
        ("ET", "Ethiopia", "Africa"),
        ("LY", "Libya", "Africa"),
    ]

    safe_countries = [
        ("DE", "Germany", "Western Europe"),
        ("FR", "France", "Western Europe"),
        ("GB", "United Kingdom", "Western Europe"),
        ("US", "United States", "North America"),
        ("AU", "Australia", "Asia-Pacific"),
    ]

    start = datetime(2024, 1, 1)
    n_weeks = 78
    conflict_signal = build_conflict_signal(n_weeks * 7)

    records = []
    for week_idx in range(n_weeks):
        stamp = (start + timedelta(weeks=week_idx)).isoformat()
        conflict = conflict_signal[week_idx * 7]

        for code, name, region in conflict_countries:
            # Scale 0..1 conflict to the 0..4 risk range, jitter, then
            # truncate to an integer level clamped to the valid indices.
            risk_float = conflict * 4 * 1.2 + rng.uniform(-0.5, 0.5)
            risk_idx = max(0, min(top_level, int(risk_float)))
            risk_level = risk_levels[risk_idx]
            records.append({
                "timestamp": stamp,
                "country_code": code,
                "country_name": name,
                "region": region,
                "risk_level": risk_level,
                "risk_score": risk_idx,
                "description": f"{risk_level}: based on current conflict activity",
                "source": "SafeAirspace",
                "is_conflict_affected": 1,
            })

        for code, name, region in safe_countries:
            records.append({
                "timestamp": stamp,
                "country_code": code,
                "country_name": name,
                "region": region,
                "risk_level": "No Advisory",
                "risk_score": 0,
                "description": "No active advisories",
                "source": "SafeAirspace",
                "is_conflict_affected": 0,
            })

    df = pd.DataFrame(records)
    print(f"Airspace risk: {len(df):,} rows")
    return df


# ── Generate Sentiment (GDELT-style) ─────────────────────────────────────────

def generate_sentiment() -> pd.DataFrame:
    """Build simulated GDELT-style news sentiment per region, every 6 hours.

    Covers 548 days from 2024-01-01, four timestamps per day and four regions
    (8,768 rows). Tone becomes more negative as the day's conflict level
    rises, scaled by a per-region multiplier; ``sentiment_score`` is the tone
    negated and multiplied by 10, and ``sentiment_momentum`` is its change
    from the previous timestamp within the same region (0 for the first).

    Returns:
        DataFrame sorted by region, then timestamp. A one-line summary is
        printed as a side effect.
    """
    total_days = 548
    origin = datetime(2024, 1, 1)
    daily_conflict = build_conflict_signal(total_days)

    # Region → how strongly it reacts to conflict (iteration order matters
    # for the order of random draws).
    weight_by_region = {
        "Middle East": 1.5,
        "Eastern Europe": 1.2,
        "South Asia": 1.0,
        "Global": 0.7,
    }

    rows = []
    for day_offset in range(total_days):
        conflict = daily_conflict[day_offset]
        for hour_of_day in (0, 6, 12, 18):
            stamp = origin + timedelta(days=day_offset, hours=hour_of_day)
            for region_name, mult in weight_by_region.items():
                # GDELT tone: 0 is neutral, negative means conflict news.
                tone = -conflict * mult * 5 + rng.normal(0, 0.8)
                n_articles = int(rng.poisson(20 + conflict * mult * 40))
                if n_articles < 1:
                    n_articles = 1
                rows.append({
                    "timestamp": stamp.isoformat(),
                    "region": region_name,
                    "tone_avg": round(tone, 3),
                    "article_count": n_articles,
                    # positive = more conflict news
                    "sentiment_score": round(-tone * 10, 2),
                    "conflict_intensity": round(conflict, 4),
                })

    frame = pd.DataFrame(rows)
    # Momentum needs each region's rows in chronological order.
    frame = frame.sort_values(["region", "timestamp"]).reset_index(drop=True)
    per_region_delta = frame.groupby("region")["sentiment_score"].diff()
    frame["sentiment_momentum"] = per_region_delta.fillna(0)
    print(f"Sentiment: {len(frame):,} rows")
    return frame


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Run every generator in order and write each result to PROCESSED_DIR as CSV."""
    banner = "=" * 60
    print(banner)
    print("Generating synthetic historical base datasets...")
    print(banner)

    # (output file name, generator) pairs; order fixes the shared RNG stream.
    jobs = [
        ("flight_disruptions.csv", generate_flight_disruptions),
        ("flight_prices.csv", generate_flight_prices),
        ("oil_prices.csv", generate_oil_prices),
        ("airspace_risk.csv", generate_airspace_risk),
        ("sentiment.csv", generate_sentiment),
    ]

    for filename, build in jobs:
        print(f"\n→ {filename}")
        out = PROCESSED_DIR / filename
        build().to_csv(out, index=False)
        print(f"  Saved: {out}")

    print("\n✓ All base datasets generated successfully.")
    print(f"  Location: {PROCESSED_DIR}")


if __name__ == "__main__":
    # ── Bootstrap guard ───────────────────────────────────────────────────────
    # This script generates SYNTHETIC data for initial development/testing only.
    # The real pipeline reads from data/base/ (real Kaggle CSVs).
    # Only run this with the --bootstrap flag to avoid accidentally overwriting
    # or bypassing real data.
    #
    # Usage:  python -m src.utils.generate_base_data --bootstrap
    #
    if "--bootstrap" not in sys.argv[1:]:
        # Diagnostics go to stderr so they are not mixed into piped stdout;
        # the non-zero exit status still signals the refusal to callers.
        print(
            "ERROR: Refusing to run without --bootstrap flag.\n"
            "       This script generates synthetic data and should NOT be\n"
            "       used as the default training source.\n"
            "\n"
            "       Run:  python -m src.utils.generate_base_data --bootstrap\n"
            "       to explicitly opt in to synthetic data generation.",
            file=sys.stderr,
        )
        sys.exit(1)
    main()