""" src/utils/generate_base_data.py ================================ Generates realistic synthetic historical base datasets for model training. Simulates 18 months of aviation disruption data (Jan 2024 – Jun 2025) with realistic correlations between conflict signals, disruptions, and prices. Run: python -m src.utils.generate_base_data """ import numpy as np import pandas as pd from pathlib import Path from datetime import datetime, timedelta import sys SEED = 42 rng = np.random.default_rng(SEED) PROCESSED_DIR = Path(__file__).parent.parent.parent / "data" / "processed" PROCESSED_DIR.mkdir(parents=True, exist_ok=True) # ── Helper ──────────────────────────────────────────────────────────────────── def _sin_wave(n, period, amplitude=1.0, phase=0.0): """Create a sinusoidal pattern.""" return amplitude * np.sin(2 * np.pi * np.arange(n) / period + phase) # ── Conflict Events ─────────────────────────────────────────────────────────── CONFLICT_EVENTS = [ # (start_day_offset, duration_days, intensity, label) (0, 30, 0.8, "Iran-US escalation Jan 2024"), (45, 15, 0.6, "Gaza airspace closure Feb 2024"), (90, 20, 0.7, "Ukraine FIR disruptions Mar 2024"), (140, 10, 0.5, "Yemen Houthi attacks Apr 2024"), (180, 25, 0.9, "Iran-Israel direct exchange May 2024"), (220, 12, 0.5, "Pakistan-India tensions Jun 2024"), (270, 30, 0.7, "Middle East escalation Sep 2024"), (320, 20, 0.6, "Ukraine winter offensive Nov 2024"), (380, 15, 0.8, "Iran-US tensions Jan 2025"), (420, 40, 1.0, "Major conflict peak Feb-Mar 2025"), (470, 25, 0.7, "Post-conflict recovery Apr 2025"), (510, 20, 0.6, "Regional tensions May 2025"), ] AIRPORTS = [ ("OIII", "Tehran", "IR", "Middle East"), ("OMDB", "Dubai", "AE", "Middle East"), ("LLBG", "Tel Aviv", "IL", "Middle East"), ("HECA", "Cairo", "EG", "Middle East"), ("OJAM", "Amman", "JO", "Middle East"), ("UKBB", "Kyiv", "UA", "Eastern Europe"), ("UUEE", "Moscow", "RU", "Eastern Europe"), ("EPWA", "Warsaw", "PL", "Eastern Europe"), ("LHBP", "Budapest", "HU", "Eastern Europe"), ("OPKC", "Karachi", "PK", "South Asia"), ("VIDP", "Delhi", "IN", "South Asia"), ("EGLL", "London", "GB", "Western Europe"), ("LFPG", "Paris", "FR", "Western Europe"), ("EDDF", "Frankfurt", "DE", "Western Europe"), ("EHAM", "Amsterdam", "NL", "Western Europe"), ] AIRLINES = [ ("EK", "Emirates", "AE", "Middle East"), ("EY", "Etihad", "AE", "Middle East"), ("QR", "Qatar Airways", "QA", "Middle East"), ("TK", "Turkish Airlines", "TR", "Turkey"), ("LH", "Lufthansa", "DE", "Western Europe"), ("BA", "British Airways", "GB", "Western Europe"), ("AF", "Air France", "FR", "Western Europe"), ("PS", "Ukraine Int'l", "UA", "Eastern Europe"), ("PK", "Pakistan Int'l", "PK", "South Asia"), ("IR", "Iran Air", "IR", "Middle East"), ("AY", "Finnair", "FI", "Western Europe"), ("KL", "KLM", "NL", "Western Europe"), ] ROUTES = [ ("LHR", "DXB", "LH-ME", 350, 650), ("CDG", "DXB", "LH-ME", 330, 620), ("FRA", "DXB", "LH-ME", 320, 610), ("JFK", "DXB", "NA-ME", 580, 950), ("LHR", "TLV", "LH-ME", 280, 550), ("CDG", "TLV", "LH-ME", 260, 530), ("LHR", "BKK", "LH-AS", 420, 780), ("LHR", "KHI", "LH-SA", 310, 600), ("DXB", "DEL", "ME-SA", 180, 380), ("IST", "DXB", "ME-ME", 150, 320), ] def build_conflict_signal(n_days: int) -> np.ndarray: """Build a day-level conflict intensity signal (0..1).""" signal = np.zeros(n_days) for start, dur, intensity, _ in CONFLICT_EVENTS: if start >= n_days: continue end = min(start + dur, n_days) # Ramp up / plateau / ramp down ramp = min(5, dur // 3) for d in range(start, end): offset = d - start if offset < ramp: signal[d] = intensity * offset / ramp elif offset > dur - ramp: signal[d] = intensity * (dur - offset) / ramp else: signal[d] = intensity # Add noise signal += rng.normal(0, 0.05, n_days) return np.clip(signal, 0, 1) # ── Generate Flight Disruptions ─────────────────────────────────────────────── def generate_flight_disruptions() -> pd.DataFrame: """ One row per (airport, 6-hour period) over 18 months. ~15 airports × 4 periods/day × 548 days ≈ 32,880 rows. """ start_date = datetime(2024, 1, 1) n_days = 548 # Jan 2024 – Jun 2025 periods_per_day = 4 # 00:00, 06:00, 12:00, 18:00 conflict_signal = build_conflict_signal(n_days) # Oil price simulation: Brent crude, realistic range $70–$100 oil_base = 82.0 oil_trend = np.linspace(0, 10, n_days) # slight upward trend oil_cycle = _sin_wave(n_days, 60, amplitude=8) oil_shock = np.zeros(n_days) for start, dur, intensity, _ in CONFLICT_EVENTS: if start < n_days: end = min(start + dur, n_days) oil_shock[start:end] += intensity * 12 # conflict → oil spike oil_price_daily = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days) oil_price_daily = np.clip(oil_price_daily, 60, 130) records = [] for day_idx in range(n_days): date = start_date + timedelta(days=day_idx) conflict = conflict_signal[day_idx] oil = oil_price_daily[day_idx] oil_prev7 = oil_price_daily[max(0, day_idx - 7)] oil_change_pct = (oil - oil_prev7) / oil_prev7 * 100 for period in range(periods_per_day): hour = period * 6 ts = date + timedelta(hours=hour) for airport_code, airport_name, country, region in AIRPORTS: # Regional conflict modifier is_conflict_region = region in ["Middle East", "Eastern Europe", "South Asia"] regional_mult = 2.5 if is_conflict_region else 0.8 # Cancellation rate (0..1) cancel_base = 0.05 + conflict * regional_mult * 0.35 cancel_noise = rng.beta(1.5, 8) * 0.15 cancellation_rate = np.clip(cancel_base + cancel_noise, 0, 0.95) # Delay minutes delay_base = 15 + conflict * regional_mult * 80 delay_minutes = max(0, rng.normal(delay_base, 10)) # Airspace risk score (0..4) if is_conflict_region: risk_raw = conflict * 4 * regional_mult * 0.7 airspace_risk_score = min(4.0, risk_raw + rng.uniform(-0.3, 0.3)) else: airspace_risk_score = rng.uniform(0, 0.8) # Sentiment score (higher = more negative news) sentiment_base = conflict * regional_mult * 80 sentiment_score = np.clip( rng.normal(sentiment_base, 10), -100, 100 ) # Number of conflict events in region conflict_event_count = int( rng.poisson(conflict * regional_mult * 5) ) # Fuel pressure indicator fuel_pressure = (oil_change_pct / 20 + conflict * 0.3) * 50 fuel_pressure_indicator = np.clip(fuel_pressure, 0, 100) # Disruption index (composite, 0..100) disruption_index = np.clip( cancellation_rate * 40 + delay_minutes / 200 * 30 + airspace_risk_score / 4 * 20 + conflict * regional_mult * 10, 0, 100 ) # Airport stress score (0..100) airport_stress_score = np.clip( disruption_index * 0.7 + airspace_risk_score * 5 + rng.normal(0, 3), 0, 100 ) # Binary target: is_high_disruption is_high_disruption = int(disruption_index > 50) records.append({ "timestamp": ts.isoformat(), "date": date.strftime("%Y-%m-%d"), "hour": hour, "airport_code": airport_code, "airport_name": airport_name, "country": country, "region": region, "conflict_active": int(conflict > 0.3), "conflict_intensity": round(conflict, 4), "conflict_event_count": conflict_event_count, "cancellation_rate": round(cancellation_rate, 4), "avg_delay_minutes": round(delay_minutes, 1), "cancellation_rate_24h": round(cancellation_rate, 4), "avg_delay_24h": round(delay_minutes, 1), "airspace_risk_score": round(airspace_risk_score, 3), "sentiment_score": round(sentiment_score, 2), "sentiment_momentum": round(rng.normal(0, 5), 2), "oil_price": round(oil, 2), "oil_price_change_pct": round(oil_change_pct, 3), "fuel_pressure_indicator": round(fuel_pressure_indicator, 2), "disruption_index": round(disruption_index, 2), "disruption_index_lag6h": round(disruption_index * rng.uniform(0.85, 1.0), 2), "airport_stress_score": round(airport_stress_score, 2), "is_high_disruption": is_high_disruption, }) df = pd.DataFrame(records) print(f"Flight disruptions: {len(df):,} rows | positive rate: {df.is_high_disruption.mean():.2%}") return df # ── Generate Flight Prices ──────────────────────────────────────────────────── def generate_flight_prices() -> pd.DataFrame: """ One row per (route, week) over 18 months with realistic price drivers. ~10 routes × 78 weeks ≈ 780 rows. """ start_date = datetime(2024, 1, 1) n_weeks = 78 n_days = n_weeks * 7 conflict_signal = build_conflict_signal(n_days) oil_base = 82.0 oil_price_daily = ( oil_base + np.linspace(0, 10, n_days) + _sin_wave(n_days, 60, 8) + rng.normal(0, 1.5, n_days) ) records = [] for week_idx in range(n_weeks): day_idx = week_idx * 7 date = start_date + timedelta(days=day_idx) conflict = conflict_signal[day_idx] oil = oil_price_daily[day_idx] oil_prev = oil_price_daily[max(0, day_idx - 14)] oil_change_pct = (oil - oil_prev) / oil_prev * 100 sentiment_score = conflict * 70 + rng.normal(0, 8) sentiment_momentum = rng.normal(0, 5) for origin, dest, region_type, price_base, price_max in ROUTES: is_conflict_route = "ME" in region_type route_conflict_flag = int(is_conflict_route and conflict > 0.4) # Price model: base + oil spike + conflict premium + seasonality seasonal = _sin_wave(n_weeks, 52, 40)[week_idx] oil_premium = oil_change_pct * (2.5 if is_conflict_route else 1.2) conflict_premium = conflict * (120 if is_conflict_route else 40) demand_shock = rng.normal(0, 25) price = (price_base + seasonal + oil_premium + conflict_premium + demand_shock) price = np.clip(price, price_base * 0.7, price_max * 1.4) disruption_index = conflict * (2.5 if is_conflict_route else 0.8) * 50 disruption_index = np.clip(disruption_index + rng.normal(0, 5), 0, 100) fuel_pressure_indicator = np.clip( (oil_change_pct / 20 + conflict * 0.3) * 50 + rng.normal(0, 3), 0, 100 ) for days_to_dep in [7, 14, 30, 60]: booking_premium = max(0, (30 - days_to_dep) * 2.5) final_price = price + booking_premium + rng.normal(0, 15) final_price = max(80, final_price) records.append({ "timestamp": date.isoformat(), "week": date.strftime("%Y-W%U"), "origin": origin, "destination": dest, "route": f"{origin}-{dest}", "region_type": region_type, "route_conflict_flag": route_conflict_flag, "days_to_departure": days_to_dep, "day_of_week": date.weekday(), "price_usd": round(final_price, 2), "oil_price": round(oil, 2), "oil_price_change_pct": round(oil_change_pct, 3), "disruption_index": round(disruption_index, 2), "cancellation_rate_24h": round(conflict * 0.3 + rng.uniform(0, 0.1), 4), "sentiment_score": round(sentiment_score, 2), "sentiment_momentum": round(sentiment_momentum, 2), "fuel_pressure_indicator": round(fuel_pressure_indicator, 2), "conflict_intensity": round(conflict, 4), }) df = pd.DataFrame(records) print(f"Flight prices: {len(df):,} rows | price range: ${df.price_usd.min():.0f}–${df.price_usd.max():.0f}") return df # ── Generate Oil Prices ─────────────────────────────────────────────────────── def generate_oil_prices() -> pd.DataFrame: """Daily Brent crude oil prices, Jan 2024 – Jun 2025.""" n_days = 548 start = datetime(2024, 1, 1) conflict_signal = build_conflict_signal(n_days) oil_base = 82.0 oil_trend = np.linspace(0, 10, n_days) oil_cycle = _sin_wave(n_days, 60, 8) oil_shock = np.array([ sum(intensity * 12 for s, dur, intensity, _ in CONFLICT_EVENTS if s <= d < s + dur) for d in range(n_days) ]) prices = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days) prices = np.clip(prices, 60, 130) records = [] for i, p in enumerate(prices): date = start + timedelta(days=i) prev = prices[max(0, i - 1)] pct = (p - prev) / prev * 100 records.append({ "date": date.strftime("%Y-%m-%d"), "brent_usd": round(p, 2), "wti_usd": round(p - rng.uniform(2, 5), 2), "pct_change": round(pct, 3), "rolling_7d_avg": round(np.mean(prices[max(0, i - 7):i + 1]), 2), "rolling_30d_avg": round(np.mean(prices[max(0, i - 30):i + 1]), 2), "conflict_intensity": round(conflict_signal[i], 4), }) df = pd.DataFrame(records) print(f"Oil prices: {len(df):,} days") return df # ── Generate Airspace Risk ──────────────────────────────────────────────────── def generate_airspace_risk() -> pd.DataFrame: """Curated airspace risk snapshots, updated weekly.""" RISK_LEVELS = ["No Advisory", "Exercise Caution", "Increased Caution", "Avoid if Possible", "Do Not Fly"] RISK_MAP = {r: i for i, r in enumerate(RISK_LEVELS)} countries = [ ("IR", "Iran", "Middle East"), ("IQ", "Iraq", "Middle East"), ("IL", "Israel", "Middle East"), ("YE", "Yemen", "Middle East"), ("SY", "Syria", "Middle East"), ("UA", "Ukraine", "Eastern Europe"), ("RU", "Russia", "Eastern Europe"), ("PK", "Pakistan", "South Asia"), ("ET", "Ethiopia", "Africa"), ("LY", "Libya", "Africa"), ] SAFE_COUNTRIES = [ ("DE", "Germany", "Western Europe"), ("FR", "France", "Western Europe"), ("GB", "United Kingdom", "Western Europe"), ("US", "United States", "North America"), ("AU", "Australia", "Asia-Pacific"), ] start = datetime(2024, 1, 1) n_weeks = 78 conflict_signal = build_conflict_signal(n_weeks * 7) records = [] for week_idx in range(n_weeks): day = start + timedelta(weeks=week_idx) conflict = conflict_signal[week_idx * 7] for code, name, region in countries: risk_float = min(4, conflict * 4 * 1.2 + rng.uniform(-0.5, 0.5)) risk_idx = max(0, min(4, int(risk_float))) risk_level = RISK_LEVELS[risk_idx] records.append({ "timestamp": day.isoformat(), "country_code": code, "country_name": name, "region": region, "risk_level": risk_level, "risk_score": risk_idx, "description": f"{risk_level}: based on current conflict activity", "source": "SafeAirspace", "is_conflict_affected": 1, }) for code, name, region in SAFE_COUNTRIES: records.append({ "timestamp": day.isoformat(), "country_code": code, "country_name": name, "region": region, "risk_level": "No Advisory", "risk_score": 0, "description": "No active advisories", "source": "SafeAirspace", "is_conflict_affected": 0, }) df = pd.DataFrame(records) print(f"Airspace risk: {len(df):,} rows") return df # ── Generate Sentiment (GDELT-style) ───────────────────────────────────────── def generate_sentiment() -> pd.DataFrame: """Simulated GDELT news sentiment scores per region, every 6 hours.""" n_days = 548 start = datetime(2024, 1, 1) conflict_signal = build_conflict_signal(n_days) regions = ["Middle East", "Eastern Europe", "South Asia", "Global"] region_mults = {"Middle East": 1.5, "Eastern Europe": 1.2, "South Asia": 1.0, "Global": 0.7} records = [] for day_idx in range(n_days): for hour in [0, 6, 12, 18]: ts = start + timedelta(days=day_idx, hours=hour) conflict = conflict_signal[day_idx] for region in regions: mult = region_mults[region] # GDELT tone: negative = bad news (0 = neutral, negative = conflict) tone_base = -conflict * mult * 5 tone = tone_base + rng.normal(0, 0.8) article_count = max(1, int(rng.poisson(20 + conflict * mult * 40))) records.append({ "timestamp": ts.isoformat(), "region": region, "tone_avg": round(tone, 3), "article_count": article_count, "sentiment_score": round(-tone * 10, 2), # positive = more conflict news "conflict_intensity": round(conflict, 4), }) df = pd.DataFrame(records) # Add sentiment momentum df = df.sort_values(["region", "timestamp"]).reset_index(drop=True) df["sentiment_momentum"] = df.groupby("region")["sentiment_score"].diff().fillna(0) print(f"Sentiment: {len(df):,} rows") return df # ── Main ────────────────────────────────────────────────────────────────────── def main(): print("=" * 60) print("Generating synthetic historical base datasets...") print("=" * 60) datasets = { "flight_disruptions.csv": generate_flight_disruptions, "flight_prices.csv": generate_flight_prices, "oil_prices.csv": generate_oil_prices, "airspace_risk.csv": generate_airspace_risk, "sentiment.csv": generate_sentiment, } for filename, generator in datasets.items(): print(f"\n→ {filename}") df = generator() out = PROCESSED_DIR / filename df.to_csv(out, index=False) print(f" Saved: {out}") print("\n✓ All base datasets generated successfully.") print(f" Location: {PROCESSED_DIR}") if __name__ == "__main__": # ── Bootstrap guard ─────────────────────────────────────────────────────── # This script generates SYNTHETIC data for initial development/testing only. # The real pipeline reads from data/base/ (real Kaggle CSVs). # Only run this with the --bootstrap flag to avoid accidentally overwriting # or bypassing real data. # # Usage: python -m src.utils.generate_base_data --bootstrap # if "--bootstrap" not in sys.argv: print("ERROR: Refusing to run without --bootstrap flag.") print(" This script generates synthetic data and should NOT be") print(" used as the default training source.") print("") print(" Run: python -m src.utils.generate_base_data --bootstrap") print(" to explicitly opt in to synthetic data generation.") sys.exit(1) main()