| """ |
| src/utils/generate_base_data.py |
| ================================ |
| Generates realistic synthetic historical base datasets for model training. |
| Simulates 18 months of aviation disruption data (Jan 2024 – Jun 2025) |
| with realistic correlations between conflict signals, disruptions, and prices. |
| |
| Run: python -m src.utils.generate_base_data --bootstrap |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| from pathlib import Path |
| from datetime import datetime, timedelta |
| import sys |
|
|
# Fixed seed so repeated runs of the generators reproduce the same datasets.
SEED = 42

# One module-wide generator. Every function below draws from this shared
# stream, so the order in which the generators run affects the values each
# of them sees.
rng = np.random.default_rng(SEED)

# Output directory: "data/processed" under the directory three levels above
# this file.
_BASE_DIR = Path(__file__).parent.parent.parent
PROCESSED_DIR = _BASE_DIR / "data" / "processed"

# Created at import time so writers never have to check for it.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
|
|
| |
|
|
| def _sin_wave(n, period, amplitude=1.0, phase=0.0): |
| """Create a sinusoidal pattern.""" |
| return amplitude * np.sin(2 * np.pi * np.arange(n) / period + phase) |
|
|
|
|
| |
|
|
# Scripted conflict timeline that drives every generator in this module.
# Each entry is (start_day, duration_days, intensity, label):
#   start_day      offset in days from the start of the simulated series
#   duration_days  number of days the event stays active
#   intensity      plateau strength of the event (values here are 0.5..1.0)
#   label          human-readable description; discarded by every consumer
#                  in this file
# NOTE(review): some labels name an earlier month than the day offset implies
# when counted from 2024-01-01 (e.g. day 180 is late June 2024) — confirm
# whether the labels or the offsets are authoritative.
CONFLICT_EVENTS = [
    # --- 2024 ---
    (0, 30, 0.8, "Iran-US escalation Jan 2024"),
    (45, 15, 0.6, "Gaza airspace closure Feb 2024"),
    (90, 20, 0.7, "Ukraine FIR disruptions Mar 2024"),
    (140, 10, 0.5, "Yemen Houthi attacks Apr 2024"),
    (180, 25, 0.9, "Iran-Israel direct exchange May 2024"),
    (220, 12, 0.5, "Pakistan-India tensions Jun 2024"),
    (270, 30, 0.7, "Middle East escalation Sep 2024"),
    (320, 20, 0.6, "Ukraine winter offensive Nov 2024"),
    # --- 2025 ---
    (380, 15, 0.8, "Iran-US tensions Jan 2025"),
    (420, 40, 1.0, "Major conflict peak Feb-Mar 2025"),
    (470, 25, 0.7, "Post-conflict recovery Apr 2025"),
    (510, 20, 0.6, "Regional tensions May 2025"),
]
|
|
# Airports covered by the disruption dataset.
# Each entry is (airport_code, airport_name, country, region); the region
# decides how strongly an airport reacts to the conflict signal.
AIRPORTS = [
    # Middle East
    ("OIII", "Tehran", "IR", "Middle East"),
    ("OMDB", "Dubai", "AE", "Middle East"),
    ("LLBG", "Tel Aviv", "IL", "Middle East"),
    ("HECA", "Cairo", "EG", "Middle East"),
    ("OJAM", "Amman", "JO", "Middle East"),
    # Eastern Europe
    ("UKBB", "Kyiv", "UA", "Eastern Europe"),
    ("UUEE", "Moscow", "RU", "Eastern Europe"),
    ("EPWA", "Warsaw", "PL", "Eastern Europe"),
    ("LHBP", "Budapest", "HU", "Eastern Europe"),
    # South Asia
    ("OPKC", "Karachi", "PK", "South Asia"),
    ("VIDP", "Delhi", "IN", "South Asia"),
    # Western Europe
    ("EGLL", "London", "GB", "Western Europe"),
    ("LFPG", "Paris", "FR", "Western Europe"),
    ("EDDF", "Frankfurt", "DE", "Western Europe"),
    ("EHAM", "Amsterdam", "NL", "Western Europe"),
]
|
|
# Airline reference table.
# Entries look like (airline_code, airline_name, country, region).
# NOTE(review): nothing in the visible part of this module reads AIRLINES —
# confirm whether another module imports it before changing or removing it.
AIRLINES = [
    # Gulf / Middle East carriers
    ("EK", "Emirates", "AE", "Middle East"),
    ("EY", "Etihad", "AE", "Middle East"),
    ("QR", "Qatar Airways", "QA", "Middle East"),
    ("TK", "Turkish Airlines", "TR", "Turkey"),
    # Western European carriers
    ("LH", "Lufthansa", "DE", "Western Europe"),
    ("BA", "British Airways", "GB", "Western Europe"),
    ("AF", "Air France", "FR", "Western Europe"),
    # Carriers based in conflict-affected countries
    ("PS", "Ukraine Int'l", "UA", "Eastern Europe"),
    ("PK", "Pakistan Int'l", "PK", "South Asia"),
    ("IR", "Iran Air", "IR", "Middle East"),
    # Further Western European carriers
    ("AY", "Finnair", "FI", "Western Europe"),
    ("KL", "KLM", "NL", "Western Europe"),
]
|
|
# Routes covered by the price dataset.
# Each entry is (origin, destination, region_type, price_base, price_max).
# A route counts as conflict-exposed when its region_type contains "ME";
# price_base and price_max bound the simulated fare before booking effects.
ROUTES = [
    # Routes into Dubai
    ("LHR", "DXB", "LH-ME", 350, 650),
    ("CDG", "DXB", "LH-ME", 330, 620),
    ("FRA", "DXB", "LH-ME", 320, 610),
    ("JFK", "DXB", "NA-ME", 580, 950),
    # Routes into Tel Aviv
    ("LHR", "TLV", "LH-ME", 280, 550),
    ("CDG", "TLV", "LH-ME", 260, 530),
    # Long-haul routes out of London
    ("LHR", "BKK", "LH-AS", 420, 780),
    ("LHR", "KHI", "LH-SA", 310, 600),
    # Regional routes touching Dubai
    ("DXB", "DEL", "ME-SA", 180, 380),
    ("IST", "DXB", "ME-ME", 150, 320),
]
|
|
|
|
def build_conflict_signal(
    n_days: int,
    *,
    events=None,
    noise_std: float = 0.05,
    generator=None,
) -> np.ndarray:
    """Build a day-level conflict intensity signal clipped to the range 0..1.

    Each event contributes a trapezoid: a linear ramp up from zero, a plateau
    at the event's intensity, and a linear ramp back down. The ramp length is
    ``min(5, duration // 3)`` days. Where events overlap, the strongest
    contribution wins, so the zero-valued first day of a later event can no
    longer erase an event that is still active. Gaussian noise is added on
    top before clipping.

    Args:
        n_days: Length of the signal in days.
        events: Iterable of ``(start_day, duration_days, intensity, label)``
            tuples with non-negative intensities. Defaults to the module-level
            ``CONFLICT_EVENTS``.
        noise_std: Standard deviation of the additive noise.
        generator: NumPy random generator used for the noise. Defaults to the
            module-level ``rng``, so a default call advances that shared
            stream by ``n_days`` normal draws.

    Returns:
        Array of shape ``(n_days,)`` with values in ``[0, 1]``.
    """
    timeline = CONFLICT_EVENTS if events is None else events
    noise_source = rng if generator is None else generator

    signal = np.zeros(n_days)
    for start, dur, intensity, _label in timeline:
        # Keep only the part of the event that falls inside the window.
        first = max(start, 0)
        last = min(start + dur, n_days)
        if first >= last:
            continue

        ramp = min(5, dur // 3)
        # Day offsets measured from the event start, so a truncated event
        # keeps the shape it would have had in full.
        offsets = np.arange(first - start, last - start)
        envelope = np.full(offsets.shape, float(intensity))
        if ramp > 0:
            rising = offsets < ramp
            falling = ~rising & (offsets > dur - ramp)
            envelope[rising] = intensity * offsets[rising] / ramp
            envelope[falling] = intensity * (dur - offsets[falling]) / ramp

        # Strongest active event wins on overlapping days.
        window = signal[first:last]
        np.maximum(window, envelope, out=window)

    signal += noise_source.normal(0, noise_std, n_days)
    return np.clip(signal, 0, 1)
|
|
|
|
| |
|
|
def generate_flight_disruptions() -> pd.DataFrame:
    """Simulate per-airport disruption metrics at 6-hour resolution.

    One row per (airport, 6-hour period) over 548 days starting 2024-01-01:
    15 airports × 4 periods/day × 548 days = 32,880 rows.

    A single daily conflict signal and a daily oil-price series drive every
    airport. Airports in the Middle East, Eastern Europe and South Asia react
    with a multiplier of 2.5; all other airports use 0.8.

    Notes:
        * ``cancellation_rate_24h`` and ``avg_delay_24h`` repeat the
          current-period values, and ``disruption_index_lag6h`` is the current
          index scaled by a random factor drawn from 0.85..1.0. None of them
          is a true rolling or lagged aggregate.
        * ``airspace_risk_score`` is clamped to 0..4 on both sides, so the
          jitter can no longer push it below zero on calm days.
        * All randomness comes from the module-level ``rng``; the order of
          draws per row is kept fixed so seeded output stays reproducible.

    Returns:
        DataFrame with one row per airport and period. ``is_high_disruption``
        is 1 when ``disruption_index`` exceeds 50; its mean is printed as the
        positive rate.
    """
    start_date = datetime(2024, 1, 1)
    n_days = 548
    periods_per_day = 4
    conflict_regions = {"Middle East", "Eastern Europe", "South Asia"}

    conflict_signal = build_conflict_signal(n_days)

    # Daily oil price: base + slow upward trend + 60-day cycle + a step shock
    # while each conflict event is active + noise, clipped to 60..130.
    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)
    oil_cycle = _sin_wave(n_days, 60, amplitude=8)
    oil_shock = np.zeros(n_days)
    for ev_start, ev_dur, ev_intensity, _label in CONFLICT_EVENTS:
        if ev_start < n_days:
            ev_end = min(ev_start + ev_dur, n_days)
            oil_shock[ev_start:ev_end] += ev_intensity * 12
    oil_price_daily = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    oil_price_daily = np.clip(oil_price_daily, 60, 130)

    # Region membership never changes, so resolve it once per airport.
    airports = [
        (code, name, country, region, region in conflict_regions)
        for code, name, country, region in AIRPORTS
    ]

    records = []
    for day_idx in range(n_days):
        date = start_date + timedelta(days=day_idx)
        date_label = date.strftime("%Y-%m-%d")
        conflict = conflict_signal[day_idx]
        conflict_active = int(conflict > 0.3)
        oil = oil_price_daily[day_idx]
        # Percentage change against the price seven days earlier (or against
        # day 0 during the first week).
        oil_prev7 = oil_price_daily[max(0, day_idx - 7)]
        oil_change_pct = (oil - oil_prev7) / oil_prev7 * 100

        # Depends only on the day, so it is computed once for all airports.
        fuel_pressure = (oil_change_pct / 20 + conflict * 0.3) * 50
        fuel_pressure_indicator = np.clip(fuel_pressure, 0, 100)

        for period in range(periods_per_day):
            hour = period * 6
            ts = date + timedelta(hours=hour)

            for airport_code, airport_name, country, region, is_conflict_region in airports:
                regional_mult = 2.5 if is_conflict_region else 0.8
                exposure = conflict * regional_mult

                # Cancellation rate: 5% floor plus conflict effect plus
                # right-skewed noise.
                cancel_base = 0.05 + exposure * 0.35
                cancel_noise = rng.beta(1.5, 8) * 0.15
                cancellation_rate = np.clip(cancel_base + cancel_noise, 0, 0.95)

                # Average delay in minutes, never negative.
                delay_base = 15 + exposure * 80
                delay_minutes = max(0, rng.normal(delay_base, 10))

                # Airspace risk on a 0..4 scale.
                if is_conflict_region:
                    risk_raw = conflict * 4 * regional_mult * 0.7
                    airspace_risk_score = np.clip(
                        risk_raw + rng.uniform(-0.3, 0.3), 0.0, 4.0
                    )
                else:
                    airspace_risk_score = rng.uniform(0, 0.8)

                sentiment_base = exposure * 80
                sentiment_score = np.clip(
                    rng.normal(sentiment_base, 10), -100, 100
                )

                conflict_event_count = int(rng.poisson(exposure * 5))

                # Composite 0..100 index: cancellations (40), delays (30),
                # airspace risk (20) and direct conflict exposure (10).
                disruption_index = np.clip(
                    cancellation_rate * 40
                    + delay_minutes / 200 * 30
                    + airspace_risk_score / 4 * 20
                    + exposure * 10,
                    0,
                    100,
                )

                airport_stress_score = np.clip(
                    disruption_index * 0.7
                    + airspace_risk_score * 5
                    + rng.normal(0, 3),
                    0,
                    100,
                )

                is_high_disruption = int(disruption_index > 50)

                records.append({
                    "timestamp": ts.isoformat(),
                    "date": date_label,
                    "hour": hour,
                    "airport_code": airport_code,
                    "airport_name": airport_name,
                    "country": country,
                    "region": region,
                    "conflict_active": conflict_active,
                    "conflict_intensity": round(conflict, 4),
                    "conflict_event_count": conflict_event_count,
                    "cancellation_rate": round(cancellation_rate, 4),
                    "avg_delay_minutes": round(delay_minutes, 1),
                    "cancellation_rate_24h": round(cancellation_rate, 4),
                    "avg_delay_24h": round(delay_minutes, 1),
                    "airspace_risk_score": round(airspace_risk_score, 3),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(rng.normal(0, 5), 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "disruption_index": round(disruption_index, 2),
                    "disruption_index_lag6h": round(disruption_index * rng.uniform(0.85, 1.0), 2),
                    "airport_stress_score": round(airport_stress_score, 2),
                    "is_high_disruption": is_high_disruption,
                })

    df = pd.DataFrame(records)
    print(f"Flight disruptions: {len(df):,} rows | positive rate: {df.is_high_disruption.mean():.2%}")
    return df
|
|
|
|
| |
|
|
def generate_flight_prices() -> pd.DataFrame:
    """Simulate weekly fares per route and booking horizon.

    One row per (route, week, days-to-departure) over 78 weeks starting
    2024-01-01: 10 routes × 78 weeks × 4 booking horizons = 3,120 rows.

    The fare combines the route's base price with a 52-week seasonal wave, an
    oil-price premium, a conflict premium and random demand noise, and is
    clipped to ``[0.7 * price_base, 1.4 * price_max]`` before the booking
    premium and per-quote noise are added. Routes whose ``region_type``
    contains ``"ME"`` react more strongly to oil and conflict.

    Notes:
        * The oil series includes the conflict shock and the 60..130 clip
          used by the other oil-price builders in this module, so oil moves
          with conflict here as well.
        * ``day_of_week`` is the same for every row because the weekly
          snapshots are exactly seven days apart.
        * All randomness comes from the module-level ``rng``; the order of
          draws is kept fixed so seeded output stays reproducible.

    Returns:
        DataFrame with one row per route, week and booking horizon.
    """
    start_date = datetime(2024, 1, 1)
    n_weeks = 78
    n_days = n_weeks * 7
    booking_horizons = (7, 14, 30, 60)

    conflict_signal = build_conflict_signal(n_days)

    # Daily oil price: base + trend + 60-day cycle + conflict shock + noise.
    oil_base = 82.0
    oil_shock = np.zeros(n_days)
    for ev_start, ev_dur, ev_intensity, _label in CONFLICT_EVENTS:
        if ev_start < n_days:
            oil_shock[ev_start:min(ev_start + ev_dur, n_days)] += ev_intensity * 12
    oil_price_daily = (
        oil_base
        + np.linspace(0, 10, n_days)
        + _sin_wave(n_days, 60, 8)
        + oil_shock
        + rng.normal(0, 1.5, n_days)
    )
    oil_price_daily = np.clip(oil_price_daily, 60, 130)

    # The seasonal fare swing is identical for all routes: compute it once.
    seasonal_by_week = _sin_wave(n_weeks, 52, 40)

    records = []
    for week_idx in range(n_weeks):
        day_idx = week_idx * 7
        date = start_date + timedelta(days=day_idx)
        conflict = conflict_signal[day_idx]
        oil = oil_price_daily[day_idx]
        # Percentage change against the price 14 days earlier (or day 0).
        oil_prev = oil_price_daily[max(0, day_idx - 14)]
        oil_change_pct = (oil - oil_prev) / oil_prev * 100

        sentiment_score = conflict * 70 + rng.normal(0, 8)
        sentiment_momentum = rng.normal(0, 5)
        seasonal = seasonal_by_week[week_idx]

        for origin, dest, region_type, price_base, price_max in ROUTES:
            is_conflict_route = "ME" in region_type
            route_conflict_flag = int(is_conflict_route and conflict > 0.4)

            oil_premium = oil_change_pct * (2.5 if is_conflict_route else 1.2)
            conflict_premium = conflict * (120 if is_conflict_route else 40)
            demand_shock = rng.normal(0, 25)

            price = (price_base + seasonal + oil_premium + conflict_premium +
                     demand_shock)
            price = np.clip(price, price_base * 0.7, price_max * 1.4)

            disruption_index = conflict * (2.5 if is_conflict_route else 0.8) * 50
            disruption_index = np.clip(disruption_index + rng.normal(0, 5), 0, 100)

            fuel_pressure_indicator = np.clip(
                (oil_change_pct / 20 + conflict * 0.3) * 50 + rng.normal(0, 3), 0, 100
            )

            for days_to_dep in booking_horizons:
                # Late bookings (inside 30 days) pay 2.5 per day of lateness.
                booking_premium = max(0, (30 - days_to_dep) * 2.5)
                final_price = price + booking_premium + rng.normal(0, 15)
                final_price = max(80, final_price)

                records.append({
                    "timestamp": date.isoformat(),
                    "week": date.strftime("%Y-W%U"),
                    "origin": origin,
                    "destination": dest,
                    "route": f"{origin}-{dest}",
                    "region_type": region_type,
                    "route_conflict_flag": route_conflict_flag,
                    "days_to_departure": days_to_dep,
                    "day_of_week": date.weekday(),
                    "price_usd": round(final_price, 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "disruption_index": round(disruption_index, 2),
                    "cancellation_rate_24h": round(conflict * 0.3 + rng.uniform(0, 0.1), 4),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(sentiment_momentum, 2),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "conflict_intensity": round(conflict, 4),
                })

    df = pd.DataFrame(records)
    print(f"Flight prices: {len(df):,} rows | price range: ${df.price_usd.min():.0f}–${df.price_usd.max():.0f}")
    return df
|
|
|
|
| |
|
|
def generate_oil_prices() -> pd.DataFrame:
    """Simulate daily Brent crude oil prices for 548 days from 2024-01-01.

    The Brent series is a base price plus a slow upward trend, a 60-day
    cycle, a step shock of ``12 * intensity`` while each conflict event is
    active, and Gaussian noise, clipped to 60..130. WTI is Brent minus a
    random discount between 2 and 5.

    The rolling averages are trailing windows of exactly 7 and 30 days that
    include the current day; during the first days they average whatever
    history exists.

    Returns:
        DataFrame with columns ``date``, ``brent_usd``, ``wti_usd``,
        ``pct_change`` (day over day, 0 for the first day),
        ``rolling_7d_avg``, ``rolling_30d_avg`` and ``conflict_intensity``.
    """
    n_days = 548
    start = datetime(2024, 1, 1)
    conflict_signal = build_conflict_signal(n_days)

    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)
    oil_cycle = _sin_wave(n_days, 60, 8)
    oil_shock = np.zeros(n_days)
    for ev_start, ev_dur, ev_intensity, _label in CONFLICT_EVENTS:
        lo = max(ev_start, 0)
        hi = min(ev_start + ev_dur, n_days)
        if lo < hi:
            oil_shock[lo:hi] += ev_intensity * 12
    prices = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    prices = np.clip(prices, 60, 130)

    # Previous day's price; day 0 is compared with itself, giving 0% change.
    previous = np.concatenate((prices[:1], prices[:-1]))
    pct_change = (prices - previous) / previous * 100

    wti = prices - rng.uniform(2, 5, n_days)

    brent = pd.Series(prices)
    rolling_7d = brent.rolling(window=7, min_periods=1).mean()
    rolling_30d = brent.rolling(window=30, min_periods=1).mean()

    df = pd.DataFrame({
        "date": [(start + timedelta(days=i)).strftime("%Y-%m-%d") for i in range(n_days)],
        "brent_usd": np.round(prices, 2),
        "wti_usd": np.round(wti, 2),
        "pct_change": np.round(pct_change, 3),
        "rolling_7d_avg": rolling_7d.round(2).to_numpy(),
        "rolling_30d_avg": rolling_30d.round(2).to_numpy(),
        "conflict_intensity": np.round(conflict_signal, 4),
    })
    print(f"Oil prices: {len(df):,} days")
    return df
|
|
|
|
| |
|
|
def generate_airspace_risk() -> pd.DataFrame:
    """Simulate weekly airspace risk advisories per country.

    For each of 78 weeks starting 2024-01-01, ten conflict-affected countries
    get a risk level derived from that week's conflict intensity plus random
    jitter, and five reference countries always get "No Advisory":
    78 weeks × 15 countries = 1,170 rows.

    ``risk_score`` is the index (0..4) of ``risk_level`` in the ordered list
    of advisory levels. All conflict-affected countries share the same weekly
    conflict value and differ only through the jitter drawn from the
    module-level ``rng``.

    Returns:
        DataFrame with one row per country and week.
    """
    risk_levels = ["No Advisory", "Exercise Caution", "Increased Caution",
                   "Avoid if Possible", "Do Not Fly"]

    conflict_countries = [
        ("IR", "Iran", "Middle East"),
        ("IQ", "Iraq", "Middle East"),
        ("IL", "Israel", "Middle East"),
        ("YE", "Yemen", "Middle East"),
        ("SY", "Syria", "Middle East"),
        ("UA", "Ukraine", "Eastern Europe"),
        ("RU", "Russia", "Eastern Europe"),
        ("PK", "Pakistan", "South Asia"),
        ("ET", "Ethiopia", "Africa"),
        ("LY", "Libya", "Africa"),
    ]

    safe_countries = [
        ("DE", "Germany", "Western Europe"),
        ("FR", "France", "Western Europe"),
        ("GB", "United Kingdom", "Western Europe"),
        ("US", "United States", "North America"),
        ("AU", "Australia", "Asia-Pacific"),
    ]

    def _row(stamp, country, risk_idx, description, affected):
        """Assemble one advisory record."""
        code, name, region = country
        return {
            "timestamp": stamp,
            "country_code": code,
            "country_name": name,
            "region": region,
            "risk_level": risk_levels[risk_idx],
            "risk_score": risk_idx,
            "description": description,
            "source": "SafeAirspace",
            "is_conflict_affected": affected,
        }

    start = datetime(2024, 1, 1)
    n_weeks = 78
    conflict_signal = build_conflict_signal(n_weeks * 7)

    records = []
    for week_idx in range(n_weeks):
        stamp = (start + timedelta(weeks=week_idx)).isoformat()
        # Sample the daily signal on the first day of the week.
        conflict = conflict_signal[week_idx * 7]

        for country in conflict_countries:
            # Scale intensity onto the 0..4 advisory scale with jitter, then
            # clamp and truncate to a valid level index.
            risk_float = conflict * 4 * 1.2 + rng.uniform(-0.5, 0.5)
            risk_idx = int(min(4, max(0, risk_float)))
            level = risk_levels[risk_idx]
            records.append(
                _row(stamp, country, risk_idx,
                     f"{level}: based on current conflict activity", 1)
            )

        for country in safe_countries:
            records.append(_row(stamp, country, 0, "No active advisories", 0))

    df = pd.DataFrame(records)
    print(f"Airspace risk: {len(df):,} rows")
    return df
|
|
|
|
| |
|
|
def generate_sentiment() -> pd.DataFrame:
    """Simulate GDELT-style news sentiment per region every 6 hours.

    One row per (region, 6-hour timestamp) over 548 days starting 2024-01-01:
    4 regions × 4 periods/day × 548 days = 8,768 rows.

    ``tone_avg`` becomes more negative as conflict intensity rises, scaled by
    a per-region multiplier; ``sentiment_score`` is ``-10 * tone`` so higher
    values mean more negative coverage. ``article_count`` is Poisson
    distributed with a mean that grows with conflict, and is at least 1.

    The returned frame is sorted by region, then timestamp.
    ``sentiment_momentum`` is the change in ``sentiment_score`` from the
    previous timestamp within the same region (0 for each region's first
    row), rounded to 2 decimals like the score it is derived from.

    Returns:
        DataFrame with one row per region and timestamp.
    """
    n_days = 548
    start = datetime(2024, 1, 1)
    conflict_signal = build_conflict_signal(n_days)

    # How strongly each region's coverage reacts to the conflict signal.
    region_mults = {"Middle East": 1.5, "Eastern Europe": 1.2,
                    "South Asia": 1.0, "Global": 0.7}

    records = []
    for day_idx in range(n_days):
        conflict = conflict_signal[day_idx]
        conflict_rounded = round(conflict, 4)
        for hour in (0, 6, 12, 18):
            stamp = (start + timedelta(days=day_idx, hours=hour)).isoformat()
            for region, mult in region_mults.items():
                tone_base = -conflict * mult * 5
                tone = tone_base + rng.normal(0, 0.8)
                article_count = max(1, int(rng.poisson(20 + conflict * mult * 40)))
                records.append({
                    "timestamp": stamp,
                    "region": region,
                    "tone_avg": round(tone, 3),
                    "article_count": article_count,
                    "sentiment_score": round(-tone * 10, 2),
                    "conflict_intensity": conflict_rounded,
                })

    df = pd.DataFrame(records)
    # ISO timestamps sort chronologically as plain strings.
    df = df.sort_values(["region", "timestamp"]).reset_index(drop=True)
    df["sentiment_momentum"] = (
        df.groupby("region")["sentiment_score"].diff().fillna(0).round(2)
    )
    print(f"Sentiment: {len(df):,} rows")
    return df
|
|
|
|
| |
|
|
def main():
    """Generate every base dataset and write each one as CSV to PROCESSED_DIR.

    The generators run in a fixed order. They all draw from the shared
    module-level random generator, so changing the order changes the data
    each of them produces.
    """
    banner = "=" * 60
    print(banner)
    print("Generating synthetic historical base datasets...")
    print(banner)

    jobs = (
        ("flight_disruptions.csv", generate_flight_disruptions),
        ("flight_prices.csv", generate_flight_prices),
        ("oil_prices.csv", generate_oil_prices),
        ("airspace_risk.csv", generate_airspace_risk),
        ("sentiment.csv", generate_sentiment),
    )

    for filename, build in jobs:
        print(f"\n→ {filename}")
        frame = build()
        target = PROCESSED_DIR / filename
        frame.to_csv(target, index=False)
        print(f" Saved: {target}")

    print("\n✓ All base datasets generated successfully.")
    print(f" Location: {PROCESSED_DIR}")
|
|
|
|
if __name__ == "__main__":
    # Safety latch: the data written by this script is synthetic, so it only
    # runs when the caller opts in explicitly with --bootstrap.
    if "--bootstrap" in sys.argv:
        main()
    else:
        refusal = (
            "ERROR: Refusing to run without --bootstrap flag.",
            " This script generates synthetic data and should NOT be",
            " used as the default training source.",
            "",
            " Run: python -m src.utils.generate_base_data --bootstrap",
            " to explicitly opt in to synthetic data generation.",
        )
        for line in refusal:
            print(line)
        sys.exit(1)
|
|