File size: 21,701 Bytes
47c6cfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
"""
src/utils/generate_base_data.py
================================
Generates realistic synthetic historical base datasets for model training.
Simulates 18 months of aviation disruption data (Jan 2024 – Jun 2025)
with realistic correlations between conflict signals, disruptions, and prices.

Run: python -m src.utils.generate_base_data --bootstrap  (the flag is required; without it the script exits with an error)
"""

import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys

# Fixed seed so that repeated runs of the script produce the same synthetic data.
SEED = 42
# One module-level generator shared by every generate_* function below.
# Because the random stream is shared, each dataset's values depend on the
# order in which the generators are called.
rng = np.random.default_rng(SEED)

# Output directory: three directory levels above this file, then data/processed.
PROCESSED_DIR = Path(__file__).parent.parent.parent / "data" / "processed"
# NOTE: this runs at import time (not only from main()), so merely importing
# the module creates the directory if it does not exist yet.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ── Helper ────────────────────────────────────────────────────────────────────

def _sin_wave(n, period, amplitude=1.0, phase=0.0):
    """Return ``n`` samples of a sine wave.

    Sample ``i`` equals ``amplitude * sin(2*pi*i / period + phase)``, so one
    full cycle spans ``period`` samples.
    """
    sample_index = np.arange(n)
    angle = 2 * np.pi * sample_index / period + phase
    return amplitude * np.sin(angle)


# ── Conflict Events ───────────────────────────────────────────────────────────

# Scripted conflict episodes that drive the conflict-intensity signal and the
# oil-price shocks. Offsets are day indices into the simulated series, which
# the generators below start at 2024-01-01. The label is never read by the
# code in this file (it is unpacked into `_`).
# NOTE(review): several label months do not match their offsets (e.g. offset
# 180 is 2024-06-29 but is labelled "May 2024") — harmless today, but confirm
# which of the two is intended.
CONFLICT_EVENTS = [
    # (start_day_offset, duration_days, intensity, label)
    (0,   30,  0.8, "Iran-US escalation Jan 2024"),
    (45,  15,  0.6, "Gaza airspace closure Feb 2024"),
    (90,  20,  0.7, "Ukraine FIR disruptions Mar 2024"),
    (140, 10,  0.5, "Yemen Houthi attacks Apr 2024"),
    (180, 25,  0.9, "Iran-Israel direct exchange May 2024"),
    (220, 12,  0.5, "Pakistan-India tensions Jun 2024"),
    (270, 30,  0.7, "Middle East escalation Sep 2024"),
    (320, 20,  0.6, "Ukraine winter offensive Nov 2024"),
    (380, 15,  0.8, "Iran-US tensions Jan 2025"),
    (420, 40,  1.0, "Major conflict peak Feb-Mar 2025"),
    (470, 25,  0.7, "Post-conflict recovery Apr 2025"),
    (510, 20,  0.6, "Regional tensions May 2025"),
]

# (airport_code, airport_name, country, region) — unpacked in this order by
# generate_flight_disruptions(). Codes look like 4-letter ICAO identifiers.
AIRPORTS = [
    ("OIII", "Tehran", "IR", "Middle East"),
    ("OMDB", "Dubai", "AE", "Middle East"),
    ("LLBG", "Tel Aviv", "IL", "Middle East"),
    ("HECA", "Cairo", "EG", "Middle East"),
    ("OJAM", "Amman", "JO", "Middle East"),
    ("UKBB", "Kyiv", "UA", "Eastern Europe"),
    ("UUEE", "Moscow", "RU", "Eastern Europe"),
    ("EPWA", "Warsaw", "PL", "Eastern Europe"),
    ("LHBP", "Budapest", "HU", "Eastern Europe"),
    ("OPKC", "Karachi", "PK", "South Asia"),
    ("VIDP", "Delhi", "IN", "South Asia"),
    ("EGLL", "London", "GB", "Western Europe"),
    ("LFPG", "Paris", "FR", "Western Europe"),
    ("EDDF", "Frankfurt", "DE", "Western Europe"),
    ("EHAM", "Amsterdam", "NL", "Western Europe"),
]

# Presumably (airline_code, airline_name, country, region).
# NOTE(review): no generator in this file reads AIRLINES — confirm whether it
# is used from another module or can be dropped.
AIRLINES = [
    ("EK",  "Emirates",          "AE", "Middle East"),
    ("EY",  "Etihad",            "AE", "Middle East"),
    ("QR",  "Qatar Airways",     "QA", "Middle East"),
    ("TK",  "Turkish Airlines",  "TR", "Turkey"),
    ("LH",  "Lufthansa",         "DE", "Western Europe"),
    ("BA",  "British Airways",   "GB", "Western Europe"),
    ("AF",  "Air France",        "FR", "Western Europe"),
    ("PS",  "Ukraine Int'l",     "UA", "Eastern Europe"),
    ("PK",  "Pakistan Int'l",    "PK", "South Asia"),
    ("IR",  "Iran Air",          "IR", "Middle East"),
    ("AY",  "Finnair",           "FI", "Western Europe"),
    ("KL",  "KLM",               "NL", "Western Europe"),
]

# (origin, destination, region_type, price_base, price_max) — unpacked in this
# order by generate_flight_prices(). A route counts as conflict-exposed there
# when its region_type contains "ME". Prices are emitted in the price_usd column.
ROUTES = [
    ("LHR", "DXB", "LH-ME", 350, 650),
    ("CDG", "DXB", "LH-ME", 330, 620),
    ("FRA", "DXB", "LH-ME", 320, 610),
    ("JFK", "DXB", "NA-ME", 580, 950),
    ("LHR", "TLV", "LH-ME", 280, 550),
    ("CDG", "TLV", "LH-ME", 260, 530),
    ("LHR", "BKK", "LH-AS", 420, 780),
    ("LHR", "KHI", "LH-SA", 310, 600),
    ("DXB", "DEL", "ME-SA", 180, 380),
    ("IST", "DXB", "ME-ME", 150, 320),
]


def build_conflict_signal(n_days: int, events=None, noise_std: float = 0.05,
                          generator=None) -> np.ndarray:
    """Build a day-level conflict intensity signal clipped to 0..1.

    Each event contributes a trapezoid: a linear ramp up, a plateau at the
    event's intensity, and a linear ramp down. The ramp length is
    ``min(5, duration // 3)`` days. Gaussian noise is then added to every day
    and the result is clipped to [0, 1].

    Args:
        n_days: Length of the signal in days.
        events: Iterable of ``(start_day_offset, duration_days, intensity,
            label)`` tuples. Defaults to the module-level ``CONFLICT_EVENTS``.
        noise_std: Standard deviation of the additive Gaussian noise.
        generator: ``numpy.random.Generator`` used for the noise. Defaults to
            the module-level ``rng`` so seeded runs stay reproducible.

    Returns:
        Float array of shape ``(n_days,)`` with values in [0, 1].
    """
    if events is None:
        events = CONFLICT_EVENTS
    if generator is None:
        generator = rng

    signal = np.zeros(n_days)
    for start, dur, intensity, _label in events:
        # Clamp the event window to the series. An event that starts before
        # day 0 keeps its shape instead of wrapping around through negative
        # indexing.
        first = max(start, 0)
        last = min(start + dur, n_days)  # exclusive
        if first >= last:
            continue

        ramp = min(5, dur // 3)
        offset = np.arange(first, last) - start  # days since the event began
        level = np.full(offset.shape, float(intensity))
        if ramp > 0:
            rising = offset < ramp
            falling = offset > dur - ramp
            level[falling] = intensity * (dur - offset[falling]) / ramp
            level[rising] = intensity * offset[rising] / ramp

        # Where events overlap, keep the stronger one. Plain assignment would
        # let a later event's ramp (which starts at 0) erase an ongoing one.
        signal[first:last] = np.maximum(signal[first:last], level)

    # Add noise
    signal += generator.normal(0, noise_std, n_days)
    return np.clip(signal, 0, 1)


# ── Generate Flight Disruptions ───────────────────────────────────────────────

def generate_flight_disruptions() -> pd.DataFrame:
    """
    Build the synthetic airport-level disruption table.

    One row per (airport, 6-hour period) over 548 days starting 2024-01-01.
    With the 15 entries in AIRPORTS: 15 × 4 periods/day × 548 days = 32,880 rows.

    Every metric is driven by the shared daily conflict signal, scaled up for
    airports in conflict regions and down elsewhere, plus random noise drawn
    from the module-level ``rng``. The binary target ``is_high_disruption`` is
    1 when the composite ``disruption_index`` exceeds 50.

    Returns:
        DataFrame with one row per airport and period; prints the row count
        and the positive rate of the target as a side effect.
    """
    start_date = datetime(2024, 1, 1)
    n_days = 548  # days from 2024-01-01, i.e. through 2025-07-01
    periods_per_day = 4  # 00:00, 06:00, 12:00, 18:00
    conflict_regions = {"Middle East", "Eastern Europe", "South Asia"}

    conflict_signal = build_conflict_signal(n_days)

    # Oil price simulation: base + linear trend + 60-day cycle + conflict
    # shocks + noise, clipped to $60–$130.
    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)  # slight upward trend
    oil_cycle = _sin_wave(n_days, 60, amplitude=8)
    oil_shock = np.zeros(n_days)
    for start, dur, intensity, _ in CONFLICT_EVENTS:
        if start < n_days:
            end = min(start + dur, n_days)
            oil_shock[start:end] += intensity * 12  # conflict → oil spike
    oil_price_daily = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    oil_price_daily = np.clip(oil_price_daily, 60, 130)

    records = []
    for day_idx in range(n_days):
        date = start_date + timedelta(days=day_idx)
        conflict = conflict_signal[day_idx]
        oil = oil_price_daily[day_idx]
        # Change versus 7 days earlier; the first week compares against day 0.
        oil_prev7 = oil_price_daily[max(0, day_idx - 7)]
        oil_change_pct = (oil - oil_prev7) / oil_prev7 * 100

        for period in range(periods_per_day):
            hour = period * 6
            ts = date + timedelta(hours=hour)

            for airport_code, airport_name, country, region in AIRPORTS:
                # Regional conflict modifier
                is_conflict_region = region in conflict_regions
                regional_mult = 2.5 if is_conflict_region else 0.8

                # Cancellation rate (0..0.95)
                cancel_base = 0.05 + conflict * regional_mult * 0.35
                cancel_noise = rng.beta(1.5, 8) * 0.15
                cancellation_rate = np.clip(cancel_base + cancel_noise, 0, 0.95)

                # Delay minutes (never negative)
                delay_base = 15 + conflict * regional_mult * 80
                delay_minutes = max(0, rng.normal(delay_base, 10))

                # Airspace risk score (0..4). Exactly one uniform draw is
                # consumed in either branch, keeping the random stream aligned.
                if is_conflict_region:
                    risk_raw = conflict * 4 * regional_mult * 0.7
                    jitter = rng.uniform(-0.3, 0.3)
                    # Clamp on both sides: at low conflict the negative jitter
                    # would otherwise push the score below the documented 0
                    # floor and drag down the composite indices derived from it.
                    airspace_risk_score = min(4.0, max(0.0, risk_raw + jitter))
                else:
                    airspace_risk_score = rng.uniform(0, 0.8)

                # Sentiment score (higher = more negative news)
                sentiment_base = conflict * regional_mult * 80
                sentiment_score = np.clip(
                    rng.normal(sentiment_base, 10), -100, 100
                )

                # Number of conflict events in region
                conflict_event_count = int(
                    rng.poisson(conflict * regional_mult * 5)
                )

                # Fuel pressure indicator (0..100)
                fuel_pressure = (oil_change_pct / 20 + conflict * 0.3) * 50
                fuel_pressure_indicator = np.clip(fuel_pressure, 0, 100)

                # Disruption index (composite, 0..100)
                disruption_index = np.clip(
                    cancellation_rate * 40 + delay_minutes / 200 * 30 +
                    airspace_risk_score / 4 * 20 + conflict * regional_mult * 10,
                    0, 100
                )

                # Airport stress score (0..100)
                airport_stress_score = np.clip(
                    disruption_index * 0.7 + airspace_risk_score * 5 +
                    rng.normal(0, 3), 0, 100
                )

                # Binary target: is_high_disruption
                is_high_disruption = int(disruption_index > 50)

                records.append({
                    "timestamp": ts.isoformat(),
                    "date": date.strftime("%Y-%m-%d"),
                    "hour": hour,
                    "airport_code": airport_code,
                    "airport_name": airport_name,
                    "country": country,
                    "region": region,
                    "conflict_active": int(conflict > 0.3),
                    "conflict_intensity": round(conflict, 4),
                    "conflict_event_count": conflict_event_count,
                    "cancellation_rate": round(cancellation_rate, 4),
                    "avg_delay_minutes": round(delay_minutes, 1),
                    # NOTE: the *_24h columns repeat the current-period values;
                    # no 24-hour aggregation is performed.
                    "cancellation_rate_24h": round(cancellation_rate, 4),
                    "avg_delay_24h": round(delay_minutes, 1),
                    "airspace_risk_score": round(airspace_risk_score, 3),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(rng.normal(0, 5), 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "disruption_index": round(disruption_index, 2),
                    # NOTE: the lag is the CURRENT index scaled by a random
                    # factor in [0.85, 1.0], not the previous period's value.
                    "disruption_index_lag6h": round(disruption_index * rng.uniform(0.85, 1.0), 2),
                    "airport_stress_score": round(airport_stress_score, 2),
                    "is_high_disruption": is_high_disruption,
                })

    df = pd.DataFrame(records)
    print(f"Flight disruptions: {len(df):,} rows | positive rate: {df.is_high_disruption.mean():.2%}")
    return df


# ── Generate Flight Prices ────────────────────────────────────────────────────

def generate_flight_prices() -> pd.DataFrame:
    """
    Build the synthetic route-level fare table.

    One row per (route, week, booking horizon) over 78 weeks starting
    2024-01-01. With the 10 entries in ROUTES and 4 booking horizons:
    10 × 78 × 4 = 3,120 rows.

    The weekly fare is base price + yearly seasonality + oil premium +
    conflict premium + demand noise, clipped to
    [0.7 × price_base, 1.4 × price_max]. A late-booking premium and more
    noise are then added per horizon, with a floor of 80.

    NOTE(review): unlike generate_flight_disruptions() and
    generate_oil_prices(), the oil series here has no conflict shock term and
    is not clipped — confirm whether that difference is intended.

    Returns:
        DataFrame with one row per route, week and days-to-departure value;
        prints the row count and price range as a side effect.
    """
    start_date = datetime(2024, 1, 1)
    n_weeks = 78
    n_days = n_weeks * 7
    booking_horizons = (7, 14, 30, 60)  # days before departure

    conflict_signal = build_conflict_signal(n_days)
    oil_base = 82.0
    oil_price_daily = (
        oil_base
        + np.linspace(0, 10, n_days)
        + _sin_wave(n_days, 60, 8)
        + rng.normal(0, 1.5, n_days)
    )

    # 52-week seasonal swing, computed once. It is deterministic and uses no
    # random draws, so it does not need rebuilding for every route and week.
    seasonal_by_week = _sin_wave(n_weeks, 52, 40)

    records = []
    for week_idx in range(n_weeks):
        day_idx = week_idx * 7
        date = start_date + timedelta(days=day_idx)
        conflict = conflict_signal[day_idx]
        oil = oil_price_daily[day_idx]
        # Change versus 14 days earlier; early weeks compare against day 0.
        oil_prev = oil_price_daily[max(0, day_idx - 14)]
        oil_change_pct = (oil - oil_prev) / oil_prev * 100

        # Sentiment is drawn once per week and shared by all routes.
        sentiment_score = conflict * 70 + rng.normal(0, 8)
        sentiment_momentum = rng.normal(0, 5)
        seasonal = seasonal_by_week[week_idx]

        for origin, dest, region_type, price_base, price_max in ROUTES:
            is_conflict_route = "ME" in region_type
            route_conflict_flag = int(is_conflict_route and conflict > 0.4)

            # Price model: base + oil spike + conflict premium + seasonality
            oil_premium = oil_change_pct * (2.5 if is_conflict_route else 1.2)
            conflict_premium = conflict * (120 if is_conflict_route else 40)
            demand_shock = rng.normal(0, 25)

            price = (price_base + seasonal + oil_premium + conflict_premium +
                     demand_shock)
            price = np.clip(price, price_base * 0.7, price_max * 1.4)

            disruption_index = conflict * (2.5 if is_conflict_route else 0.8) * 50
            disruption_index = np.clip(disruption_index + rng.normal(0, 5), 0, 100)

            fuel_pressure_indicator = np.clip(
                (oil_change_pct / 20 + conflict * 0.3) * 50 + rng.normal(0, 3), 0, 100
            )

            for days_to_dep in booking_horizons:
                # Only bookings made less than 30 days out pay a premium.
                booking_premium = max(0, (30 - days_to_dep) * 2.5)
                final_price = price + booking_premium + rng.normal(0, 15)
                final_price = max(80, final_price)

                records.append({
                    "timestamp": date.isoformat(),
                    "week": date.strftime("%Y-W%U"),
                    "origin": origin,
                    "destination": dest,
                    "route": f"{origin}-{dest}",
                    "region_type": region_type,
                    "route_conflict_flag": route_conflict_flag,
                    "days_to_departure": days_to_dep,
                    "day_of_week": date.weekday(),
                    "price_usd": round(final_price, 2),
                    "oil_price": round(oil, 2),
                    "oil_price_change_pct": round(oil_change_pct, 3),
                    "disruption_index": round(disruption_index, 2),
                    "cancellation_rate_24h": round(conflict * 0.3 + rng.uniform(0, 0.1), 4),
                    "sentiment_score": round(sentiment_score, 2),
                    "sentiment_momentum": round(sentiment_momentum, 2),
                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
                    "conflict_intensity": round(conflict, 4),
                })

    df = pd.DataFrame(records)
    print(f"Flight prices: {len(df):,} rows | price range: ${df.price_usd.min():.0f}–${df.price_usd.max():.0f}")
    return df


# ── Generate Oil Prices ───────────────────────────────────────────────────────

def generate_oil_prices() -> pd.DataFrame:
    """Build the synthetic daily oil price table (548 days from 2024-01-01).

    The Brent series is base + linear trend + 60-day cycle + conflict shocks
    + Gaussian noise, clipped to [60, 130]. WTI is Brent minus a random
    2–5 discount. Rolling averages are trailing windows that include the
    current day and shrink at the start of the series.

    Returns:
        DataFrame with one row per day; prints the row count as a side effect.
    """
    n_days = 548
    start = datetime(2024, 1, 1)
    conflict_signal = build_conflict_signal(n_days)

    oil_base = 82.0
    oil_trend = np.linspace(0, 10, n_days)
    oil_cycle = _sin_wave(n_days, 60, 8)
    # Each active event lifts the price by 12 × intensity for its duration;
    # overlapping events add up.
    oil_shock = np.zeros(n_days)
    for event_start, dur, intensity, _ in CONFLICT_EVENTS:
        if event_start < n_days:
            oil_shock[event_start:min(event_start + dur, n_days)] += intensity * 12
    prices = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
    prices = np.clip(prices, 60, 130)

    records = []
    for i, p in enumerate(prices):
        date = start + timedelta(days=i)
        prev = prices[max(0, i - 1)]  # day 0 compares with itself → 0 % change
        pct = (p - prev) / prev * 100
        # An N-day trailing window is days i-(N-1)..i. Slicing from i-N would
        # average N+1 values and mislabel the column.
        window_7d = prices[max(0, i - 6):i + 1]
        window_30d = prices[max(0, i - 29):i + 1]
        records.append({
            "date": date.strftime("%Y-%m-%d"),
            "brent_usd": round(p, 2),
            "wti_usd": round(p - rng.uniform(2, 5), 2),
            "pct_change": round(pct, 3),
            "rolling_7d_avg": round(np.mean(window_7d), 2),
            "rolling_30d_avg": round(np.mean(window_30d), 2),
            "conflict_intensity": round(conflict_signal[i], 4),
        })

    df = pd.DataFrame(records)
    print(f"Oil prices: {len(df):,} days")
    return df


# ── Generate Airspace Risk ────────────────────────────────────────────────────

def generate_airspace_risk() -> pd.DataFrame:
    """Build weekly synthetic airspace-risk snapshots per country.

    For each of 78 weeks starting 2024-01-01, every conflict-affected country
    gets a risk level derived from that week's conflict intensity plus
    uniform jitter, and every safe country gets a fixed "No Advisory" row.
    With 10 + 5 countries this yields 78 × 15 = 1,170 rows.

    Returns:
        DataFrame with one row per country and week; prints the row count as
        a side effect.
    """
    # Index in this list is the numeric risk_score (0 = lowest, 4 = highest).
    risk_levels = ["No Advisory", "Exercise Caution", "Increased Caution",
                   "Avoid if Possible", "Do Not Fly"]

    conflict_countries = [
        ("IR", "Iran", "Middle East"),
        ("IQ", "Iraq", "Middle East"),
        ("IL", "Israel", "Middle East"),
        ("YE", "Yemen", "Middle East"),
        ("SY", "Syria", "Middle East"),
        ("UA", "Ukraine", "Eastern Europe"),
        ("RU", "Russia", "Eastern Europe"),
        ("PK", "Pakistan", "South Asia"),
        ("ET", "Ethiopia", "Africa"),
        ("LY", "Libya", "Africa"),
    ]

    safe_countries = [
        ("DE", "Germany", "Western Europe"),
        ("FR", "France", "Western Europe"),
        ("GB", "United Kingdom", "Western Europe"),
        ("US", "United States", "North America"),
        ("AU", "Australia", "Asia-Pacific"),
    ]

    start = datetime(2024, 1, 1)
    n_weeks = 78
    conflict_signal = build_conflict_signal(n_weeks * 7)

    records = []
    for week_idx in range(n_weeks):
        day = start + timedelta(weeks=week_idx)
        # The snapshot uses the conflict level on the first day of the week.
        conflict = conflict_signal[week_idx * 7]

        for code, name, region in conflict_countries:
            raw_risk = conflict * 4 * 1.2 + rng.uniform(-0.5, 0.5)
            # Clamp to 0..4, then truncate to get the level index.
            risk_idx = int(min(4.0, max(0.0, raw_risk)))
            risk_level = risk_levels[risk_idx]
            records.append({
                "timestamp": day.isoformat(),
                "country_code": code,
                "country_name": name,
                "region": region,
                "risk_level": risk_level,
                "risk_score": risk_idx,
                "description": f"{risk_level}: based on current conflict activity",
                "source": "SafeAirspace",
                "is_conflict_affected": 1,
            })

        for code, name, region in safe_countries:
            records.append({
                "timestamp": day.isoformat(),
                "country_code": code,
                "country_name": name,
                "region": region,
                "risk_level": "No Advisory",
                "risk_score": 0,
                "description": "No active advisories",
                "source": "SafeAirspace",
                "is_conflict_affected": 0,
            })

    df = pd.DataFrame(records)
    print(f"Airspace risk: {len(df):,} rows")
    return df


# ── Generate Sentiment (GDELT-style) ─────────────────────────────────────────

def generate_sentiment() -> pd.DataFrame:
    """Build simulated GDELT-style news sentiment per region, every 6 hours.

    Covers 548 days from 2024-01-01 for four regions (2,192 timestamps × 4
    regions = 8,768 rows). The tone goes more negative as conflict rises, and
    ``sentiment_score`` is the tone negated and scaled by 10, so higher means
    more conflict news. ``sentiment_momentum`` is the change in score from
    the previous timestamp within the same region (0 for the first row).
    """
    total_days = 548
    origin = datetime(2024, 1, 1)
    daily_conflict = build_conflict_signal(total_days)

    # Per-region amplification of the conflict signal. Iteration follows
    # insertion order, which fixes the order of random draws.
    weight_by_region = {
        "Middle East": 1.5,
        "Eastern Europe": 1.2,
        "South Asia": 1.0,
        "Global": 0.7,
    }

    rows = []
    for day_offset in range(total_days):
        level = daily_conflict[day_offset]
        for hour in (0, 6, 12, 18):
            stamp = origin + timedelta(days=day_offset, hours=hour)
            for region_name, weight in weight_by_region.items():
                # GDELT tone: 0 = neutral, negative = bad/conflict news
                tone = -level * weight * 5 + rng.normal(0, 0.8)
                n_articles = int(rng.poisson(20 + level * weight * 40))
                if n_articles < 1:
                    n_articles = 1
                rows.append({
                    "timestamp": stamp.isoformat(),
                    "region": region_name,
                    "tone_avg": round(tone, 3),
                    "article_count": n_articles,
                    "sentiment_score": round(-tone * 10, 2),  # positive = more conflict news
                    "conflict_intensity": round(level, 4),
                })

    df = pd.DataFrame(rows)
    # Order chronologically within each region so diff() compares neighbours.
    df = df.sort_values(["region", "timestamp"]).reset_index(drop=True)
    per_region_score = df.groupby("region")["sentiment_score"]
    df["sentiment_momentum"] = per_region_score.diff().fillna(0)
    print(f"Sentiment: {len(df):,} rows")
    return df


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Run every generator in turn and write each result to PROCESSED_DIR as CSV."""
    banner = "=" * 60
    print(banner)
    print("Generating synthetic historical base datasets...")
    print(banner)

    # (output file name, generator) pairs, executed in this order. The order
    # matters for reproducibility because the generators share one RNG.
    jobs = (
        ("flight_disruptions.csv", generate_flight_disruptions),
        ("flight_prices.csv", generate_flight_prices),
        ("oil_prices.csv", generate_oil_prices),
        ("airspace_risk.csv", generate_airspace_risk),
        ("sentiment.csv", generate_sentiment),
    )

    for csv_name, build in jobs:
        print(f"\n→ {csv_name}")
        frame = build()
        destination = PROCESSED_DIR / csv_name
        frame.to_csv(destination, index=False)
        print(f"  Saved: {destination}")

    print("\n✓ All base datasets generated successfully.")
    print(f"  Location: {PROCESSED_DIR}")


if __name__ == "__main__":
    # ── Bootstrap guard ───────────────────────────────────────────────────────
    # The data produced here is SYNTHETIC and meant for initial development
    # and testing only; the real pipeline reads from data/base/ (real Kaggle
    # CSVs). Generation therefore requires an explicit opt-in so that it is
    # never run by accident in place of real data.
    #
    # Usage:  python -m src.utils.generate_base_data --bootstrap
    #
    if "--bootstrap" in sys.argv:
        main()
    else:
        refusal_lines = (
            "ERROR: Refusing to run without --bootstrap flag.",
            "       This script generates synthetic data and should NOT be",
            "       used as the default training source.",
            "",
            "       Run:  python -m src.utils.generate_base_data --bootstrap",
            "       to explicitly opt in to synthetic data generation.",
        )
        for text in refusal_lines:
            print(text)
        sys.exit(1)