"""
Bioweather Production Data Generator v2.0
EmpedocLabs © 2025

Generates clinically-plausible weather → headache risk data with:
- 15 distinct biometeo conditions
- Seasonal/geographic variation
- Multi-trigger overlap scoring
- Graded risk (not just if/else buckets)
- 20,000+ samples for robust training
"""

import numpy as np
import pandas as pd

# Seasons are sampled with these (uniform) weights.
_SEASONS = ["winter", "spring", "summer", "autumn"]
_SEASON_WEIGHTS = [0.25, 0.25, 0.25, 0.25]

# season -> ((temp mean, sd), (humidity mean, sd), (uv low, uv high), (wind mean, sd))
# The UV bounds are passed to ``Generator.integers``, whose upper bound is
# exclusive: e.g. summer UV is drawn from 5..10, never 11.
_SEASON_PROFILES = {
    "winter": ((-2, 8), (70, 15), (0, 4), (15, 12)),
    "spring": ((14, 7), (55, 18), (2, 8), (18, 10)),
    "summer": ((28, 6), (55, 20), (5, 11), (12, 8)),
    "autumn": ((12, 8), (65, 15), (1, 6), (16, 10)),
}

# Risk every sample starts from before any trigger contributes.
_BASELINE_RISK = 5.0

# Label used when no trigger fired ("clear skies").
_CLEAR_SKIES_LABEL = 0

_COLUMNS = [
    "temp_c",
    "pressure_hpa",
    "humidity",
    "wind_kph",
    "uv_index",
    "pressure_drop",
    "temp_change",
    "risk_score",
    "advice_label",
]

_OUTPUT_CSV = "smart_weather_data.csv"


def _sample_weather(rng: np.random.Generator) -> tuple:
    """Draw one seasonally coherent weather observation from *rng*.

    Returns:
        ``(temp, pressure, humidity, wind, uv, p_drop, t_change)`` where every
        value has already been clipped to its allowed range. ``uv`` is a
        Python ``int``; the other values are floats.

    The order of the random draws is significant: changing it changes the
    data produced for a given seed.
    """
    season = rng.choice(_SEASONS, p=_SEASON_WEIGHTS)
    (t_mu, t_sd), (h_mu, h_sd), (uv_lo, uv_hi), (w_mu, w_sd) = _SEASON_PROFILES[
        str(season)
    ]

    # ── Base weather with seasonal coherence ─────────────────────
    temp = rng.normal(t_mu, t_sd)
    humidity = rng.normal(h_mu, h_sd)
    uv = rng.integers(uv_lo, uv_hi)
    wind = abs(rng.normal(w_mu, w_sd))

    temp = np.clip(temp, -15, 45)
    humidity = np.clip(humidity, 8, 99)
    uv = int(np.clip(uv, 0, 11))
    wind = np.clip(wind, 0, 70)

    pressure = np.clip(rng.normal(1013, 12), 970, 1050)

    # Pressure change: occasional fronts.
    # NOTE(review): the ridge branch draws a *second* uniform number, so it is
    # taken with probability 0.90 * 0.08 = 7.2 %, not 8 %. Kept as-is so that
    # seeded output does not change; confirm whether this is intended.
    if rng.random() < 0.10:
        p_drop = rng.normal(-8, 3)  # cold front
    elif rng.random() < 0.08:
        p_drop = rng.normal(7, 2.5)  # high pressure ridge
    else:
        p_drop = rng.normal(0, 2.5)
    p_drop = np.clip(p_drop, -15, 15)

    # Temp change: some days have big swings (random sign, large magnitude).
    if rng.random() < 0.07:
        t_change = rng.choice([-1, 1]) * abs(rng.normal(10, 3))
    else:
        t_change = rng.normal(0, 3)
    t_change = np.clip(t_change, -15, 15)

    return temp, pressure, humidity, wind, uv, p_drop, t_change


def _score_conditions(temp, humidity, wind, uv, p_drop, t_change) -> dict:
    """Return ``{condition_id: risk contribution}`` for every trigger that fires.

    Triggers are independent groups, so several can fire for one observation;
    within a group only the first matching tier contributes. Entries are
    inserted in evaluation order, which callers rely on both for summing and
    for tie-breaking when picking the primary condition.

    Condition ids written here: 1/10 pressure drop (severe / milder tiers),
    2/11 pressure rise (strong / milder tiers), 3 sauna effect, 4/12 wind
    (strong / moderate), 5 UV glare, 6 bitter cold, 7 drastic temp drop,
    8 heat shock, 9 heavy dampness, 13 dry air, 14 stagnant & gloomy.
    """
    scores = {}

    # 1. Pressure drop (strongest weather trigger per literature)
    if p_drop <= -8:
        scores[1] = 35 + abs(p_drop) * 1.5
    elif p_drop <= -4:
        scores[10] = 15 + abs(p_drop) * 1.2
    elif p_drop <= -2:
        scores[10] = 8 + abs(p_drop) * 0.8

    # 2. Pressure rise
    if p_drop >= 8:
        scores[2] = 25 + p_drop * 1.0
    elif p_drop >= 4:
        scores[11] = 12 + p_drop * 0.7
    elif p_drop >= 2:
        scores[11] = 6 + p_drop * 0.5

    # 3. Sauna effect (heat + humidity)
    if temp >= 28 and humidity >= 65:
        scores[3] = (temp - 28) * 2 + (humidity - 65) * 0.5

    # 4. Wind
    if wind >= 40:
        scores[4] = 25 + (wind - 40) * 0.8
    elif wind >= 20:
        scores[12] = 10 + (wind - 20) * 0.3

    # 5. UV glare (the milder tier only counts on days warmer than 15 °C)
    if uv >= 8:
        scores[5] = 20 + (uv - 8) * 3
    elif uv >= 6 and temp > 15:
        scores[5] = 8 + (uv - 6) * 2

    # 6. Bitter cold
    if temp <= -5:
        scores[6] = 25 + abs(temp + 5) * 2
    elif temp <= 2:
        scores[6] = 10 + abs(temp - 2) * 1.5

    # 7. Drastic temp drop
    if t_change <= -8:
        scores[7] = 30 + abs(t_change) * 1.5
    elif t_change <= -5:
        scores[7] = 12 + abs(t_change) * 0.8

    # 8. Heat shock
    if t_change >= 8:
        scores[8] = 28 + t_change * 1.2
    elif t_change >= 5:
        scores[8] = 10 + t_change * 0.7

    # 9. Heavy dampness
    if humidity >= 88 and wind <= 12:
        scores[9] = 15 + (humidity - 88) * 0.8

    # 13. Dry air
    if humidity <= 25:
        scores[13] = 18 + (25 - humidity) * 0.8
    elif humidity <= 32:
        scores[13] = 8 + (32 - humidity) * 0.5

    # 14. Stagnant & gloomy
    if uv <= 2 and humidity >= 72 and wind <= 10 and temp < 18:
        scores[14] = 10 + (humidity - 72) * 0.3

    return scores


def _print_summary(df: pd.DataFrame) -> None:
    """Print the sample count, risk mean/std and per-label counts of *df*."""
    print(f"✅ Generated {len(df):,} samples")
    print(f" Risk: mean={df['risk_score'].mean():.1f}, std={df['risk_score'].std():.1f}")
    print(f" Conditions: {df['advice_label'].value_counts().sort_index().to_dict()}")


def generate_production_data(n: int = 25000, seed: int = 42) -> pd.DataFrame:
    """Generate *n* synthetic weather observations with a headache-risk score.

    Each row is sampled independently: a season is picked, weather values are
    drawn around that season's profile, every matching trigger adds to a
    baseline risk, Gaussian noise is added, and the result is rounded and
    clipped to 0..100.

    Args:
        n: Number of rows to generate.
        seed: Seed for ``numpy.random.default_rng``; the same ``n`` and
            ``seed`` produce the same frame.

    Returns:
        A DataFrame with columns ``temp_c``, ``pressure_hpa``, ``humidity``,
        ``wind_kph``, ``uv_index``, ``pressure_drop``, ``temp_change``,
        ``risk_score`` (int, 0..100) and ``advice_label`` (id of the trigger
        with the largest contribution, or 0 when none fired).

    Side effects:
        Prints a short summary of the generated data to stdout.
    """
    rng = np.random.default_rng(seed)
    rows = []

    for _ in range(n):
        temp, pressure, humidity, wind, uv, p_drop, t_change = _sample_weather(rng)

        # ── Additive risk scoring (multiple triggers stack) ──────────
        scores = _score_conditions(temp, humidity, wind, uv, p_drop, t_change)

        # Accumulate one term at a time, in insertion order, rather than with
        # sum(): that keeps the floating-point result independent of how the
        # interpreter implements float summation.
        risk = _BASELINE_RISK
        for contribution in scores.values():
            risk += contribution

        # ── Determine primary condition ──────────────────────────────
        # max() returns the first key on ties, i.e. the earliest trigger.
        label = max(scores, key=scores.get) if scores else _CLEAR_SKIES_LABEL

        # ── Add realistic noise ──────────────────────────────────────
        risk += rng.normal(0, 2.5)
        risk = int(np.clip(round(risk), 0, 100))

        rows.append([
            round(temp, 1),
            round(pressure, 1),
            round(humidity, 1),
            round(wind, 1),
            uv,
            round(p_drop, 2),
            round(t_change, 2),
            risk,
            label,
        ])

    df = pd.DataFrame(rows, columns=_COLUMNS)
    _print_summary(df)
    return df


if __name__ == "__main__":
    df = generate_production_data()
    df.to_csv(_OUTPUT_CSV, index=False)
    print(f"💾 Saved → {_OUTPUT_CSV}")