import os
import random
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# First timestamp of the simulated window (start before May 2026 to have a
# historical baseline).
_START_DATE = datetime(2026, 4, 1)

# Karnataka 2026 holidays (partial, for the training data window).
_HOLIDAY_DATES_2026 = frozenset({
    datetime(2026, 4, 3).date(),   # Good Friday
    datetime(2026, 4, 14).date(),  # Ambedkar Jayanthi
    datetime(2026, 4, 20).date(),  # Basava Jayanthi
    datetime(2026, 5, 1).date(),   # May Day
    datetime(2026, 5, 28).date(),  # Bakrid
})

# Column order of the generated CSV. Passed explicitly so that an empty run
# (zero days or zero meters) still writes a header row.
_OUTPUT_COLUMNS = [
    "timestamp",
    "meter_id",
    "consumer_type",
    "reported_kw",
    "v_r",
    "v_y",
    "v_b",
    "i_r",
    "i_y",
    "i_b",
    "power_factor",
    "is_holiday",
    "is_weekend",
    "peer_dev_pct",
    "is_anomaly",
    "tamper_type",
    "temperature_c",
    "solar_ghi",
]


def _build_meter_profiles(num_meters, py_rng):
    """Return one static profile dict per simulated meter.

    ``py_rng`` is anything exposing ``random()`` and ``choice()`` (the
    ``random`` module itself or a ``random.Random`` instance).
    """
    profiles = []
    for index in range(num_meters):
        is_commercial = index % 10 >= 7  # 30% commercial
        is_thief = py_rng.random() < 0.05  # 5% thieves

        tamper_type = "Normal"
        if is_thief:
            if is_commercial:
                tamper_type = py_rng.choice(
                    ["Phase Bypass", "ESD Attack", "Magnetic Interference"]
                )
            else:
                tamper_type = py_rng.choice(
                    ["Subsidized Abuse", "PF Mismatch", "Phase Bypass"]
                )

        # Only honest meters can carry a benign "false positive" pattern.
        fp_type = "None"
        if not is_thief and py_rng.random() < 0.05:
            fp_type = py_rng.choice(["Solar Mask", "Vacation Filter"])

        profiles.append({
            "meter_id": f"MTR-{index:04d}",
            "type": "Commercial" if is_commercial else "Domestic",
            "tamper_type": tamper_type,
            "fp_type": fp_type,
            "baseline_kw": 2.5 if is_commercial else 0.8,
        })
    return profiles


def _expected_load_kw(consumer_type, hour_of_day, is_holiday):
    """Return the noise-free load (kW) of a consumer class for one hour."""
    if consumer_type == "Domestic":
        # Two Gaussian bumps centred on 08:00 and 20:00.
        load = (
            0.5
            + 1.0 * np.exp(-0.5 * ((hour_of_day - 8) / 2) ** 2)
            + 2.5 * np.exp(-0.5 * ((hour_of_day - 20) / 2) ** 2)
        )
        holiday_factor = 1.2
    else:
        # Single Gaussian bump centred on 14:00.
        load = 2.0 + 8.5 * np.exp(-0.5 * ((hour_of_day - 14) / 4) ** 2)
        holiday_factor = 0.3
    return load * holiday_factor if is_holiday else load


def generate_training_data(days=30, num_meters=900,
                           output_path="training_data.csv", *, seed=None):
    """Generate synthetic hourly smart-meter readings and write them as CSV.

    One row is produced per meter per hour, starting at 2026-04-01 00:00.
    Around 5% of meters are given a tamper type and around 5% of the
    remaining meters a benign "false positive" pattern (solar mask or
    vacation).

    ``peer_dev_pct`` is the percentage deviation of the reported load from
    the noise-free expected load of the meter's consumer class for that hour
    (holiday scaling included). That reference is strictly positive, so the
    feature is defined for every hour of the day.

    Args:
        days: Number of days to simulate (24 rows per meter per day).
        num_meters: Number of meters to simulate.
        output_path: Destination handed to ``DataFrame.to_csv``.
        seed: Optional integer seed. When given, private ``random.Random``
            and ``numpy.random.RandomState`` generators are used, so the
            output is reproducible and the global RNG state is untouched.
            When ``None``, the global ``random`` / ``numpy.random`` streams
            are used.

    Returns:
        None. The data set is written to ``output_path``.
    """
    print(f"Generating {days} days of high-fidelity historical data for {num_meters} meters...")

    # Both objects expose the same call names as the modules they stand in for.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    hours = days * 24
    meters = _build_meter_profiles(num_meters, py_rng)

    records = []
    for hour_index in range(hours):
        current_time = _START_DATE + timedelta(hours=hour_index)
        hour_of_day = current_time.hour
        is_night = hour_of_day >= 23 or hour_of_day <= 6
        is_day = 10 <= hour_of_day <= 16
        is_holiday = current_time.date() in _HOLIDAY_DATES_2026

        # Per-hour values shared by every meter.
        timestamp = current_time.strftime("%Y-%m-%d %H:%M:%S")
        holiday_flag = 1 if is_holiday else 0
        is_weekend = 1 if current_time.weekday() >= 5 else 0
        expected_kw = {
            "Domestic": _expected_load_kw("Domestic", hour_of_day, is_holiday),
            "Commercial": _expected_load_kw("Commercial", hour_of_day, is_holiday),
        }

        # Simulate weather: the temperature sinusoid peaks at 14:00.
        base_temp = 25.0 + 8.0 * np.sin(np.pi * (hour_of_day - 8) / 12)
        if is_night:
            base_temp = 22.0 + np_rng.normal(0, 1.0)
        temp_c = round(max(20.0, base_temp + np_rng.normal(0, 2.0)), 1)

        # Solar radiation is non-zero from 06:00 to 18:00 and peaks at 12:00.
        solar_ghi = 0.0
        if 6 <= hour_of_day <= 18:
            solar_ghi = round(
                800.0 * np.sin(np.pi * (hour_of_day - 6) / 12)
                + np_rng.normal(0, 50.0),
                1,
            )
            solar_ghi = max(0.0, solar_ghi)

        for meter in meters:
            c_type = meter["type"]
            tamper = meter["tamper_type"]
            fp = meter["fp_type"]

            base_kw = expected_kw[c_type]
            actual_kw = max(0.1, base_kw + np_rng.normal(0, base_kw * 0.1))
            reported_kw = actual_kw

            pf = np_rng.uniform(0.92, 0.99)
            v_avg = np_rng.normal(230, 2.0)
            v_r = v_avg + np_rng.normal(0, 0.5)
            v_y = v_avg + np_rng.normal(0, 0.5)
            v_b = v_avg + np_rng.normal(0, 0.5)

            # Currents are derived from the untampered load; reported_kw is
            # at least 0.1 here, so the divisor needs no zero guard.
            total_current = (reported_kw * 1000) / (v_avg * pf * 3)
            i_r = total_current + np_rng.normal(0, 0.1)
            i_y = total_current + np_rng.normal(0, 0.1)
            i_b = total_current + np_rng.normal(0, 0.1)

            # A tampering meter shows its signature in about 80% of hours.
            is_tampering_now = (tamper != "Normal") and (py_rng.random() < 0.8)
            if is_tampering_now:
                if tamper == "Phase Bypass":
                    reported_kw *= 0.4
                    i_b = 0.0  # Physical bypass signature
                elif tamper == "PF Mismatch":
                    pf = np_rng.uniform(0.4, 0.6)
                elif tamper == "ESD Attack":
                    v_r = v_y = v_b = 0.0
                    reported_kw = 0.0
                elif tamper == "Magnetic Interference":
                    reported_kw *= 0.7
                    pf = np_rng.uniform(0.65, 0.75)
                elif tamper == "Subsidized Abuse":
                    if is_night:
                        reported_kw += 12.0

            # Benign patterns that look suspicious (false positives).
            if fp == "Solar Mask" and is_day:
                reported_kw = max(0.05, reported_kw - 3.0)
            elif fp == "Vacation Filter":
                reported_kw = 0.06 + np_rng.normal(0, 0.01)

            # Engineered feature: deviation from the class's expected load.
            peer_dev = (reported_kw - base_kw) / base_kw

            records.append({
                "timestamp": timestamp,
                "meter_id": meter["meter_id"],
                "consumer_type": c_type,
                "reported_kw": round(reported_kw, 3),
                "v_r": round(v_r, 1),
                "v_y": round(v_y, 1),
                "v_b": round(v_b, 1),
                "i_r": round(i_r, 2),
                "i_y": round(i_y, 2),
                "i_b": round(i_b, 2),
                "power_factor": round(pf, 2),
                "is_holiday": holiday_flag,
                "is_weekend": is_weekend,
                "peer_dev_pct": round(peer_dev * 100, 2),
                # Meter-level label: set for every hour of a tampering meter.
                "is_anomaly": 1 if tamper != "Normal" else 0,
                "tamper_type": tamper,
                "temperature_c": temp_c,
                "solar_ghi": solar_ghi,
            })

    df = pd.DataFrame(records, columns=_OUTPUT_COLUMNS)
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} records with enhanced indicators to {output_path}")


if __name__ == "__main__":
    generate_training_data()