# vidyth-raksha / backend / ml_engine / generate_training_data.py
# Darshan03's picture
# Deployment Sync: Vidyut Rakshak Platform
# f88c990
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
def _build_meter_profiles(num_meters):
    """Return one static profile dict per simulated meter.

    Each profile carries the meter id, consumer type, the tamper behaviour
    assigned to the meter ("Normal" when it is honest), the benign
    false-positive pattern ("None" when it has none) and a baseline load.
    Draws come from the global ``random`` generator.
    """
    commercial_tampers = ["Phase Bypass", "ESD Attack", "Magnetic Interference"]
    domestic_tampers = ["Subsidized Abuse", "PF Mismatch", "Phase Bypass"]

    meters = []
    for i in range(num_meters):
        is_commercial = i % 10 >= 7  # indices ending in 7, 8, 9 -> 30% commercial
        is_thief = random.random() < 0.05  # 5% thieves
        tamper_type = "Normal"
        if is_thief:
            tamper_type = random.choice(
                commercial_tampers if is_commercial else domestic_tampers
            )

        # Only honest meters can carry a benign "looks suspicious" pattern.
        is_fp = not is_thief and random.random() < 0.05
        fp_type = random.choice(["Solar Mask", "Vacation Filter"]) if is_fp else "None"

        meters.append({
            "meter_id": f"MTR-{i:04d}",
            "type": "Commercial" if is_commercial else "Domestic",
            "tamper_type": tamper_type,
            "fp_type": fp_type,
            "baseline_kw": 2.5 if is_commercial else 0.8,
        })
    return meters


def generate_training_data(days=30, num_meters=900, output_path="training_data.csv", seed=None):
    """Generate synthetic hourly smart-meter readings and write them to a CSV.

    One record is produced per meter per hour, starting at 2026-04-01 00:00.
    Each record holds the reported load, per-phase voltages and currents,
    power factor, calendar flags, a peer-deviation percentage, the meter's
    tamper label and simulated weather for that hour.

    Args:
        days: Number of days to simulate (24 records per meter per day).
        num_meters: Number of meters to simulate.
        output_path: Destination CSV path.
        seed: Optional integer used to seed both ``random`` and ``np.random``
            so a run can be reproduced. Must be acceptable to
            ``np.random.seed`` (0 <= seed < 2**32). When ``None`` the global
            generators are left untouched.

    Returns:
        None. The data is written to ``output_path``; a header row is written
        even when no records are generated.
    """
    if seed is not None:
        # The simulation mixes both generators, so both must be seeded.
        random.seed(seed)
        np.random.seed(seed)

    print(f"Generating {days} days of high-fidelity historical data for {num_meters} meters...")

    start_date = datetime(2026, 4, 1)  # Start before May 2026 to have historical baseline
    hours = days * 24

    # Karnataka 2026 Holidays (Partial for training data window)
    holidays_2026 = [
        datetime(2026, 4, 3),   # Good Friday
        datetime(2026, 4, 14),  # Ambedkar Jayanthi
        datetime(2026, 4, 20),  # Basava Jayanthi
        datetime(2026, 5, 1),   # May Day
        datetime(2026, 5, 28),  # Bakrid
    ]
    # Set of dates gives an O(1) membership test inside the hour loop.
    holiday_dates = {holiday.date() for holiday in holidays_2026}

    meters = _build_meter_profiles(num_meters)

    # Explicit schema so the CSV keeps its header even with zero records.
    columns = [
        "timestamp", "meter_id", "consumer_type", "reported_kw",
        "v_r", "v_y", "v_b", "i_r", "i_y", "i_b",
        "power_factor", "is_holiday", "is_weekend", "peer_dev_pct",
        "is_anomaly", "tamper_type", "temperature_c", "solar_ghi",
    ]

    records = []
    for hour_index in range(hours):
        current_time = start_date + timedelta(hours=hour_index)
        hour_of_day = current_time.hour
        is_night = hour_of_day >= 23 or hour_of_day <= 6
        is_day = 10 <= hour_of_day <= 16
        is_holiday = current_time.date() in holiday_dates

        # Per-hour values shared by every meter.
        timestamp = current_time.strftime("%Y-%m-%d %H:%M:%S")
        holiday_flag = 1 if is_holiday else 0
        is_weekend = 1 if current_time.weekday() >= 5 else 0

        # Cluster averages used as the reference for peer deviation.
        # NOTE(review): the domestic reference is <= 0 for hours 15-21, so
        # peer_dev_pct falls back to 0 for domestic meters in that window.
        domestic_avg = 0.8 + 1.2 * np.sin(np.pi * hour_of_day / 12)
        commercial_avg = 3.5 + 2.0 * np.sin(np.pi * (hour_of_day - 6) / 12)

        # Simulate weather: temperature peaks around 14:00 (2 PM).
        base_temp = 25.0 + 8.0 * np.sin(np.pi * (hour_of_day - 8) / 12)
        if is_night:
            base_temp = 22.0 + np.random.normal(0, 1.0)
        temp_c = round(max(20.0, base_temp + np.random.normal(0, 2.0)), 1)

        # Solar radiation peaks at 12:00 and is zero outside 06:00-18:00.
        solar_ghi = 0.0
        if 6 <= hour_of_day <= 18:
            solar_ghi = round(
                800.0 * np.sin(np.pi * (hour_of_day - 6) / 12) + np.random.normal(0, 50.0),
                1,
            )
            solar_ghi = max(0.0, solar_ghi)

        # Deterministic load shapes depend only on the hour and consumer type,
        # so they are computed once per hour instead of once per meter.
        domestic_base = (
            0.5
            + 1.0 * np.exp(-0.5 * ((hour_of_day - 8) / 2) ** 2)
            + 2.5 * np.exp(-0.5 * ((hour_of_day - 20) / 2) ** 2)
        )
        commercial_base = 2.0 + 8.5 * np.exp(-0.5 * ((hour_of_day - 14) / 4) ** 2)
        if is_holiday:
            # Holiday adjustment: commercial load drops, domestic load rises.
            commercial_base *= 0.3
            domestic_base *= 1.2

        for m in meters:
            c_type = m["type"]
            tamper = m["tamper_type"]
            fp = m["fp_type"]

            if c_type == "Domestic":
                base_kw = domestic_base
                cluster_ref = domestic_avg
            else:
                base_kw = commercial_base
                cluster_ref = commercial_avg

            # 10% multiplicative noise, floored at 0.1 kW.
            actual_kw = max(0.1, base_kw + np.random.normal(0, base_kw * 0.1))
            reported_kw = actual_kw

            pf = np.random.uniform(0.92, 0.99)
            v_avg = np.random.normal(230, 2.0)
            v_r = v_avg + np.random.normal(0, 0.5)
            v_y = v_avg + np.random.normal(0, 0.5)
            v_b = v_avg + np.random.normal(0, 0.5)

            # Currents are derived from the untampered load; reported_kw is
            # at least 0.1 here, so no zero-load guard is required.
            total_current = (reported_kw * 1000) / (v_avg * pf * 3)
            i_r = total_current + np.random.normal(0, 0.1)
            i_y = total_current + np.random.normal(0, 0.1)
            i_b = total_current + np.random.normal(0, 0.1)

            # Indicators: a tampering meter is active in ~80% of its hours.
            is_tampering_now = (tamper != "Normal") and (random.random() < 0.8)
            if is_tampering_now:
                if tamper == "Phase Bypass":
                    reported_kw *= 0.4
                    i_b = 0.0  # Physical bypass signature
                elif tamper == "PF Mismatch":
                    pf = np.random.uniform(0.4, 0.6)
                elif tamper == "ESD Attack":
                    v_r = v_y = v_b = 0.0
                    reported_kw = 0.0
                elif tamper == "Magnetic Interference":
                    reported_kw *= 0.7
                    pf = np.random.uniform(0.65, 0.75)
                elif tamper == "Subsidized Abuse":
                    if is_night:
                        reported_kw += 12.0

            # False positives: benign patterns that resemble theft.
            if fp == "Solar Mask" and is_day:
                reported_kw = max(0.05, reported_kw - 3.0)
            elif fp == "Vacation Filter":
                reported_kw = 0.06 + np.random.normal(0, 0.01)

            # Engineered feature for ML: relative deviation from the cluster.
            peer_dev = (reported_kw - cluster_ref) / cluster_ref if cluster_ref > 0 else 0

            records.append({
                "timestamp": timestamp,
                "meter_id": m["meter_id"],
                "consumer_type": c_type,
                "reported_kw": round(reported_kw, 3),
                "v_r": round(v_r, 1), "v_y": round(v_y, 1), "v_b": round(v_b, 1),
                "i_r": round(i_r, 2), "i_y": round(i_y, 2), "i_b": round(i_b, 2),
                "power_factor": round(pf, 2),
                "is_holiday": holiday_flag,
                "is_weekend": is_weekend,
                "peer_dev_pct": round(peer_dev * 100, 2),
                # NOTE(review): the label is per meter, not per record; it is 1
                # for every hour of a tampering meter, including hours in
                # which no tampering was applied above.
                "is_anomaly": 1 if tamper != "Normal" else 0,
                "tamper_type": tamper,
                "temperature_c": temp_c,
                "solar_ghi": solar_ghi,
            })

    df = pd.DataFrame(records, columns=columns)
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} records with enhanced indicators to {output_path}")
if __name__ == "__main__":
    # Script entry point: build the dataset using the function's defaults.
    generate_training_data()