""" Synthetic Training Data Generator for Insurance Claims Decision Support ======================================================================== GOVERNANCE CONSTRAINTS: - Generates data ONLY for features defined in decision_spec.yaml - Uses FROZEN decision boundaries to assign labels - Synthetic data for demonstration purposes only - No real customer data used Purpose: Create training dataset with proper input features """ import pandas as pd import numpy as np import random from datetime import datetime # FROZEN DECISION BOUNDARIES - DO NOT MODIFY DECISION_BOUNDARIES = { 'damage_thresholds': { 'low': 5000, 'medium': 15000, 'high': 50000 }, 'risk_weights': { 'low': 1.0, 'medium': 1.5, 'high': 2.0 }, 'injury_multiplier': 1.8, 'severity_thresholds': { 'low': 5, 'medium': 15 } } def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor): """ Calculate severity score using FROZEN decision boundaries. This replicates the logic from decision_spec.yaml. """ # Base score from damage amount if damage_amount < DECISION_BOUNDARIES['damage_thresholds']['low']: damage_score = 2 elif damage_amount < DECISION_BOUNDARIES['damage_thresholds']['medium']: damage_score = 5 elif damage_amount < DECISION_BOUNDARIES['damage_thresholds']['high']: damage_score = 10 else: damage_score = 20 # Apply risk weight risk_weight = DECISION_BOUNDARIES['risk_weights'][risk_factor] score = damage_score * risk_weight # Apply injury multiplier if injury_involved: score *= DECISION_BOUNDARIES['injury_multiplier'] # Determine severity level if score < DECISION_BOUNDARIES['severity_thresholds']['low']: return 'low' elif score < DECISION_BOUNDARIES['severity_thresholds']['medium']: return 'medium' else: return 'high' def generate_synthetic_dataset(n_samples=1000, random_seed=42): """ Generate synthetic training data based on decision_spec.yaml. Args: n_samples: Number of samples to generate random_seed: Random seed for reproducibility Returns: DataFrame with input features and target labels """ random.seed(random_seed) np.random.seed(random_seed) print("=" * 70) print("GENERATING SYNTHETIC TRAINING DATA") print("=" * 70) print(f"Samples to generate: {n_samples}") print(f"Random seed: {random_seed}") print(f"\nFeatures (from decision_spec.yaml):") print(" - claim_type: categorical (Auto, Property, Health, Liability)") print(" - damage_amount: numeric (USD)") print(" - injury_involved: boolean") print(" - risk_factor: categorical (low, medium, high)") print(f"\nTarget: severity (low, medium, high)") print(f"Calculation: Using FROZEN decision boundaries") data = [] for i in range(n_samples): # Generate random input features claim_type = random.choice(['Auto', 'Property', 'Health', 'Liability']) # Generate damage amount with realistic distribution # Log-normal distribution for realistic claim amounts damage_amount = np.random.lognormal(mean=9, sigma=1.2) damage_amount = round(min(damage_amount, 200000), 2) # Cap at $200k # Injury more likely for Auto and Liability claims if claim_type in ['Auto', 'Liability']: injury_involved = random.choices([True, False], weights=[0.3, 0.7])[0] else: injury_involved = random.choices([True, False], weights=[0.1, 0.9])[0] # Risk factor distribution risk_factor = random.choices( ['low', 'medium', 'high'], weights=[0.5, 0.35, 0.15] )[0] # Calculate severity using FROZEN boundaries severity = calculate_severity_score( claim_type, damage_amount, injury_involved, risk_factor ) data.append({ 'claim_type': claim_type, 'damage_amount': damage_amount, 'injury_involved': injury_involved, 'risk_factor': risk_factor, 'severity': severity }) df = pd.DataFrame(data) print(f"\n{'='*70}") print("DATASET GENERATION COMPLETE") print(f"{'='*70}") print(f"Total samples: {len(df)}") print(f"\nFeature summary:") print(df.describe(include='all')) print(f"\nTarget distribution:") print(df['severity'].value_counts()) print(f"\nSample rows:") print(df.head(10)) return df if __name__ == "__main__": # Generate dataset df = generate_synthetic_dataset(n_samples=1000) # Save to CSV output_file = 'synthetic_training_data.csv' df.to_csv(output_file, index=False) print(f"\n{'='*70}") print(f"Dataset saved to: {output_file}") print(f"{'='*70}") print("\nGOVERNANCE STATUS: ✓ COMPLIANT") print(" - Uses only allowed features from decision_spec.yaml") print(" - Applies FROZEN decision boundaries") print(" - Synthetic data (no real customer information)") print(" - Suitable for demonstration/training purposes only")