File size: 5,293 Bytes

9bbbb5b

"""
Synthetic Training Data Generator for Insurance Claims Decision Support
========================================================================

GOVERNANCE CONSTRAINTS:
- Generates data ONLY for features defined in decision_spec.yaml
- Uses FROZEN decision boundaries to assign labels
- Synthetic data for demonstration purposes only
- No real customer data used

Purpose: Create training dataset with proper input features
"""

import pandas as pd
import numpy as np
import random
from datetime import datetime

# FROZEN DECISION BOUNDARIES - DO NOT MODIFY
# Lookup table read by calculate_severity_score:
#   damage_thresholds   - exclusive upper limits of the damage bands
#   risk_weights        - multiplier applied per risk_factor value
#   injury_multiplier   - extra multiplier when an injury is involved
#   severity_thresholds - exclusive upper limits of the 'low' / 'medium' labels
DECISION_BOUNDARIES = dict(
    damage_thresholds=dict(low=5000, medium=15000, high=50000),
    risk_weights=dict(low=1.0, medium=1.5, high=2.0),
    injury_multiplier=1.8,
    severity_thresholds=dict(low=5, medium=15),
)

def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Map a claim's inputs to a severity label via the FROZEN decision boundaries.

    Replicates the logic from decision_spec.yaml: a base score is picked from
    the damage band, scaled by the risk weight, scaled again by the injury
    multiplier when an injury is involved, and finally bucketed into a label.

    Args:
        claim_type: Accepted for interface compatibility; not used in the score.
        damage_amount: Claim damage amount, compared against the damage thresholds.
        injury_involved: Truthy when the claim involves an injury.
        risk_factor: Key into the risk weights ('low', 'medium' or 'high').

    Returns:
        One of 'low', 'medium' or 'high'.

    Raises:
        KeyError: If risk_factor is not a key of the risk weights.
    """
    damage_limits = DECISION_BOUNDARIES['damage_thresholds']

    # Base score: the first band whose (exclusive) upper limit is above the
    # amount wins; anything that matches no band keeps the top score of 20.
    damage_score = 20
    for band, points in (('low', 2), ('medium', 5), ('high', 10)):
        if damage_amount < damage_limits[band]:
            damage_score = points
            break

    # Scale by the risk weight, then by the injury multiplier if applicable.
    score = damage_score * DECISION_BOUNDARIES['risk_weights'][risk_factor]
    if injury_involved:
        score = score * DECISION_BOUNDARIES['injury_multiplier']

    # Bucket the final score; limits are exclusive, so a score equal to a
    # limit falls into the next level up.
    severity_limits = DECISION_BOUNDARIES['severity_thresholds']
    for level in ('low', 'medium'):
        if score < severity_limits[level]:
            return level
    return 'high'

def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Seeds both the stdlib ``random`` module and NumPy's global RNG (a
    process-wide side effect), draws ``n_samples`` synthetic claims, labels
    each one with ``calculate_severity_score`` and prints a banner plus a
    summary of the result to stdout.

    Args:
        n_samples: Number of samples to generate. Zero (or a negative value)
            yields an empty DataFrame that still carries all five columns.
        random_seed: Random seed for reproducibility

    Returns:
        DataFrame with the columns claim_type, damage_amount,
        injury_involved, risk_factor (input features) and severity (target).
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    rule = "=" * 70
    print(rule)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print(rule)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print("  - claim_type: categorical (Auto, Property, Health, Liability)")
    print("  - damage_amount: numeric (USD)")
    print("  - injury_involved: boolean")
    print("  - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    # Explicit column order so the frame has the same schema even when no
    # rows are generated (a column-less frame breaks describe() and the
    # 'severity' lookup below).
    columns = ['claim_type', 'damage_amount', 'injury_involved', 'risk_factor', 'severity']
    data = []

    # NOTE: the order of the `random` calls inside the loop determines the
    # generated values for a given seed; do not reorder them.
    for _ in range(n_samples):
        # Generate random input features
        claim_type = random.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Log-normal distribution for realistic claim amounts, capped at $200k
        damage_amount = np.random.lognormal(mean=9, sigma=1.2)
        damage_amount = round(min(damage_amount, 200000), 2)

        # Injury more likely for Auto and Liability claims
        if claim_type in ['Auto', 'Liability']:
            injury_involved = random.choices([True, False], weights=[0.3, 0.7])[0]
        else:
            injury_involved = random.choices([True, False], weights=[0.1, 0.9])[0]

        # Risk factor distribution
        risk_factor = random.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        # Calculate severity using FROZEN boundaries
        severity = calculate_severity_score(
            claim_type, damage_amount, injury_involved, risk_factor
        )

        data.append({
            'claim_type': claim_type,
            'damage_amount': damage_amount,
            'injury_involved': injury_involved,
            'risk_factor': risk_factor,
            'severity': severity
        })

    df = pd.DataFrame(data, columns=columns)

    print(f"\n{rule}")
    print("DATASET GENERATION COMPLETE")
    print(rule)
    print(f"Total samples: {len(df)}")
    print("\nFeature summary:")
    print(df.describe(include='all'))
    print("\nTarget distribution:")
    print(df['severity'].value_counts())
    print("\nSample rows:")
    print(df.head(10))

    return df

if __name__ == "__main__":
    banner = "=" * 70
    output_file = 'synthetic_training_data.csv'

    # Build the 1000-row dataset and write it out without the index column.
    df = generate_synthetic_dataset(n_samples=1000)
    df.to_csv(output_file, index=False)

    print(f"\n{banner}")
    print(f"Dataset saved to: {output_file}")
    print(banner)

    # Closing governance summary, one bullet per line.
    print("\nGOVERNANCE STATUS: ✓ COMPLIANT")
    for bullet in (
        "  - Uses only allowed features from decision_spec.yaml",
        "  - Applies FROZEN decision boundaries",
        "  - Synthetic data (no real customer information)",
        "  - Suitable for demonstration/training purposes only",
    ):
        print(bullet)