# claims-advisory-scoring-model / generate_training_data.py
# Uploaded by BDR-AI: "Add trained model artifacts and synthetic data generator"
# (commit 9bbbb5b, verified)
"""
Synthetic Training Data Generator for Insurance Claims Decision Support
========================================================================
GOVERNANCE CONSTRAINTS:
- Generates data ONLY for features defined in decision_spec.yaml
- Uses FROZEN decision boundaries to assign labels
- Synthetic data for demonstration purposes only
- No real customer data used
Purpose: Create training dataset with proper input features
"""
import pandas as pd
import numpy as np
import random
from datetime import datetime
# FROZEN DECISION BOUNDARIES - DO NOT MODIFY
DECISION_BOUNDARIES = {
    'damage_thresholds': {
        'low': 5000,
        'medium': 15000,
        'high': 50000
    },
    'risk_weights': {
        'low': 1.0,
        'medium': 1.5,
        'high': 2.0
    },
    'injury_multiplier': 1.8,
    'severity_thresholds': {
        'low': 5,
        'medium': 15
    }
}


def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Map one claim onto a severity label using the FROZEN decision boundaries.

    Replicates the logic from decision_spec.yaml: a stepped base score from
    the damage amount, scaled by the risk weight and (if applicable) the
    injury multiplier, then bucketed by the severity thresholds.

    Args:
        claim_type: Claim category. NOTE(review): not used in the score;
            kept so the signature matches the spec's feature list.
        damage_amount: Claimed damage in USD.
        injury_involved: True if the claim involves an injury.
        risk_factor: 'low', 'medium', or 'high' — must be a key of
            DECISION_BOUNDARIES['risk_weights'].

    Returns:
        Severity label: 'low', 'medium', or 'high'.
    """
    cuts = DECISION_BOUNDARIES['damage_thresholds']
    # Step function: first damage bucket the amount falls under wins.
    for upper, base in ((cuts['low'], 2), (cuts['medium'], 5), (cuts['high'], 10)):
        if damage_amount < upper:
            damage_score = base
            break
    else:
        # At or above the 'high' threshold.
        damage_score = 20

    # Scale by the risk weight, then by the injury multiplier when relevant.
    score = damage_score * DECISION_BOUNDARIES['risk_weights'][risk_factor]
    if injury_involved:
        score = score * DECISION_BOUNDARIES['injury_multiplier']

    # Bucket the weighted score into its severity label.
    levels = DECISION_BOUNDARIES['severity_thresholds']
    if score < levels['low']:
        return 'low'
    if score < levels['medium']:
        return 'medium'
    return 'high'
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Input features are drawn at random; the `severity` target is then
    assigned deterministically via calculate_severity_score(), i.e. the
    FROZEN decision boundaries, so labels always agree with the spec.

    Args:
        n_samples: Number of samples to generate.
        random_seed: Seed applied to both `random` and `numpy.random`
            for reproducibility.

    Returns:
        pandas.DataFrame with feature columns claim_type, damage_amount,
        injury_involved, risk_factor and target column severity.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    print("=" * 70)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print("=" * 70)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print(" - claim_type: categorical (Auto, Property, Health, Liability)")
    print(" - damage_amount: numeric (USD)")
    print(" - injury_involved: boolean")
    print(" - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    data = []
    for _ in range(n_samples):
        # Random claim category (uniform over the spec's four types).
        claim_type = random.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Log-normal gives a realistic right-skewed claim-amount distribution.
        damage_amount = np.random.lognormal(mean=9, sigma=1.2)
        damage_amount = round(min(damage_amount, 200000), 2)  # cap at $200k

        # Injury is more likely for Auto and Liability claims.
        injury_weights = [0.3, 0.7] if claim_type in ['Auto', 'Liability'] else [0.1, 0.9]
        injury_involved = random.choices([True, False], weights=injury_weights)[0]

        # Risk factor distribution: most claims are low risk.
        risk_factor = random.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        # Label with the FROZEN boundaries so the dataset matches the spec.
        severity = calculate_severity_score(
            claim_type, damage_amount, injury_involved, risk_factor
        )

        data.append({
            'claim_type': claim_type,
            'damage_amount': damage_amount,
            'injury_involved': injury_involved,
            'risk_factor': risk_factor,
            'severity': severity
        })

    df = pd.DataFrame(data)

    print(f"\n{'='*70}")
    print("DATASET GENERATION COMPLETE")
    print(f"{'='*70}")
    print(f"Total samples: {len(df)}")
    print("\nFeature summary:")
    print(df.describe(include='all'))
    print("\nTarget distribution:")
    print(df['severity'].value_counts())
    print("\nSample rows:")
    print(df.head(10))
    return df
if __name__ == "__main__":
    # Build the demonstration dataset with the default sample count.
    df = generate_synthetic_dataset(n_samples=1000)

    # Persist next to the script for the downstream training step.
    output_file = 'synthetic_training_data.csv'
    df.to_csv(output_file, index=False)

    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Dataset saved to: {output_file}")
    print(banner)

    # Governance summary for the reviewer / audit trail.
    print("\nGOVERNANCE STATUS: ✓ COMPLIANT")
    print(" - Uses only allowed features from decision_spec.yaml")
    print(" - Applies FROZEN decision boundaries")
    print(" - Synthetic data (no real customer information)")
    print(" - Suitable for demonstration/training purposes only")