|
|
""" |
|
|
Synthetic Training Data Generator for Insurance Claims Decision Support |
|
|
======================================================================== |
|
|
|
|
|
GOVERNANCE CONSTRAINTS: |
|
|
- Generates data ONLY for features defined in decision_spec.yaml |
|
|
- Uses FROZEN decision boundaries to assign labels |
|
|
- Synthetic data for demonstration purposes only |
|
|
- No real customer data used |
|
|
|
|
|
Purpose: Create training dataset with proper input features |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import random |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
# Frozen decision boundaries replicated from decision_spec.yaml.
# These values must not be tuned here; they define the labeling contract.
DECISION_BOUNDARIES = {
    'damage_thresholds': {'low': 5000, 'medium': 15000, 'high': 50000},
    'risk_weights': {'low': 1.0, 'medium': 1.5, 'high': 2.0},
    'injury_multiplier': 1.8,
    'severity_thresholds': {'low': 5, 'medium': 15},
}


def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Map a claim to a severity label using the FROZEN decision boundaries.

    Replicates the scoring logic from decision_spec.yaml: the damage
    amount is bucketed into a base score, scaled by the risk-factor
    weight, and multiplied again when an injury is involved.

    Args:
        claim_type: Accepted for spec parity; not used in the score
            (the calculation depends only on the other three inputs).
        damage_amount: Claimed damage in USD.
        injury_involved: Whether the claim involves an injury.
        risk_factor: One of 'low', 'medium', 'high' (raises KeyError
            otherwise, matching the frozen lookup behavior).

    Returns:
        'low', 'medium', or 'high'.
    """
    # Bucket the damage amount: strictly-below each upper bound wins,
    # anything at or above the 'high' threshold gets the top base score.
    damage_cutoffs = DECISION_BOUNDARIES['damage_thresholds']
    bands = (
        (damage_cutoffs['low'], 2),
        (damage_cutoffs['medium'], 5),
        (damage_cutoffs['high'], 10),
    )
    base_score = 20
    for upper_bound, band_value in bands:
        if damage_amount < upper_bound:
            base_score = band_value
            break

    # Scale by the frozen risk weight for this risk factor.
    weighted = base_score * DECISION_BOUNDARIES['risk_weights'][risk_factor]

    # Injuries amplify the score by the frozen multiplier.
    if injury_involved:
        weighted *= DECISION_BOUNDARIES['injury_multiplier']

    # Translate the numeric score into the categorical label.
    severity_cutoffs = DECISION_BOUNDARIES['severity_thresholds']
    if weighted < severity_cutoffs['low']:
        return 'low'
    if weighted < severity_cutoffs['medium']:
        return 'medium'
    return 'high'
|
|
|
|
|
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Draws claim features from fixed distributions and labels each row
    with `calculate_severity_score`, i.e. the FROZEN decision boundaries.

    Args:
        n_samples: Number of samples to generate.
        random_seed: Seed applied to both `random` and `numpy.random`
            for reproducibility.

    Returns:
        pandas.DataFrame with columns claim_type, damage_amount,
        injury_involved, risk_factor and the derived severity label.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    banner = "=" * 70
    print(banner)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print(banner)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print(" - claim_type: categorical (Auto, Property, Health, Liability)")
    print(" - damage_amount: numeric (USD)")
    print(" - injury_involved: boolean")
    print(" - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    def _draw_claim():
        # One synthetic claim record. NOTE: the RNG call order below
        # (choice -> lognormal -> choices -> choices) is deliberate so
        # seeded runs reproduce the same dataset.
        kind = random.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Heavy-tailed damage distribution, capped at $200k and
        # rounded to cents.
        amount = round(min(np.random.lognormal(mean=9, sigma=1.2), 200000), 2)

        # Injuries are more likely for Auto/Liability claims.
        injury_odds = [0.3, 0.7] if kind in ('Auto', 'Liability') else [0.1, 0.9]
        injured = random.choices([True, False], weights=injury_odds)[0]

        risk = random.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        return {
            'claim_type': kind,
            'damage_amount': amount,
            'injury_involved': injured,
            'risk_factor': risk,
            # Label assigned by the frozen boundary logic.
            'severity': calculate_severity_score(kind, amount, injured, risk),
        }

    frame = pd.DataFrame([_draw_claim() for _ in range(n_samples)])

    print(f"\n{banner}")
    print("DATASET GENERATION COMPLETE")
    print(banner)
    print(f"Total samples: {len(frame)}")
    print("\nFeature summary:")
    print(frame.describe(include='all'))
    print("\nTarget distribution:")
    print(frame['severity'].value_counts())
    print("\nSample rows:")
    print(frame.head(10))

    return frame
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
df = generate_synthetic_dataset(n_samples=1000) |
|
|
|
|
|
|
|
|
output_file = 'synthetic_training_data.csv' |
|
|
df.to_csv(output_file, index=False) |
|
|
print(f"\n{'='*70}") |
|
|
print(f"Dataset saved to: {output_file}") |
|
|
print(f"{'='*70}") |
|
|
print("\nGOVERNANCE STATUS: ✓ COMPLIANT") |
|
|
print(" - Uses only allowed features from decision_spec.yaml") |
|
|
print(" - Applies FROZEN decision boundaries") |
|
|
print(" - Synthetic data (no real customer information)") |
|
|
print(" - Suitable for demonstration/training purposes only") |
|
|
|