File size: 5,293 Bytes
9bbbb5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
"""
Synthetic Training Data Generator for Insurance Claims Decision Support
========================================================================
GOVERNANCE CONSTRAINTS:
- Generates data ONLY for features defined in decision_spec.yaml
- Uses FROZEN decision boundaries to assign labels
- Synthetic data for demonstration purposes only
- No real customer data used
Purpose: Create training dataset with proper input features
"""
import pandas as pd
import numpy as np
import random
from datetime import datetime
# FROZEN DECISION BOUNDARIES - DO NOT MODIFY
# Consumed by calculate_severity_score(); every threshold below is used as an
# exclusive upper bound (the comparison is a strict "<").
DECISION_BOUNDARIES = {
    # Upper bounds on damage_amount for the low / medium / high damage bands.
    # Amounts at or above the 'high' bound fall into the top band.
    'damage_thresholds': {
        'low': 5000,
        'medium': 15000,
        'high': 50000,
    },
    # Multiplier applied to the damage score, keyed by the claim's risk_factor.
    'risk_weights': {
        'low': 1.0,
        'medium': 1.5,
        'high': 2.0,
    },
    # Extra multiplier applied when the claim involves an injury.
    'injury_multiplier': 1.8,
    # Upper bounds on the final score for the 'low' and 'medium' labels;
    # any score at or above the 'medium' bound is labelled 'high'.
    'severity_thresholds': {
        'low': 5,
        'medium': 15,
    },
}
def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Label a claim as 'low', 'medium' or 'high' severity.

    The label is derived from the module-level DECISION_BOUNDARIES:
    the damage amount selects a base score (2, 5, 10 or 20), which is
    multiplied by the weight for ``risk_factor`` and, when an injury is
    involved, by the injury multiplier. The resulting score is then
    compared against the severity thresholds.

    Args:
        claim_type: Accepted for interface compatibility; it does not
            influence the computed label.
        damage_amount: Claimed damage amount, compared against the
            damage thresholds.
        injury_involved: Truthy when the claim involves an injury.
        risk_factor: Key into the risk-weight table
            ('low', 'medium' or 'high').

    Returns:
        One of the strings 'low', 'medium' or 'high'.

    Raises:
        KeyError: If ``risk_factor`` is not a key of the risk-weight table.
    """
    damage_limits = DECISION_BOUNDARIES['damage_thresholds']

    # Bands are checked in ascending order; the first exclusive upper bound
    # that the amount stays under decides the base score. Amounts beyond
    # every bound keep the top score.
    base_points = 20
    for band, points in (('low', 2), ('medium', 5), ('high', 10)):
        if damage_amount < damage_limits[band]:
            base_points = points
            break

    weighted = base_points * DECISION_BOUNDARIES['risk_weights'][risk_factor]
    if injury_involved:
        weighted *= DECISION_BOUNDARIES['injury_multiplier']

    # Same ascending scan for the final label; 'high' is the fall-through.
    severity_limits = DECISION_BOUNDARIES['severity_thresholds']
    for label in ('low', 'medium'):
        if weighted < severity_limits[label]:
            return label
    return 'high'
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate a synthetic claims dataset labelled by the frozen severity rules.

    Each row draws the four input features at random and is then labelled
    by ``calculate_severity_score``. A generation banner and a summary of
    the resulting frame are printed to stdout.

    Args:
        n_samples: Number of rows to generate. Zero yields an empty
            DataFrame that still carries the full column schema.
        random_seed: Seed applied to both the ``random`` module and the
            ``numpy.random`` global generator for reproducibility. Note
            that this reseeds the process-wide generators.

    Returns:
        DataFrame with the columns claim_type, damage_amount,
        injury_involved, risk_factor and severity, one row per sample.

    Raises:
        ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    random.seed(random_seed)
    np.random.seed(random_seed)

    banner = "=" * 70
    print(banner)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print(banner)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print(" - claim_type: categorical (Auto, Property, Health, Liability)")
    print(" - damage_amount: numeric (USD)")
    print(" - injury_involved: boolean")
    print(" - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    # Loop-invariant sampling tables, hoisted out of the per-row loop.
    columns = ['claim_type', 'damage_amount', 'injury_involved',
               'risk_factor', 'severity']
    claim_types = ['Auto', 'Property', 'Health', 'Liability']
    injury_prone_types = ('Auto', 'Liability')
    risk_levels = ['low', 'medium', 'high']
    risk_level_weights = [0.5, 0.35, 0.15]

    data = []
    for _ in range(n_samples):
        # NOTE: the order of the random draws below determines the generated
        # values for a given seed; do not reorder them.
        claim_type = random.choice(claim_types)

        # Log-normal draw for a right-skewed amount, capped at $200k.
        damage_amount = np.random.lognormal(mean=9, sigma=1.2)
        damage_amount = round(min(damage_amount, 200000), 2)

        # Injury is more likely for Auto and Liability claims.
        if claim_type in injury_prone_types:
            injury_weights = [0.3, 0.7]
        else:
            injury_weights = [0.1, 0.9]
        injury_involved = random.choices([True, False], weights=injury_weights)[0]

        risk_factor = random.choices(risk_levels, weights=risk_level_weights)[0]

        # Label the row using the FROZEN boundaries.
        severity = calculate_severity_score(
            claim_type, damage_amount, injury_involved, risk_factor
        )

        data.append({
            'claim_type': claim_type,
            'damage_amount': damage_amount,
            'injury_involved': injury_involved,
            'risk_factor': risk_factor,
            'severity': severity,
        })

    # Passing the schema explicitly keeps the columns present even when no
    # rows were generated, so the summary below (and callers indexing by
    # column name) work for n_samples == 0.
    df = pd.DataFrame(data, columns=columns)

    print(f"\n{banner}")
    print("DATASET GENERATION COMPLETE")
    print(banner)
    print(f"Total samples: {len(df)}")
    print("\nFeature summary:")
    print(df.describe(include='all'))
    print("\nTarget distribution:")
    print(df['severity'].value_counts())
    print("\nSample rows:")
    print(df.head(10))
    return df
if __name__ == "__main__":
    # Script entry point: build the default 1000-row dataset, write it to a
    # CSV in the current working directory, then print a status report.
    output_file = 'synthetic_training_data.csv'
    banner = '=' * 70

    df = generate_synthetic_dataset(n_samples=1000)
    df.to_csv(output_file, index=False)

    report_lines = (
        f"\n{banner}",
        f"Dataset saved to: {output_file}",
        banner,
        "\nGOVERNANCE STATUS: ✓ COMPLIANT",
        " - Uses only allowed features from decision_spec.yaml",
        " - Applies FROZEN decision boundaries",
        " - Synthetic data (no real customer information)",
        " - Suitable for demonstration/training purposes only",
    )
    for line in report_lines:
        print(line)
|