|
|
""" |
|
|
Synthetic Training Data Generator for Insurance Claims Decision Support |
|
|
======================================================================== |
|
|
|
|
|
GOVERNANCE CONSTRAINTS: |
|
|
- Generates data ONLY for features defined in decision_spec.yaml |
|
|
- Uses FROZEN decision boundaries to assign labels |
|
|
- Synthetic data for demonstration purposes only |
|
|
- No real customer data used |
|
|
|
|
|
Purpose: Create training dataset with proper input features |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import random |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
# Frozen decision boundaries replicated from decision_spec.yaml.
# These values must not be tuned here; they define the labeling contract.
DECISION_BOUNDARIES = {
    'damage_thresholds': {'low': 5000, 'medium': 15000, 'high': 50000},
    'risk_weights': {'low': 1.0, 'medium': 1.5, 'high': 2.0},
    'injury_multiplier': 1.8,
    'severity_thresholds': {'low': 5, 'medium': 15},
}


def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Map a claim to a severity label using the FROZEN decision boundaries.

    Replicates the scoring logic from decision_spec.yaml: the damage
    amount is bucketed into a base score, scaled by the risk-factor
    weight, and multiplied again when an injury is involved.

    Args:
        claim_type: Accepted for spec parity; not used in the score
            (the calculation depends only on the other three inputs).
        damage_amount: Claimed damage in USD.
        injury_involved: Whether the claim involves an injury.
        risk_factor: One of 'low', 'medium', 'high' (raises KeyError
            otherwise, matching the frozen lookup behavior).

    Returns:
        'low', 'medium', or 'high'.
    """
    # Bucket the damage amount: strictly-below each upper bound wins,
    # anything at or above the 'high' threshold gets the top base score.
    damage_cutoffs = DECISION_BOUNDARIES['damage_thresholds']
    bands = (
        (damage_cutoffs['low'], 2),
        (damage_cutoffs['medium'], 5),
        (damage_cutoffs['high'], 10),
    )
    base_score = 20
    for upper_bound, band_value in bands:
        if damage_amount < upper_bound:
            base_score = band_value
            break

    # Scale by the frozen risk weight for this risk factor.
    weighted = base_score * DECISION_BOUNDARIES['risk_weights'][risk_factor]

    # Injuries amplify the score by the frozen multiplier.
    if injury_involved:
        weighted *= DECISION_BOUNDARIES['injury_multiplier']

    # Translate the numeric score into the categorical label.
    severity_cutoffs = DECISION_BOUNDARIES['severity_thresholds']
    if weighted < severity_cutoffs['low']:
        return 'low'
    if weighted < severity_cutoffs['medium']:
        return 'medium'
    return 'high'
|
|
|
|
|
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Draws claim features from fixed distributions and labels each row
    with `calculate_severity_score`, i.e. the FROZEN decision boundaries.

    Args:
        n_samples: Number of samples to generate.
        random_seed: Seed applied to both `random` and `numpy.random`
            for reproducibility.

    Returns:
        pandas.DataFrame with columns claim_type, damage_amount,
        injury_involved, risk_factor and the derived severity label.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    banner = "=" * 70
    print(banner)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print(banner)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print(" - claim_type: categorical (Auto, Property, Health, Liability)")
    print(" - damage_amount: numeric (USD)")
    print(" - injury_involved: boolean")
    print(" - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    def _draw_claim():
        # One synthetic claim record. NOTE: the RNG call order below
        # (choice -> lognormal -> choices -> choices) is deliberate so
        # seeded runs reproduce the same dataset.
        kind = random.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Heavy-tailed damage distribution, capped at $200k and
        # rounded to cents.
        amount = round(min(np.random.lognormal(mean=9, sigma=1.2), 200000), 2)

        # Injuries are more likely for Auto/Liability claims.
        injury_odds = [0.3, 0.7] if kind in ('Auto', 'Liability') else [0.1, 0.9]
        injured = random.choices([True, False], weights=injury_odds)[0]

        risk = random.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        return {
            'claim_type': kind,
            'damage_amount': amount,
            'injury_involved': injured,
            'risk_factor': risk,
            # Label assigned by the frozen boundary logic.
            'severity': calculate_severity_score(kind, amount, injured, risk),
        }

    frame = pd.DataFrame([_draw_claim() for _ in range(n_samples)])

    print(f"\n{banner}")
    print("DATASET GENERATION COMPLETE")
    print(banner)
    print(f"Total samples: {len(frame)}")
    print("\nFeature summary:")
    print(frame.describe(include='all'))
    print("\nTarget distribution:")
    print(frame['severity'].value_counts())
    print("\nSample rows:")
    print(frame.head(10))

    return frame
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
df = generate_synthetic_dataset(n_samples=1000) |
|
|
|
|
|
|
|
|
output_file = 'synthetic_training_data.csv' |
|
|
df.to_csv(output_file, index=False) |
|
|
print(f"\n{'='*70}") |
|
|
print(f"Dataset saved to: {output_file}") |
|
|
print(f"{'='*70}") |
|
|
print("\nGOVERNANCE STATUS: ✓ COMPLIANT") |
|
|
print(" - Uses only allowed features from decision_spec.yaml") |
|
|
print(" - Applies FROZEN decision boundaries") |
|
|
print(" - Synthetic data (no real customer information)") |
|
|
print(" - Suitable for demonstration/training purposes only") |
|
|
|