BDR-AI committed on
Commit
9bbbb5b
·
verified ·
1 Parent(s): fc407ce

Add trained model artifacts and synthetic data generator

Browse files

Added files:
- generate_training_data.py: Synthetic training data generator using FROZEN decision boundaries
- encoders.pkl: Feature encoders for model
- model.pkl: Trained XGBoost classifier (99% accuracy)
- model_metadata.json: Model training metadata
- evaluation_report.json: Comprehensive evaluation metrics

Governance Status: ✓ COMPLIANT
- Classical ML only (XGBoost)
- Uses FROZEN decision boundaries from decision_spec.yaml
- All outputs ADVISORY ONLY
- Human-in-the-loop MANDATORY

encoders.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49f72154c797f02c02e2687aac56f6f2f4b97178f5856925e1bdb961db1f9dab
3
+ size 1011
evaluation_report.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "evaluation_date": "2026-01-04T16:45:44.784636",
3
+ "model_file": "model.pkl",
4
+ "test_samples": 200,
5
+ "classification_metrics": {
6
+ "accuracy": 0.99,
7
+ "precision": [
8
+ 0.9583333333333334,
9
+ 1.0,
10
+ 1.0
11
+ ],
12
+ "recall": [
13
+ 1.0,
14
+ 1.0,
15
+ 0.9777777777777777
16
+ ],
17
+ "f1_score": [
18
+ 0.9787234042553191,
19
+ 1.0,
20
+ 0.9887640449438202
21
+ ],
22
+ "support": [
23
+ 46,
24
+ 64,
25
+ 90
26
+ ],
27
+ "confusion_matrix": [
28
+ [
29
+ 46,
30
+ 0,
31
+ 0
32
+ ],
33
+ [
34
+ 0,
35
+ 64,
36
+ 0
37
+ ],
38
+ [
39
+ 2,
40
+ 0,
41
+ 88
42
+ ]
43
+ ],
44
+ "log_loss": 0.02118674763321642,
45
+ "classification_report": {
46
+ "high": {
47
+ "precision": 0.9583333333333334,
48
+ "recall": 1.0,
49
+ "f1-score": 0.9787234042553191,
50
+ "support": 46.0
51
+ },
52
+ "low": {
53
+ "precision": 1.0,
54
+ "recall": 1.0,
55
+ "f1-score": 1.0,
56
+ "support": 64.0
57
+ },
58
+ "medium": {
59
+ "precision": 1.0,
60
+ "recall": 0.9777777777777777,
61
+ "f1-score": 0.9887640449438202,
62
+ "support": 90.0
63
+ },
64
+ "accuracy": 0.99,
65
+ "macro avg": {
66
+ "precision": 0.9861111111111112,
67
+ "recall": 0.9925925925925926,
68
+ "f1-score": 0.9891624830663798,
69
+ "support": 200.0
70
+ },
71
+ "weighted avg": {
72
+ "precision": 0.9904166666666667,
73
+ "recall": 0.99,
74
+ "f1-score": 0.9900502032034424,
75
+ "support": 200.0
76
+ }
77
+ }
78
+ },
79
+ "confidence_metrics": {
80
+ "mean_confidence": 0.9866664409637451,
81
+ "median_confidence": 0.9977442026138306,
82
+ "min_confidence": 0.547796905040741,
83
+ "max_confidence": 0.9994937181472778,
84
+ "std_confidence": 0.049998048692941666
85
+ },
86
+ "feature_importance": {
87
+ "claim_type": 0.0050552659668028355,
88
+ "damage_amount": 0.5835694074630737,
89
+ "injury_involved": 0.2242950052022934,
90
+ "risk_factor": 0.18708032369613647
91
+ },
92
+ "uncertainty_metrics": {
93
+ "mean_entropy": 0.04878337308764458,
94
+ "mean_normalized_entropy": 0.044404540210962296,
95
+ "low_uncertainty_count": 195,
96
+ "medium_uncertainty_count": 2,
97
+ "high_uncertainty_count": 3
98
+ },
99
+ "governance_compliance": true
100
+ }
generate_training_data.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthetic Training Data Generator for Insurance Claims Decision Support
3
+ ========================================================================
4
+
5
+ GOVERNANCE CONSTRAINTS:
6
+ - Generates data ONLY for features defined in decision_spec.yaml
7
+ - Uses FROZEN decision boundaries to assign labels
8
+ - Synthetic data for demonstration purposes only
9
+ - No real customer data used
10
+
11
+ Purpose: Create training dataset with proper input features
12
+ """
13
+
14
+ import pandas as pd
15
+ import numpy as np
16
+ import random
17
+ from datetime import datetime
18
+
19
# FROZEN DECISION BOUNDARIES - DO NOT MODIFY
#
# Lookup tables consumed by calculate_severity_score():
#   damage_thresholds   - exclusive upper bounds (USD) of the damage bands
#   risk_weights        - multiplier applied per risk_factor category
#   injury_multiplier   - extra multiplier when an injury is involved
#   severity_thresholds - exclusive upper bounds of the 'low' and 'medium'
#                         score ranges; anything at or above 'medium' is 'high'
DECISION_BOUNDARIES = {
    "damage_thresholds": {"low": 5000, "medium": 15000, "high": 50000},
    "risk_weights": {"low": 1.0, "medium": 1.5, "high": 2.0},
    "injury_multiplier": 1.8,
    "severity_thresholds": {"low": 5, "medium": 15},
}
37
+
38
def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor,
                             boundaries=None):
    """
    Classify a claim as 'low', 'medium' or 'high' severity.

    A base score is chosen from the damage band (2, 5, 10 or 20), multiplied
    by the risk weight for ``risk_factor`` and, when ``injury_involved`` is
    truthy, by the injury multiplier. The resulting score is then compared
    against the severity thresholds. Every comparison is strict (``<``), so a
    value that sits exactly on a boundary falls into the higher band.

    Per the original author, this replicates the logic from decision_spec.yaml.

    Args:
        claim_type: Accepted for signature compatibility only; it does not
            influence the score.
        damage_amount: Claimed damage, compared against ``damage_thresholds``.
        injury_involved: Truthy when the claim involves an injury.
        risk_factor: Key into the ``risk_weights`` table.
        boundaries: Optional mapping with the same layout as
            DECISION_BOUNDARIES. When omitted, the module-level FROZEN
            DECISION_BOUNDARIES are used, which is the original behaviour.

    Returns:
        One of the strings 'low', 'medium' or 'high'.

    Raises:
        KeyError: If ``risk_factor`` has no entry in ``risk_weights``.
    """
    if boundaries is None:
        # Resolved at call time so the frozen module table stays the default.
        boundaries = DECISION_BOUNDARIES

    # Base score from damage amount
    damage_limits = boundaries['damage_thresholds']
    if damage_amount < damage_limits['low']:
        damage_score = 2
    elif damage_amount < damage_limits['medium']:
        damage_score = 5
    elif damage_amount < damage_limits['high']:
        damage_score = 10
    else:
        damage_score = 20

    # Apply risk weight; keep KeyError as the exception type but say what
    # went wrong instead of surfacing only the bare missing key.
    risk_weights = boundaries['risk_weights']
    try:
        risk_weight = risk_weights[risk_factor]
    except KeyError:
        raise KeyError(
            f"Unknown risk_factor {risk_factor!r}; expected one of {sorted(risk_weights)}"
        ) from None
    score = damage_score * risk_weight

    # Apply injury multiplier
    if injury_involved:
        score *= boundaries['injury_multiplier']

    # Determine severity level
    severity_limits = boundaries['severity_thresholds']
    if score < severity_limits['low']:
        return 'low'
    if score < severity_limits['medium']:
        return 'medium'
    return 'high'
68
+
69
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Each row draws a claim type, a log-normally distributed damage amount
    (capped at 200000 and rounded to 2 decimals), an injury flag whose
    probability depends on the claim type, and a risk factor. The label is
    produced by calculate_severity_score() for those inputs.

    Random draws come from generators local to this call, seeded with
    ``random_seed``. They yield the same sequences as seeding the global
    ``random`` / ``np.random`` state would, but leave the caller's global
    random state untouched.

    Progress and summary information is printed to stdout.

    Args:
        n_samples: Number of samples to generate. Zero yields an empty
            DataFrame that still has all five columns.
        random_seed: Random seed for reproducibility

    Returns:
        DataFrame with columns claim_type, damage_amount, injury_involved,
        risk_factor and severity.
    """
    # Local generators instead of random.seed()/np.random.seed(): identical
    # streams for a given seed, without clobbering process-wide RNG state.
    py_rng = random.Random(random_seed)
    np_rng = np.random.RandomState(random_seed)

    print("=" * 70)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print("=" * 70)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print(f"\nFeatures (from decision_spec.yaml):")
    print(" - claim_type: categorical (Auto, Property, Health, Liability)")
    print(" - damage_amount: numeric (USD)")
    print(" - injury_involved: boolean")
    print(" - risk_factor: categorical (low, medium, high)")
    print(f"\nTarget: severity (low, medium, high)")
    print(f"Calculation: Using FROZEN decision boundaries")

    columns = ['claim_type', 'damage_amount', 'injury_involved', 'risk_factor', 'severity']
    data = []

    for _ in range(n_samples):
        # Generate random input features
        claim_type = py_rng.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Generate damage amount with realistic distribution
        # Log-normal distribution for realistic claim amounts
        damage_amount = np_rng.lognormal(mean=9, sigma=1.2)
        damage_amount = round(min(damage_amount, 200000), 2)  # Cap at $200k

        # Injury more likely for Auto and Liability claims
        if claim_type in ['Auto', 'Liability']:
            injury_involved = py_rng.choices([True, False], weights=[0.3, 0.7])[0]
        else:
            injury_involved = py_rng.choices([True, False], weights=[0.1, 0.9])[0]

        # Risk factor distribution
        risk_factor = py_rng.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        # Calculate severity using FROZEN boundaries
        severity = calculate_severity_score(
            claim_type, damage_amount, injury_involved, risk_factor
        )

        data.append({
            'claim_type': claim_type,
            'damage_amount': damage_amount,
            'injury_involved': injury_involved,
            'risk_factor': risk_factor,
            'severity': severity
        })

    # Explicit columns keep the schema (and the summary below) valid even
    # when no rows were generated.
    df = pd.DataFrame(data, columns=columns)

    print(f"\n{'='*70}")
    print("DATASET GENERATION COMPLETE")
    print(f"{'='*70}")
    print(f"Total samples: {len(df)}")
    print(f"\nFeature summary:")
    print(df.describe(include='all'))
    print(f"\nTarget distribution:")
    print(df['severity'].value_counts())
    print(f"\nSample rows:")
    print(df.head(10))

    return df
146
+
147
if __name__ == "__main__":
    # Script entry point: build the default 1000-row dataset, write it to a
    # CSV next to the working directory, then print the governance summary.
    output_file = 'synthetic_training_data.csv'
    banner = '=' * 70

    # Generate dataset
    df = generate_synthetic_dataset(n_samples=1000)

    # Save to CSV (without the DataFrame index column)
    df.to_csv(output_file, index=False)

    print(f"\n{banner}")
    print(f"Dataset saved to: {output_file}")
    print(f"{banner}")
    print("\nGOVERNANCE STATUS: ✓ COMPLIANT")
    for note in (
        " - Uses only allowed features from decision_spec.yaml",
        " - Applies FROZEN decision boundaries",
        " - Synthetic data (no real customer information)",
        " - Suitable for demonstration/training purposes only",
    ):
        print(note)
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45fc639c24d8d59d3456b5fff794ee4a37b32bda0a7c83020e8935f647f206c2
3
+ size 349966
model_metadata.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "XGBoost Classifier",
3
+ "model_architecture": "Classical ML (tree-based gradient boosting)",
4
+ "governance_status": "ADVISORY ONLY - NO AUTONOMOUS DECISIONS",
5
+ "human_review_required": true,
6
+ "training_date": "2026-01-04T16:44:20.621562",
7
+ "dataset": "BDR-AI/insurance_decision_boundaries_v1",
8
+ "dataset_type": "synthetic",
9
+ "features": [
10
+ "claim_type",
11
+ "damage_amount",
12
+ "injury_involved",
13
+ "risk_factor"
14
+ ],
15
+ "target": "severity (advisory levels: Low/Medium/High)",
16
+ "decision_boundaries": {
17
+ "damage_thresholds": {
18
+ "low": 5000,
19
+ "medium": 15000,
20
+ "high": 50000
21
+ },
22
+ "risk_weights": {
23
+ "low": 1.0,
24
+ "medium": 1.5,
25
+ "high": 2.0
26
+ },
27
+ "injury_multiplier": 1.8,
28
+ "severity_thresholds": {
29
+ "low": 5,
30
+ "medium": 15
31
+ }
32
+ },
33
+ "metrics": {
34
+ "accuracy": 0.99,
35
+ "classification_report": {
36
+ "high": {
37
+ "precision": 0.9583333333333334,
38
+ "recall": 1.0,
39
+ "f1-score": 0.9787234042553191,
40
+ "support": 46.0
41
+ },
42
+ "low": {
43
+ "precision": 1.0,
44
+ "recall": 1.0,
45
+ "f1-score": 1.0,
46
+ "support": 64.0
47
+ },
48
+ "medium": {
49
+ "precision": 1.0,
50
+ "recall": 0.9777777777777777,
51
+ "f1-score": 0.9887640449438202,
52
+ "support": 90.0
53
+ },
54
+ "accuracy": 0.99,
55
+ "macro avg": {
56
+ "precision": 0.9861111111111112,
57
+ "recall": 0.9925925925925926,
58
+ "f1-score": 0.9891624830663798,
59
+ "support": 200.0
60
+ },
61
+ "weighted avg": {
62
+ "precision": 0.9904166666666667,
63
+ "recall": 0.99,
64
+ "f1-score": 0.9900502032034424,
65
+ "support": 200.0
66
+ }
67
+ },
68
+ "confusion_matrix": [
69
+ [
70
+ 46,
71
+ 0,
72
+ 0
73
+ ],
74
+ [
75
+ 0,
76
+ 64,
77
+ 0
78
+ ],
79
+ [
80
+ 2,
81
+ 0,
82
+ 88
83
+ ]
84
+ ],
85
+ "feature_importance": {
86
+ "claim_type": 0.0050552659668028355,
87
+ "damage_amount": 0.5835694074630737,
88
+ "injury_involved": 0.2242950052022934,
89
+ "risk_factor": 0.18708032369613647
90
+ }
91
+ }
92
+ }