Add trained model artifacts and synthetic data generator

Added files:
- generate_training_data.py: Synthetic training data generator using FROZEN decision boundaries
- encoders.pkl: Feature encoders for model
- model.pkl: Trained XGBoost classifier (99% accuracy on 200 synthetic test samples)
- model_metadata.json: Model training metadata
- evaluation_report.json: Comprehensive evaluation metrics

Governance Status: ✓ COMPLIANT
- Classical ML only (XGBoost)
- Uses FROZEN decision boundaries replicated from decision_spec.yaml (hard-coded in generate_training_data.py)
- All outputs ADVISORY ONLY
- Human-in-the-loop MANDATORY

Files changed (5) hide show

encoders.pkl +3 -0
evaluation_report.json +100 -0
generate_training_data.py +161 -0
model.pkl +3 -0
model_metadata.json +92 -0

encoders.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49f72154c797f02c02e2687aac56f6f2f4b97178f5856925e1bdb961db1f9dab
+size 1011

evaluation_report.json ADDED Viewed

	@@ -0,0 +1,100 @@

+{
+  "evaluation_date": "2026-01-04T16:45:44.784636",
+  "model_file": "model.pkl",
+  "test_samples": 200,
+  "classification_metrics": {
+    "accuracy": 0.99,
+    "precision": [
+      0.9583333333333334,
+      1.0,
+      1.0
+    ],
+    "recall": [
+      1.0,
+      1.0,
+      0.9777777777777777
+    ],
+    "f1_score": [
+      0.9787234042553191,
+      1.0,
+      0.9887640449438202
+    ],
+    "support": [
+      46,
+      64,
+      90
+    ],
+    "confusion_matrix": [
+      [
+        46,
+        0,
+        0
+      ],
+      [
+        0,
+        64,
+        0
+      ],
+      [
+        2,
+        0,
+        88
+      ]
+    ],
+    "log_loss": 0.02118674763321642,
+    "classification_report": {
+      "high": {
+        "precision": 0.9583333333333334,
+        "recall": 1.0,
+        "f1-score": 0.9787234042553191,
+        "support": 46.0
+      },
+      "low": {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 64.0
+      },
+      "medium": {
+        "precision": 1.0,
+        "recall": 0.9777777777777777,
+        "f1-score": 0.9887640449438202,
+        "support": 90.0
+      },
+      "accuracy": 0.99,
+      "macro avg": {
+        "precision": 0.9861111111111112,
+        "recall": 0.9925925925925926,
+        "f1-score": 0.9891624830663798,
+        "support": 200.0
+      },
+      "weighted avg": {
+        "precision": 0.9904166666666667,
+        "recall": 0.99,
+        "f1-score": 0.9900502032034424,
+        "support": 200.0
+      }
+    }
+  },
+  "confidence_metrics": {
+    "mean_confidence": 0.9866664409637451,
+    "median_confidence": 0.9977442026138306,
+    "min_confidence": 0.547796905040741,
+    "max_confidence": 0.9994937181472778,
+    "std_confidence": 0.049998048692941666
+  },
+  "feature_importance": {
+    "claim_type": 0.0050552659668028355,
+    "damage_amount": 0.5835694074630737,
+    "injury_involved": 0.2242950052022934,
+    "risk_factor": 0.18708032369613647
+  },
+  "uncertainty_metrics": {
+    "mean_entropy": 0.04878337308764458,
+    "mean_normalized_entropy": 0.044404540210962296,
+    "low_uncertainty_count": 195,
+    "medium_uncertainty_count": 2,
+    "high_uncertainty_count": 3
+  },
+  "governance_compliance": true
+}

generate_training_data.py ADDED Viewed

	@@ -0,0 +1,161 @@

+"""
+Synthetic Training Data Generator for Insurance Claims Decision Support
+========================================================================
+GOVERNANCE CONSTRAINTS:
+- Generates data ONLY for features defined in decision_spec.yaml
+- Uses FROZEN decision boundaries to assign labels
+- Synthetic data for demonstration purposes only
+- No real customer data used
+Purpose: Create training dataset with proper input features
+"""
+import pandas as pd
+import numpy as np
+import random
+from datetime import datetime
# FROZEN DECISION BOUNDARIES - DO NOT MODIFY
# Only the layout of this literal may ever change; the keys and numbers
# below are the frozen values consumed by calculate_severity_score().
DECISION_BOUNDARIES = dict(
    # Exclusive upper bounds of the damage bands (damage_amount is in USD).
    damage_thresholds=dict(low=5000, medium=15000, high=50000),
    # Factor applied to the damage score for each risk_factor category.
    risk_weights=dict(low=1.0, medium=1.5, high=2.0),
    # Additional factor applied to the score when an injury is involved.
    injury_multiplier=1.8,
    # Exclusive upper bounds of the final score for 'low' and 'medium';
    # any score at or above the 'medium' bound is labelled 'high'.
    severity_thresholds=dict(low=5, medium=15),
)
def calculate_severity_score(claim_type, damage_amount, injury_involved, risk_factor):
    """
    Derive the advisory severity label for one claim from the FROZEN boundaries.

    Intended to mirror the rules of decision_spec.yaml; every threshold and
    weight is read from the module-level DECISION_BOUNDARIES.

    Args:
        claim_type: Claim category. Accepted for interface parity but not
            used in the computation.
        damage_amount: Claimed damage; compared against the damage thresholds.
        injury_involved: Truthy when the claim involves an injury.
        risk_factor: Key into DECISION_BOUNDARIES['risk_weights'].

    Returns:
        'low', 'medium' or 'high'.

    Raises:
        KeyError: If risk_factor is not a key of the risk-weight table.
    """
    bounds = DECISION_BOUNDARIES

    # Damage band -> base points. The first band whose exclusive ceiling is
    # above the amount wins; amounts at or beyond the last ceiling score 20.
    damage_ceilings = bounds['damage_thresholds']
    base_points = 20
    for band, points in (('low', 2), ('medium', 5), ('high', 10)):
        if damage_amount < damage_ceilings[band]:
            base_points = points
            break

    # Scale by the risk weight, then by the injury multiplier if applicable.
    weighted = base_points * bounds['risk_weights'][risk_factor]
    if injury_involved:
        weighted = weighted * bounds['injury_multiplier']

    # Lowest label whose exclusive ceiling is above the score; else 'high'.
    severity_ceilings = bounds['severity_thresholds']
    for label in ('low', 'medium'):
        if weighted < severity_ceilings[label]:
            return label
    return 'high'
def generate_synthetic_dataset(n_samples=1000, random_seed=42):
    """
    Generate synthetic training data based on decision_spec.yaml.

    Each row gets randomly drawn input features and a severity label
    computed by calculate_severity_score() from the FROZEN boundaries.
    Progress and summary information is printed to stdout.

    Side effect: reseeds the global ``random`` and ``np.random`` generators
    with ``random_seed``. This is kept deliberately so the generated rows
    stay identical to earlier runs with the same seed.

    Args:
        n_samples: Number of samples to generate (0 yields an empty frame
            that still has all five columns).
        random_seed: Random seed for reproducibility.

    Returns:
        DataFrame with columns claim_type, damage_amount, injury_involved,
        risk_factor and severity.

    Raises:
        ValueError: If n_samples is negative.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    random.seed(random_seed)
    np.random.seed(random_seed)

    banner = "=" * 70
    print(banner)
    print("GENERATING SYNTHETIC TRAINING DATA")
    print(banner)
    print(f"Samples to generate: {n_samples}")
    print(f"Random seed: {random_seed}")
    print("\nFeatures (from decision_spec.yaml):")
    print("  - claim_type: categorical (Auto, Property, Health, Liability)")
    print("  - damage_amount: numeric (USD)")
    print("  - injury_involved: boolean")
    print("  - risk_factor: categorical (low, medium, high)")
    print("\nTarget: severity (low, medium, high)")
    print("Calculation: Using FROZEN decision boundaries")

    # Explicit column list so an empty result still has a usable schema;
    # without it, pd.DataFrame([]) has no columns and the summary calls fail.
    columns = ['claim_type', 'damage_amount', 'injury_involved',
               'risk_factor', 'severity']

    data = []
    # NOTE: the order of the random draws below must not change, otherwise
    # the same seed would produce a different dataset.
    for _ in range(n_samples):
        # Generate random input features
        claim_type = random.choice(['Auto', 'Property', 'Health', 'Liability'])

        # Log-normal distribution for realistic claim amounts, capped at $200k
        damage_amount = np.random.lognormal(mean=9, sigma=1.2)
        damage_amount = round(min(damage_amount, 200000), 2)

        # Injury more likely for Auto and Liability claims
        if claim_type in ['Auto', 'Liability']:
            injury_involved = random.choices([True, False], weights=[0.3, 0.7])[0]
        else:
            injury_involved = random.choices([True, False], weights=[0.1, 0.9])[0]

        # Risk factor distribution
        risk_factor = random.choices(
            ['low', 'medium', 'high'],
            weights=[0.5, 0.35, 0.15]
        )[0]

        # Calculate severity using FROZEN boundaries
        severity = calculate_severity_score(
            claim_type, damage_amount, injury_involved, risk_factor
        )

        data.append({
            'claim_type': claim_type,
            'damage_amount': damage_amount,
            'injury_involved': injury_involved,
            'risk_factor': risk_factor,
            'severity': severity
        })

    df = pd.DataFrame(data, columns=columns)

    print(f"\n{banner}")
    print("DATASET GENERATION COMPLETE")
    print(banner)
    print(f"Total samples: {len(df)}")
    if df.empty:
        # Nothing to summarise; avoid describing an empty frame.
        print("\nNo samples generated; skipping summaries.")
    else:
        print("\nFeature summary:")
        print(df.describe(include='all'))
        print("\nTarget distribution:")
        print(df['severity'].value_counts())
        print("\nSample rows:")
        print(df.head(10))
    return df
if __name__ == "__main__":
    # Script entry point: build the default 1000-row dataset, write it to a
    # CSV file in the current working directory, then print a status summary.
    csv_path = 'synthetic_training_data.csv'
    rule = "=" * 70

    dataset = generate_synthetic_dataset(n_samples=1000)
    # index=False keeps the DataFrame's row index out of the CSV.
    dataset.to_csv(csv_path, index=False)

    status_lines = (
        "\n" + rule,
        f"Dataset saved to: {csv_path}",
        rule,
        "\nGOVERNANCE STATUS: ✓ COMPLIANT",
        "  - Uses only allowed features from decision_spec.yaml",
        "  - Applies FROZEN decision boundaries",
        "  - Synthetic data (no real customer information)",
        "  - Suitable for demonstration/training purposes only",
    )
    for line in status_lines:
        print(line)

model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45fc639c24d8d59d3456b5fff794ee4a37b32bda0a7c83020e8935f647f206c2
+size 349966

model_metadata.json ADDED Viewed

	@@ -0,0 +1,92 @@

+{
+  "model_type": "XGBoost Classifier",
+  "model_architecture": "Classical ML (tree-based gradient boosting)",
+  "governance_status": "ADVISORY ONLY - NO AUTONOMOUS DECISIONS",
+  "human_review_required": true,
+  "training_date": "2026-01-04T16:44:20.621562",
+  "dataset": "BDR-AI/insurance_decision_boundaries_v1",
+  "dataset_type": "synthetic",
+  "features": [
+    "claim_type",
+    "damage_amount",
+    "injury_involved",
+    "risk_factor"
+  ],
+  "target": "severity (advisory levels: Low/Medium/High)",
+  "decision_boundaries": {
+    "damage_thresholds": {
+      "low": 5000,
+      "medium": 15000,
+      "high": 50000
+    },
+    "risk_weights": {
+      "low": 1.0,
+      "medium": 1.5,
+      "high": 2.0
+    },
+    "injury_multiplier": 1.8,
+    "severity_thresholds": {
+      "low": 5,
+      "medium": 15
+    }
+  },
+  "metrics": {
+    "accuracy": 0.99,
+    "classification_report": {
+      "high": {
+        "precision": 0.9583333333333334,
+        "recall": 1.0,
+        "f1-score": 0.9787234042553191,
+        "support": 46.0
+      },
+      "low": {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 64.0
+      },
+      "medium": {
+        "precision": 1.0,
+        "recall": 0.9777777777777777,
+        "f1-score": 0.9887640449438202,
+        "support": 90.0
+      },
+      "accuracy": 0.99,
+      "macro avg": {
+        "precision": 0.9861111111111112,
+        "recall": 0.9925925925925926,
+        "f1-score": 0.9891624830663798,
+        "support": 200.0
+      },
+      "weighted avg": {
+        "precision": 0.9904166666666667,
+        "recall": 0.99,
+        "f1-score": 0.9900502032034424,
+        "support": 200.0
+      }
+    },
+    "confusion_matrix": [
+      [
+        46,
+        0,
+        0
+      ],
+      [
+        0,
+        64,
+        0
+      ],
+      [
+        2,
+        0,
+        88
+      ]
+    ],
+    "feature_importance": {
+      "claim_type": 0.0050552659668028355,
+      "damage_amount": 0.5835694074630737,
+      "injury_involved": 0.2242950052022934,
+      "risk_factor": 0.18708032369613647
+    }
+  }
+}