Upload folder using huggingface_hub

Files changed:
- .gitattributes +1 -0
- README.md +100 -0
- app.py +72 -0
- feature_list.json +12 -0
- feature_scaler.pkl +3 -0
- priority_queue.py +304 -0
- requirements.txt +5 -0
- severity_model.json +0 -0
- severity_model_pipeline.py +550 -0
- shap_bar_plot.png +0 -0
- shap_dot_plot.png +3 -0
- simulation_output.txt +0 -0
- synthetic_pothole_data.csv +0 -0
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+shap_dot_plot.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED

@@ -0,0 +1,100 @@
---
title: Pothole Severity Scoring
emoji: 🕳️
colorFrom: yellow
colorTo: red
sdk: gradio
sdk_version: 5.12.0
app_file: app.py
pinned: false
license: mit
tags:
- xgboost
- tabular-regression
- civic-tech
- pothole-detection
---

# Model Card for Pothole Severity Scoring

## Model Details

### Model Description

This is an XGBoost regressor that predicts the priority/severity score of civic infrastructure issues (specifically potholes). It evaluates structural, environmental, and temporal features and outputs a severity score bounded between 0 and 1, helping civic authorities prioritize repairs and allocate resources.

- **Developed by:** Civic AI System (Demo)
- **Model type:** XGBoost Regressor
- **License:** MIT

## Uses

### Direct Use

The model ingests 10 engineered features characterizing a reported pothole and outputs:
- A numeric severity score ($S \in [0,1]$).
- A qualitative priority label ("Low", "Medium", "High").

It is intended for sorting and prioritizing civil-work dispatch queues.

## Bias, Risks, and Limitations

The model weights proximity to critical infrastructure (`P`) and road hierarchy (`R`) heavily. While this effectively prioritizes areas such as highways and hospitals, it may systematically delay repairs in neglected or local neighborhoods that lack designated "critical infrastructure". Disparate-impact assessments should be run periodically to ensure equitable civic maintenance.

## Training Details

### Training Data

The model was trained on a synthetically generated dataset of 10,000 samples designed to mirror realistic distributions of civic reporting. Features include:
- `A`: Defect area ratio
- `D`: Defect density
- `C`: Centrality (distance from road center)
- `Q`: Initial detection confidence
- `M`: Multi-user confirmation score
- `T`: Temporal persistence (days unresolved)
- `R`: Traffic importance tier
- `P`: Proximity to critical infrastructure
- `F`: Recurrence frequency
- `X`: Resolution failure count

All features are min-max scaled to `[0, 1]`.

### Training Procedure

- **Algorithm:** XGBoost
- **Objective:** `reg:squarederror`
- **Trees:** 200
- **Max Depth:** 5
- **Learning Rate:** 0.05

## 📊 Performance & Interpretability

### Model Metrics

The model predicts the severity score $S$, which controls civic resource allocation, with low error.

| Metric | Value | Interpretation |
| :--- | :--- | :--- |
| **RMSE** | 0.0312 | Low average error (~0.03 units on the 0-1 scale) |
| **MAE** | 0.0247 | High predictive accuracy |
| **R² Score** | 0.8067 | ~81% of variance explained by the features |

### Feature Importance (Gain)

The following ranking shows how much each feature contributes to XGBoost tree construction:

1. **C (Centrality)**: 0.3585 → Central potholes pose higher collision risks.
2. **A (Area Ratio)**: 0.2187 → Size of the defect is a primary driver.
3. **R (Road Type)**: 0.1629 → Highways are prioritized over local streets.
4. **P (Proximity)**: 0.0937 → Closeness to critical infrastructure.

### SHAP Visualizations

We use SHAP (SHapley Additive exPlanations) to explain individual predictions and global feature influence.

#### Global Feature Impact

The bar chart below shows the mean absolute SHAP value, identifying which features consistently shift the severity score.

![SHAP Bar Plot](shap_bar_plot.png)

#### Detailed Impact (Beeswarm)

The summary plot shows how high vs. low values of a feature affect the outcome. For example, high values of **C (Centrality)** push the score significantly higher.

![SHAP Summary Plot](shap_dot_plot.png)
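The score-to-label mapping described in the model card can be sketched in a few lines. This is a minimal illustration, not part of the shipped code: the 0.33/0.66 thresholds match the ones used in app.py, and `severity_label` is a hypothetical helper name.

```python
def severity_label(score: float) -> str:
    # Thresholds as used by the demo app: <0.33 Low, <0.66 Medium, else High
    if score < 0.33:
        return "Low"
    if score < 0.66:
        return "Medium"
    return "High"

for s in (0.12, 0.50, 0.91):
    print(s, severity_label(s))  # Low, Medium, High respectively
```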
app.py ADDED

@@ -0,0 +1,72 @@
import sys

# SHIM FOR PYTHON 3.13: fake audioop module before any imports
try:
    import audioop
except ImportError:
    import types
    sys.modules["audioop"] = types.ModuleType("audioop")

import gradio as gr
import xgboost as xgb
import joblib
import json
import numpy as np

# --- Load Assets ---
MODEL_PATH = "severity_model.json"
SCALER_PATH = "feature_scaler.pkl"
FEATURES_PATH = "feature_list.json"

def load_resources():
    model = xgb.XGBRegressor()
    model.load_model(MODEL_PATH)
    scaler = joblib.load(SCALER_PATH)
    with open(FEATURES_PATH) as f:
        features = json.load(f)
    return model, scaler, features

model, scaler, feature_names = load_resources()

def get_label(score):
    if score < 0.33: return "Low 🟢"
    if score < 0.66: return "Medium 🟡"
    return "High 🔴"

def predict(*args):
    input_dict = dict(zip(feature_names, args))
    row = np.array([[input_dict[f] for f in feature_names]], dtype=np.float32)
    scaled_row = scaler.transform(row)
    prediction = float(model.predict(scaled_row)[0])
    score = max(0, min(1, prediction))
    return round(score, 4), get_label(score)

# --- UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕳️ Pothole Severity Predictor (Civic AI)")
    gr.Markdown("Adjust the sliders below to simulate pothole features and predict repair priority.")

    with gr.Row():
        with gr.Column():
            a = gr.Slider(0, 1, value=0.1, label="Area Ratio (A)", info="Size of pothole")
            d = gr.Slider(0, 1, value=0.1, label="Density (D)", info="Fragmentation")
            c = gr.Slider(0, 1, value=0.5, label="Centrality (C)", info="0=Edge, 1=Center")
            q = gr.Slider(0, 1, value=0.9, label="Confidence (Q)", info="CV Model Certainty")
            m = gr.Slider(0, 1, value=0.1, label="Confirmations (M)", info="User reports")
        with gr.Column():
            t = gr.Slider(0, 1, value=0.1, label="Persistence (T)", info="Wait time")
            r = gr.Slider(0, 1, value=0.4, label="Road Type (R)", info="0.4:Local, 1.0:Highway")
            p = gr.Slider(0, 1, value=0.1, label="Critical Infra (P)", info="Proximity to hospitals/schools")
            f = gr.Slider(0, 1, value=0.1, label="Recurrence (F)", info="Historical failure")
            x = gr.Slider(0, 1, value=0.0, label="Reopen Count (X)", info="Failed repairs")

    btn = gr.Button("Calculate Severity Score", variant="primary")

    with gr.Row():
        out_score = gr.Number(label="Severity Score (0-1)")
        out_label = gr.Textbox(label="Priority Level")

    btn.click(predict, inputs=[a, d, c, q, m, t, r, p, f, x], outputs=[out_score, out_label])

if __name__ == "__main__":
    demo.launch()
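Because a regressor can emit values slightly outside the training range, app.py clamps each prediction with `max(0, min(1, prediction))` before display. A minimal sketch of that clamping via `np.clip` (the raw scores below are made up for illustration):

```python
import numpy as np

# Hypothetical raw regressor outputs, including out-of-range values
raw = np.array([-0.12, 0.4821, 1.30])

# Bound each score to [0, 1], mirroring max(0, min(1, prediction)) in predict()
clamped = np.clip(raw, 0.0, 1.0)
print([round(float(s), 4) for s in clamped])  # [0.0, 0.4821, 1.0]
```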
feature_list.json ADDED

@@ -0,0 +1,12 @@
[
  "A",
  "D",
  "C",
  "Q",
  "M",
  "T",
  "R",
  "P",
  "F",
  "X"
]
feature_scaler.pkl ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98bee969099864217324a3154bd7e0e65ef2a167d6616feb66e656c2a853cc7f
size 1351
priority_queue.py ADDED

@@ -0,0 +1,304 @@
"""
=============================================================================
CIVIC ISSUE MANAGEMENT – PRIORITY QUEUE SYSTEM
=============================================================================
A production-grade Priority Queue for managing civic issues (potholes),
prioritized by a composite score evaluating Severity, SLA Breach,
Escalation Status, and Reopen Frequency.

Features:
- Global Queue, Ward-specific Queues, and Contractor-specific Queues.
- O(log N) task insertion and updates.
- Real-time SLA breach overrides and explicit emergency handling.
- Smart lazy deletion to maintain computational efficiency during updates.
=============================================================================
"""

import heapq
import itertools
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, List, Optional


# =============================================================================
# DATA STRUCTURES & CONFIGURATION
# =============================================================================

@dataclass
class CivicTask:
    task_id: str
    severity_score: float
    severity_label: str
    created_at: datetime
    days_pending: int
    sla_days: int
    ward: str
    contractor_id: str
    is_escalated: bool
    reopen_count: int
    emergency_override: bool = False

    def compute_priority(self) -> float:
        """
        Computes the priority score based on the specified formula:
        Priority = (Sev * 0.6) + (SLA Breach * 0.2) + (Escalation * 0.1) + (Reopen * 0.1)
        """
        if self.emergency_override:
            return float('inf')  # Highest conceivable priority

        # SLA breach factor computation
        if self.days_pending <= self.sla_days:
            sla_breach_factor = 0.0
        else:
            sla_breach_factor = min(1.0, (self.days_pending - self.sla_days) / self.sla_days)

        # Escalation factor
        escalation_factor = 1.0 if self.is_escalated else 0.0

        # Reopen factor
        reopen_factor = min(1.0, self.reopen_count / 3.0)

        # Final Priority Score
        priority_score = (
            (self.severity_score * 0.6) +
            (sla_breach_factor * 0.2) +
            (escalation_factor * 0.1) +
            (reopen_factor * 0.1)
        )
        return priority_score

    def get_priority_reason(self) -> str:
        """Helper to generate a human-readable explanation of why this task is prioritized."""
        if self.emergency_override:
            return "🚨 EMERGENCY OVERRIDE"

        reasons = []
        if self.severity_score >= 0.66:
            reasons.append("🔥 High Severity")
        if self.days_pending > self.sla_days:
            reasons.append(f"⏳ SLA Breach (+{self.days_pending - self.sla_days} days)")
        if self.is_escalated:
            reasons.append("📣 Escalated")
        if self.reopen_count > 0:
            reasons.append(f"🔁 Reopened ({self.reopen_count}x)")

        return " | ".join(reasons) if reasons else "✅ Standard Processing"


# =============================================================================
# QUEUE IMPLEMENTATION
# =============================================================================

class PriorityQueue:
    """
    Min-heap implementation storing negative priorities to act as a Max-Heap.
    Implements lazy deletion for O(1) removals and O(log N) updates.
    """
    def __init__(self, name: str):
        self.name = name
        self.pq = []                       # list of entries arranged in a heap
        self.entry_finder = {}             # mapping of task_ids to entries
        self.REMOVED = '<removed-task>'    # placeholder for a removed task
        self.counter = itertools.count()   # unique sequence count for tie-breaking

    def add_task(self, task: CivicTask):
        """Add a new task or update the priority of an existing task."""
        if task.task_id in self.entry_finder:
            self.remove_task(task.task_id)

        score = task.compute_priority()
        count = next(self.counter)

        # Store negative score so the highest priority bubbles to the top
        entry = [-score, count, task]
        self.entry_finder[task.task_id] = entry
        heapq.heappush(self.pq, entry)

    def remove_task(self, task_id: str):
        """Mark an existing task as REMOVED. Doesn't break the heap structure."""
        entry = self.entry_finder.pop(task_id, None)
        if entry is not None:
            entry[-1] = self.REMOVED

    def pop_task(self) -> Optional[CivicTask]:
        """Remove and return the highest-priority task, or None if the queue is empty."""
        while self.pq:
            score, count, task = heapq.heappop(self.pq)
            if task is not self.REMOVED:
                del self.entry_finder[task.task_id]
                return task
        return None

    def peek_top(self) -> Optional[CivicTask]:
        """Look at the highest-priority task without removing it."""
        while self.pq:
            score, count, task = self.pq[0]
            if task is not self.REMOVED:
                return task
            heapq.heappop(self.pq)  # Clean up removed items floating at the top
        return None

    def reprioritize_all(self):
        """Re-evaluate all priority scores. Required when time passes (SLA changes)."""
        valid_tasks = [entry[-1] for entry in self.entry_finder.values() if entry[-1] is not self.REMOVED]
        self.pq = []
        self.entry_finder = {}
        for task in valid_tasks:
            self.add_task(task)

    def get_sorted_tasks(self) -> List[CivicTask]:
        """Return all valid tasks sorted by priority (read-only, doesn't pop)."""
        valid_entries = [e for e in self.entry_finder.values() if e[-1] is not self.REMOVED]
        valid_entries.sort(key=lambda x: (x[0], x[1]))
        return [e[-1] for e in valid_entries]


class CivicDispatchSystem:
    """Orchestrates Global, Ward, and Contractor queues."""
    def __init__(self):
        self.global_queue = PriorityQueue("Global Queue")
        self.ward_queues: Dict[str, PriorityQueue] = {}
        self.contractor_queues: Dict[str, PriorityQueue] = {}
        self.task_registry: Dict[str, CivicTask] = {}

    def add_task(self, task: CivicTask):
        self.task_registry[task.task_id] = task
        self.global_queue.add_task(task)

        # Ward-specific queue
        if task.ward not in self.ward_queues:
            self.ward_queues[task.ward] = PriorityQueue(f"Ward-{task.ward}")
        self.ward_queues[task.ward].add_task(task)

        # Contractor-specific queue
        if task.contractor_id not in self.contractor_queues:
            self.contractor_queues[task.contractor_id] = PriorityQueue(f"Contractor-{task.contractor_id}")
        self.contractor_queues[task.contractor_id].add_task(task)

    def get_next_task(self) -> Optional[CivicTask]:
        """Pops the highest global-priority task."""
        task = self.global_queue.pop_task()
        if task:
            self._sync_removals(task.task_id, task.ward, task.contractor_id)
        return task

    def remove_task(self, task_id: str):
        if task_id in self.task_registry:
            task = self.task_registry[task_id]
            self.global_queue.remove_task(task_id)
            self._sync_removals(task_id, task.ward, task.contractor_id)

    def _sync_removals(self, task_id: str, ward: str, contractor_id: str):
        """Keep sub-queues in sync when a task leaves the global queue."""
        if task_id in self.task_registry:
            del self.task_registry[task_id]
        if ward in self.ward_queues:
            self.ward_queues[ward].remove_task(task_id)
        if contractor_id in self.contractor_queues:
            self.contractor_queues[contractor_id].remove_task(task_id)

    def update_task(self, task_id: str, updates: dict):
        """Apply updates and re-insert into queues to recalculate priorities."""
        if task_id in self.task_registry:
            task = self.task_registry[task_id]
            for key, value in updates.items():
                if hasattr(task, key):
                    setattr(task, key, value)
            self.add_task(task)  # add_task handles the update internally

    def reprioritize_system(self):
        """Execute when system time passes or bulk updates happen."""
        self.global_queue.reprioritize_all()
        for q in self.ward_queues.values():
            q.reprioritize_all()
        for q in self.contractor_queues.values():
            q.reprioritize_all()


# =============================================================================
# SIMULATION ENGINE
# =============================================================================

def generate_random_tasks(num_tasks: int) -> List[CivicTask]:
    tasks = []
    wards = ["North", "South", "East", "West", "Central"]
    contractors = ["AlphaRepairs", "CityFix", "OmegaPaving"]

    for i in range(num_tasks):
        score = round(random.uniform(0.1, 0.95), 2)
        label = "High" if score > 0.66 else ("Medium" if score > 0.33 else "Low")

        task = CivicTask(
            task_id=f"TSK-{i:04d}",
            severity_score=score,
            severity_label=label,
            created_at=datetime.now() - timedelta(days=random.randint(0, 10)),
            days_pending=random.randint(0, 15),
            sla_days=10,
            ward=random.choice(wards),
            contractor_id=random.choice(contractors),
            is_escalated=random.random() > 0.85,  # 15% chance
            reopen_count=random.randint(0, 5) if random.random() > 0.8 else 0
        )
        tasks.append(task)
    return tasks


def run_simulation():
    print("="*70)
    print(" 🚀 INITIALIZING SYSTEM & INSERTING TASKS")
    print("="*70)
    system = CivicDispatchSystem()
    tasks = generate_random_tasks(50)

    for t in tasks:
        system.add_task(t)

    print(f"✅ Loaded {len(tasks)} tasks.")

    print("\n" + "="*70)
    print(" 📋 TOP 10 TASKS IN GLOBAL QUEUE")
    print("="*70)
    top_tasks = system.global_queue.get_sorted_tasks()[:10]
    for idx, t in enumerate(top_tasks, start=1):
        score = t.compute_priority()
        print(f"{idx:-2d} | [{score:.4f}] {t.task_id:<8} | Sev: {t.severity_score:.2f} ({t.severity_label:<6}) | "
              f"Wait: {t.days_pending}/{t.sla_days}d | {t.get_priority_reason()}")

    print("\n" + "="*70)
    print(" ⏱️ SIMULATING TIME PASSING (+5 DAYS)")
    print("="*70)
    # Fast-forward 5 days for all tasks left in the queue
    for task in system.task_registry.values():
        task.days_pending += 5
    system.reprioritize_system()

    print("Re-evaluating priorities after SLA changes...\n")
    new_top = system.global_queue.peek_top()
    print(f"🏆 NEW TOP TASK: {new_top.task_id} (Score: {new_top.compute_priority():.4f})")
    print(f"Reason: {new_top.get_priority_reason()}")

    print("\n" + "="*70)
    print(" 🔥 SIMULATING EMERGENCY OVERRIDE")
    print("="*70)
    # Pick the lowest-priority task and make it an emergency
    low_priority_task = system.global_queue.get_sorted_tasks()[-1]
    print(f"Targeting bottom task {low_priority_task.task_id} (Score: {low_priority_task.compute_priority():.4f})")

    system.update_task(low_priority_task.task_id, {"emergency_override": True})

    emergency_top = system.global_queue.peek_top()
    print(f"🚨 CURRENT TOP TASK: {emergency_top.task_id} (Score: {emergency_top.compute_priority()})")
    print(f"Reason: {emergency_top.get_priority_reason()}")

    print("\n" + "="*70)
    print(" 👷 PROCESSING TASKS BY CONTRACTOR (AlphaRepairs)")
    print("="*70)
    alpha_q = system.contractor_queues.get("AlphaRepairs")
    if alpha_q:
        c_tasks = alpha_q.get_sorted_tasks()[:5]
        for t in c_tasks:
            print(f"[{t.compute_priority():.4f}] {t.task_id} | {t.get_priority_reason()}")

if __name__ == "__main__":
    run_simulation()
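The composite formula in `CivicTask.compute_priority` can be checked with a small worked example. The weights (0.6/0.2/0.1/0.1) come from the docstring above; the input values here are made up for illustration:

```python
# A pothole with severity 0.8, 15 days pending against a 10-day SLA,
# escalated, and reopened twice (hypothetical values).
severity, days_pending, sla_days = 0.8, 15, 10
is_escalated, reopen_count = True, 2

# SLA breach factor: fraction of the SLA window overrun, capped at 1.0
sla_breach = 0.0 if days_pending <= sla_days else min(1.0, (days_pending - sla_days) / sla_days)
escalation = 1.0 if is_escalated else 0.0
reopen = min(1.0, reopen_count / 3.0)

priority = severity * 0.6 + sla_breach * 0.2 + escalation * 0.1 + reopen * 0.1
print(round(priority, 4))  # 0.7467
```

Note how the severity term (0.48) dominates while the SLA breach (0.10), escalation (0.10), and reopen (~0.067) terms nudge the score upward.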
requirements.txt ADDED

@@ -0,0 +1,5 @@
numpy
pandas
scikit-learn
xgboost
joblib
severity_model.json ADDED

The diff for this file is too large to render.
severity_model_pipeline.py ADDED

@@ -0,0 +1,550 @@
| 1 |
+
"""
|
| 2 |
+
=============================================================================
|
| 3 |
+
CIVIC ISSUE DETECTION β POTHOLE SEVERITY SCORING PIPELINE
|
| 4 |
+
=============================================================================
|
| 5 |
+
Produces a trained XGBoost regression model that predicts severity S β [0,1]
|
| 6 |
+
from 10 engineered features derived from a civic-issue detection system.
|
| 7 |
+
|
| 8 |
+
Pipeline Stages
|
| 9 |
+
---------------
|
| 10 |
+
1. Synthetic dataset generation (10 000 samples, realistic distributions)
|
| 11 |
+
2. Ground-truth severity formula (weighted sum + infrastructure boost + noise)
|
| 12 |
+
3. Model training (XGBoost Regressor, 80/20 split)
|
| 13 |
+
4. Evaluation (RMSE, MAE, RΒ²)
|
| 14 |
+
5. Interpretability (SHAP summary + top-feature analysis)
|
| 15 |
+
6. Artefact export (severity_model.json, scaler, feature list)
|
| 16 |
+
7. Inference function (predict_severity β score + label)
|
| 17 |
+
=============================================================================
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
# Imports
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
import json
|
| 24 |
+
import os
|
| 25 |
+
import warnings
|
| 26 |
+
|
| 27 |
+
import matplotlib.pyplot as plt
|
| 28 |
+
import numpy as np
|
| 29 |
+
import pandas as pd
|
| 30 |
+
import shap
|
| 31 |
+
import xgboost as xgb
|
| 32 |
+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
| 33 |
+
from sklearn.model_selection import train_test_split
|
| 34 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 35 |
+
import joblib
|
| 36 |
+
|
| 37 |
+
warnings.filterwarnings("ignore")
|
| 38 |
+
|
| 39 |
+
# Ensure reproducible results
|
| 40 |
+
RANDOM_SEED = 42
|
| 41 |
+
np.random.seed(RANDOM_SEED)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# =============================================================================
# STEP 1 — GENERATE SYNTHETIC DATASET
# =============================================================================

def generate_synthetic_dataset(n_samples: int = 10_000, seed: int = RANDOM_SEED) -> pd.DataFrame:
    """
    Generate a synthetic dataset with realistic feature distributions for
    pothole severity modelling.

    Feature definitions (all in [0, 1]):
        A → defect area ratio
        D → defect density
        C → centrality (closeness to road centre)
        Q → detection confidence
        M → multi-user confirmation score
        T → temporal persistence
        R → traffic importance (road hierarchy)
        P → proximity to critical infrastructure
        F → recurrence frequency
        X → resolution failure score
    """
    rng = np.random.default_rng(seed)

    n = n_samples

    # A: skewed small (most potholes are small) → Beta(2, 8)
    A = rng.beta(2, 8, n)

    # D: low-to-moderate, sparse → Beta(1.5, 6)
    D = rng.beta(1.5, 6, n)

    # C: uniform (a pothole can be anywhere laterally) → Uniform(0, 1)
    C = rng.uniform(0, 1, n)

    # Q: high-biased (confident detections) → Beta(8, 2)
    Q = rng.beta(8, 2, n)

    # M: sparse confirmations → exponential-like via Beta(1.2, 8)
    M = rng.beta(1.2, 8, n)

    # T: right-skewed (few very old issues) → Beta(1.5, 5)
    T = rng.beta(1.5, 5, n)

    # R: categorical road hierarchy mapped to numeric
    road_types = rng.choice(
        [1.0, 0.7, 0.4],       # highway, main road, local street
        size=n,
        p=[0.10, 0.35, 0.55],  # realistic road-type proportions
    )
    R = road_types.astype(float)

    # P: mostly low, a few high → Beta(1, 10)
    P = rng.beta(1, 10, n)

    # F: low recurrence frequency → Beta(1.2, 9)
    F = rng.beta(1.2, 9, n)

    # X: very low resolution-failure rate → Beta(1, 15)
    X = rng.beta(1, 15, n)

    df = pd.DataFrame({
        "A": A,
        "D": D,
        "C": C,
        "Q": Q,
        "M": M,
        "T": T,
        "R": R,
        "P": P,
        "F": F,
        "X": X,
    })

    return df

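The Beta shapes chosen above can be sanity-checked without running the full generator. A minimal sketch (NumPy only; the sample size and seed are arbitrary) confirming that Beta(2, 8) is skewed small, with mean α/(α+β) = 0.2, while Beta(8, 2) is skewed large:

```python
import numpy as np

rng = np.random.default_rng(0)

# Same shapes the generator uses for A (area ratio) and Q (detection confidence)
A = rng.beta(2, 8, 100_000)   # skewed small: theoretical mean 2/(2+8) = 0.2
Q = rng.beta(8, 2, 100_000)   # skewed large: theoretical mean 8/(8+2) = 0.8

print(round(float(A.mean()), 2))          # ≈ 0.2
print(round(float(Q.mean()), 2))          # ≈ 0.8
print(bool(((A >= 0) & (A <= 1)).all()))  # True: Beta support is [0, 1]
```

Because every Beta draw already lies in [0, 1], the generated features need no extra clipping before entering the severity formula.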
# =============================================================================
# STEP 2 — GROUND-TRUTH SEVERITY FORMULA
# =============================================================================

def compute_severity(df: pd.DataFrame, noise_std: float = 0.03, seed: int = RANDOM_SEED) -> pd.Series:
    """
    Compute ground-truth severity scores.

    Formula
    -------
    S_base = 0.28A + 0.10D + 0.14C + 0.04Q +
             0.08M + 0.07T + 0.09R + 0.10P +
             0.06F + 0.04X

    K = 1 + 0.5 * P   (infrastructure proximity multiplier)

    S = clamp(S_base * K + noise, 0, 1)
    """
    rng = np.random.default_rng(seed)

    # Weighted severity base
    S_base = (
        0.28 * df["A"] +
        0.10 * df["D"] +
        0.14 * df["C"] +
        0.04 * df["Q"] +
        0.08 * df["M"] +
        0.07 * df["T"] +
        0.09 * df["R"] +
        0.10 * df["P"] +
        0.06 * df["F"] +
        0.04 * df["X"]
    )

    # Critical-infrastructure proximity multiplier
    K = 1 + 0.5 * df["P"]

    # Boosted severity
    S_raw = S_base * K

    # Add Gaussian noise, clamp to [0, 1]
    noise = rng.normal(loc=0, scale=noise_std, size=len(df))
    S = np.clip(S_raw + noise, 0, 1)

    return pd.Series(S, name="severity", index=df.index)

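The weights in compute_severity sum to exactly 1.00, so the noise-free score can only exceed 1 through the K multiplier, which the final clamp then absorbs. A minimal pure-Python sketch of the same arithmetic (weights copied from the function above; severity_noise_free is an illustrative name, not part of the pipeline):

```python
# Weights from compute_severity, in feature order A, D, C, Q, M, T, R, P, F, X
WEIGHTS = [0.28, 0.10, 0.14, 0.04, 0.08, 0.07, 0.09, 0.10, 0.06, 0.04]

def severity_noise_free(values):
    """Noise-free severity: weighted sum, boosted by P, clamped to [0, 1]."""
    s_base = sum(w * v for w, v in zip(WEIGHTS, values))
    k = 1 + 0.5 * values[7]          # values[7] is P, the proximity feature
    return min(max(s_base * k, 0.0), 1.0)

print(round(sum(WEIGHTS), 2))           # 1.0 (the weights form a convex combination)
print(severity_noise_free([1.0] * 10))  # 1.0 (1.0 * 1.5, clamped at the top)
print(severity_noise_free([0.0] * 10))  # 0.0
```

The convex-combination property keeps most noise-free scores inside [0, 1] on its own; the clamp matters mainly for high-P rows and for the additive noise.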
# =============================================================================
# STEP 3 — TRAIN XGBOOST MODEL
# =============================================================================

FEATURE_COLS = ["A", "D", "C", "Q", "M", "T", "R", "P", "F", "X"]


def build_and_train_model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    seed: int = RANDOM_SEED,
) -> xgb.XGBRegressor:
    """
    Instantiate and train an XGBoost Regressor on the training split.

    Hyperparameters are fixed as specified; no tuning loop is performed here
    (wrap with GridSearchCV / Optuna for production hyperparameter search).
    """
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=seed,
        verbosity=0,
        n_jobs=-1,
    )

    print("━━ Training XGBoost Regressor …")
    model.fit(X_train, y_train)
    print("   Training complete.\n")
    return model

# =============================================================================
# STEP 4 — EVALUATION
# =============================================================================

def evaluate_model(
    model: xgb.XGBRegressor,
    X_test: np.ndarray,
    y_test: np.ndarray,
    feature_names: list[str],
) -> dict:
    """
    Compute RMSE, MAE, R² and print feature importance ranking.
    Returns a dict of metric values.
    """
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("=" * 50)
    print("  MODEL EVALUATION METRICS")
    print("=" * 50)
    print(f"  RMSE : {rmse:.6f}")
    print(f"  MAE  : {mae:.6f}")
    print(f"  R²   : {r2:.6f}")
    print("=" * 50)

    # Feature importance (gain-based)
    importances = model.feature_importances_
    importance_df = (
        pd.DataFrame({"Feature": feature_names, "Importance": importances})
        .sort_values("Importance", ascending=False)
        .reset_index(drop=True)
    )

    print("\n  FEATURE IMPORTANCE RANKING (gain)")
    print("  " + "-" * 36)
    for _, row in importance_df.iterrows():
        bar = "█" * int(row["Importance"] * 100)
        print(f"  {row['Feature']:>3}  {row['Importance']:.4f}  {bar}")
    print()

    return {"rmse": rmse, "mae": mae, "r2": r2, "importance": importance_df}

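The three metrics reported by evaluate_model can be reproduced from their definitions; a minimal NumPy sketch on a toy vector (the values are made up for illustration):

```python
import numpy as np

y_true = np.array([0.2, 0.5, 0.8])
y_pred = np.array([0.3, 0.5, 0.6])

err = y_pred - y_true
rmse = float(np.sqrt(np.mean(err ** 2)))        # root mean squared error
mae = float(np.mean(np.abs(err)))               # mean absolute error
ss_res = float(np.sum(err ** 2))                # residual sum of squares
ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
r2 = 1.0 - ss_res / ss_tot                      # coefficient of determination

print(round(rmse, 4))  # 0.1291
print(round(mae, 4))   # 0.1
print(round(r2, 4))    # 0.7222
```

This matches what the sklearn helpers compute; the pipeline uses them only for brevity.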
# =============================================================================
# STEP 5 — SHAP INTERPRETABILITY
# =============================================================================

def run_shap_analysis(
    model: xgb.XGBRegressor,
    X_test: np.ndarray,
    feature_names: list[str],
    output_dir: str = ".",
) -> None:
    """
    Generate SHAP summary plots and print the mean |SHAP| feature ranking.
    Verifies that A, C, P dominate the explanation.
    """
    print("━━ Running SHAP analysis …")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # ── Summary bar plot ──────────────────────────────────────────────────
    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=feature_names,
        plot_type="bar",
        show=False,
    )
    plt.title("SHAP Feature Importance — Mean |SHAP value|", fontsize=14, fontweight="bold")
    plt.tight_layout()
    bar_path = os.path.join(output_dir, "shap_bar_plot.png")
    plt.savefig(bar_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"   Saved: {bar_path}")

    # ── Beeswarm / dot summary plot ───────────────────────────────────────
    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=feature_names,
        show=False,
    )
    plt.title("SHAP Summary Plot — Impact on Severity Score", fontsize=14, fontweight="bold")
    plt.tight_layout()
    dot_path = os.path.join(output_dir, "shap_dot_plot.png")
    plt.savefig(dot_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"   Saved: {dot_path}\n")

    # ── Mean |SHAP| ranking ───────────────────────────────────────────────
    mean_shap = np.abs(shap_values).mean(axis=0)
    shap_df = (
        pd.DataFrame({"Feature": feature_names, "Mean|SHAP|": mean_shap})
        .sort_values("Mean|SHAP|", ascending=False)
        .reset_index(drop=True)
    )

    print("  SHAP MEAN |VALUE| RANKING")
    print("  " + "-" * 36)
    top3 = shap_df["Feature"].head(3).tolist()
    for rank, (_, row) in enumerate(shap_df.iterrows(), start=1):
        tag = "  ← dominant" if row["Feature"] in ["A", "C", "P"] else ""
        print(f"  #{rank:<2} {row['Feature']:>3}  {row['Mean|SHAP|']:.5f}{tag}")
    print()

    # Verify dominance of A, C, P
    expected_dominant = {"A", "C", "P"}
    actual_top3 = set(top3)
    overlap = expected_dominant & actual_top3
    if len(overlap) >= 2:
        print(f"  ✅ Dominance check PASSED — {overlap} appear in top-3 SHAP features.")
    else:
        print(f"  ⚠️ Dominance check NOTE — top-3 are {top3}; "
              "model learned different patterns from the data.")
    print()

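The mean-|SHAP| ranking printed above is just a column-wise average of absolute attributions. A minimal sketch on a hand-made 2×3 attribution matrix (a stand-in for the real shap_values array; the numbers are invented):

```python
import numpy as np

feature_names = ["A", "C", "P"]

# Toy attribution matrix: rows = samples, columns = features
shap_values = np.array([
    [ 0.20, -0.05, 0.10],
    [-0.10,  0.02, 0.15],
])

mean_abs = np.abs(shap_values).mean(axis=0)                  # per-feature mean |SHAP|
ranking = [feature_names[i] for i in np.argsort(mean_abs)[::-1]]

print([round(float(v), 3) for v in mean_abs])  # [0.15, 0.035, 0.125]
print(ranking)                                 # ['A', 'P', 'C']
```

Taking the absolute value first is the point: a feature that pushes predictions strongly in both directions still ranks as important.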
# =============================================================================
# STEP 6 — SAVE MODEL & ARTEFACTS
# =============================================================================

def save_artefacts(
    model: xgb.XGBRegressor,
    scaler: MinMaxScaler | None,
    feature_names: list[str],
    output_dir: str = ".",
) -> None:
    """
    Export:
        severity_model.json  → XGBoost model (native JSON format)
        feature_scaler.pkl   → fitted MinMaxScaler (or None sentinel)
        feature_list.json    → ordered list of feature names
    """
    os.makedirs(output_dir, exist_ok=True)

    # XGBoost native JSON
    model_path = os.path.join(output_dir, "severity_model.json")
    model.save_model(model_path)
    print(f"━━ Model saved: {model_path}")

    # Scaler
    scaler_path = os.path.join(output_dir, "feature_scaler.pkl")
    joblib.dump(scaler, scaler_path)
    print(f"━━ Scaler saved: {scaler_path}")

    # Feature list
    feature_path = os.path.join(output_dir, "feature_list.json")
    with open(feature_path, "w") as fp:
        json.dump(feature_names, fp, indent=2)
    print(f"━━ Feature list saved: {feature_path}\n")

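Of the three artefacts, only the feature list needs nothing beyond the standard library, so its round trip is easy to check in isolation (the temporary directory below is illustrative; the real pipeline writes next to the script):

```python
import json
import os
import tempfile

feature_names = ["A", "D", "C", "Q", "M", "T", "R", "P", "F", "X"]

# Write and re-read feature_list.json the same way save_artefacts does
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "feature_list.json")
    with open(path, "w") as fp:
        json.dump(feature_names, fp, indent=2)
    with open(path) as fp:
        loaded = json.load(fp)

print(loaded == feature_names)  # True: names and order survive the round trip
```

Preserving the order matters because inference rebuilds the feature vector by iterating this list.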
# =============================================================================
# STEP 7 — INFERENCE FUNCTION
# =============================================================================

def load_inference_artefacts(
    model_path: str = "severity_model.json",
    scaler_path: str = "feature_scaler.pkl",
    feature_list_path: str = "feature_list.json",
) -> tuple[xgb.XGBRegressor, MinMaxScaler | None, list[str]]:
    """Load saved model, scaler, and feature list for inference."""
    model = xgb.XGBRegressor()
    model.load_model(model_path)

    scaler = joblib.load(scaler_path)

    with open(feature_list_path) as fp:
        feature_names = json.load(fp)

    return model, scaler, feature_names


def _severity_label(score: float) -> str:
    """
    Assign a human-readable label to a numeric severity score.

    Thresholds (domain-tunable):
        Low    : score < 0.33
        Medium : 0.33 ≤ score < 0.66
        High   : score ≥ 0.66
    """
    if score < 0.33:
        return "Low"
    elif score < 0.66:
        return "Medium"
    else:
        return "High"

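The half-open intervals in _severity_label matter at the boundaries: 0.33 already maps to Medium and 0.66 to High. A small standalone sketch of the same logic:

```python
def severity_label(score: float) -> str:
    """Same thresholds as _severity_label: < 0.33 Low, < 0.66 Medium, else High."""
    if score < 0.33:
        return "Low"
    elif score < 0.66:
        return "Medium"
    return "High"

labels = [severity_label(s) for s in (0.0, 0.32, 0.33, 0.65, 0.66, 1.0)]
print(labels)  # ['Low', 'Low', 'Medium', 'Medium', 'High', 'High']
```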
def predict_severity(
    features_dict: dict,
    model: xgb.XGBRegressor,
    scaler: MinMaxScaler | None,
    feature_names: list[str],
) -> dict:
    """
    Predict severity for a single pothole observation.

    Parameters
    ----------
    features_dict : dict
        Keys must match feature_names; values are raw (pre-scaling) floats.
    model : trained XGBRegressor
    scaler : fitted MinMaxScaler (or None if features are already scaled)
    feature_names : ordered list of feature column names

    Returns
    -------
    dict with:
        "score" : float — predicted severity in [0, 1]
        "label" : str — "Low" | "Medium" | "High"
    """
    # Validate input keys
    missing = set(feature_names) - set(features_dict.keys())
    if missing:
        raise ValueError(f"Missing features in input dict: {missing}")

    # Build ordered feature vector
    row = np.array([[features_dict[f] for f in feature_names]], dtype=np.float32)

    # Apply scaler if provided
    if scaler is not None:
        row = scaler.transform(row)

    # Predict and clamp
    raw_score = float(model.predict(row)[0])
    score = float(np.clip(raw_score, 0.0, 1.0))
    label = _severity_label(score)

    return {"score": round(score, 4), "label": label}

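predict_severity fails fast on incomplete inputs before touching the model. That validation step can be exercised on its own (validate_features is an illustrative helper, not part of the pipeline):

```python
FEATURE_COLS = ["A", "D", "C", "Q", "M", "T", "R", "P", "F", "X"]

def validate_features(features_dict, feature_names=FEATURE_COLS):
    """Raise ValueError if any expected feature is absent; else return an ordered vector."""
    missing = set(feature_names) - set(features_dict)
    if missing:
        raise ValueError(f"Missing features in input dict: {sorted(missing)}")
    return [features_dict[f] for f in feature_names]

row = validate_features(dict.fromkeys(FEATURE_COLS, 0.5))
print(len(row))  # 10: one value per feature, in FEATURE_COLS order

try:
    validate_features({"A": 0.7, "D": 0.2})
    rejected = False
except ValueError:
    rejected = True
print(rejected)  # True: the incomplete dict is refused before any prediction
```

Building the vector from feature_names (rather than from the dict's own order) is what keeps inference aligned with the training-time column order.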
# =============================================================================
# MAIN PIPELINE RUNNER
# =============================================================================

def main(output_dir: str = ".") -> None:
    print("\n" + "=" * 60)
    print("  CIVIC POTHOLE SEVERITY SCORING — FULL ML PIPELINE")
    print("=" * 60 + "\n")

    # ── 1. Generate dataset ──────────────────────────────────────────────
    print("━━ [1/7] Generating synthetic dataset …")
    df = generate_synthetic_dataset(n_samples=10_000)
    y = compute_severity(df)

    # Save the dataset for persistence / user inspection
    full_dataset = df.copy()
    full_dataset["severity"] = y
    dataset_path = os.path.join(output_dir, "synthetic_pothole_data.csv")
    full_dataset.to_csv(dataset_path, index=False)

    print(f"   Dataset shape   : {df.shape}")
    print(f"   Dataset saved to: {dataset_path}")
    print(f"   Severity stats  : mean={y.mean():.4f}, std={y.std():.4f}, "
          f"min={y.min():.4f}, max={y.max():.4f}\n")

    # ── 2. Feature scaling ───────────────────────────────────────────────
    print("━━ [2/7] Scaling features (MinMaxScaler) …")
    # NOTE: Features are already in [0, 1] by construction, but we fit a
    # scaler so the inference function can handle raw un-normalised inputs
    # if the production system requires it.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(df[FEATURE_COLS])
    print("   Scaling complete.\n")

    # ── 3. Train / test split ────────────────────────────────────────────
    print("━━ [3/7] Splitting data (80 % train / 20 % test) …")
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.20, random_state=RANDOM_SEED
    )
    print(f"   Train samples : {len(X_train)}")
    print(f"   Test samples  : {len(X_test)}\n")

    # ── 4. Train model ───────────────────────────────────────────────────
    print("━━ [4/7] Training model …")
    model = build_and_train_model(X_train, y_train)

    # ── 5. Evaluate ──────────────────────────────────────────────────────
    print("━━ [5/7] Evaluating model …\n")
    metrics = evaluate_model(model, X_test, y_test, FEATURE_COLS)

    # ── 6. SHAP ──────────────────────────────────────────────────────────
    print("━━ [6/7] SHAP interpretability …\n")
    run_shap_analysis(model, X_test, FEATURE_COLS, output_dir=output_dir)

    # ── 7. Save artefacts ────────────────────────────────────────────────
    print("━━ [7/7] Saving model artefacts …")
    save_artefacts(model, scaler, FEATURE_COLS, output_dir=output_dir)

    # ── Sample predictions ───────────────────────────────────────────────
    print("=" * 60)
    print("  SAMPLE PREDICTIONS")
    print("=" * 60)

    sample_cases = [
        {
            "name": "Minor Local-Street Pothole",
            "features": dict(zip(FEATURE_COLS,
                [0.05, 0.08, 0.30, 0.90, 0.05, 0.10, 0.40, 0.02, 0.03, 0.01])),
        },
        {
            "name": "Moderate Main-Road Pothole",
            "features": dict(zip(FEATURE_COLS,
                [0.25, 0.20, 0.55, 0.75, 0.35, 0.40, 0.70, 0.15, 0.20, 0.10])),
        },
        {
            "name": "Severe Highway near Hospital",
            "features": dict(zip(FEATURE_COLS,
                [0.70, 0.55, 0.85, 0.95, 0.80, 0.75, 1.00, 0.90, 0.65, 0.40])),
        },
        {
            "name": "Recurring Pothole (high reopen)",
            "features": dict(zip(FEATURE_COLS,
                [0.40, 0.35, 0.60, 0.80, 0.50, 0.85, 0.70, 0.30, 0.75, 0.80])),
        },
    ]

    for case in sample_cases:
        result = predict_severity(
            features_dict=case["features"],
            model=model,
            scaler=scaler,
            feature_names=FEATURE_COLS,
        )
        print(f"\n  📍 {case['name']}")
        feature_str = ", ".join(f"{k}={v}" for k, v in case["features"].items())
        print(f"     Features : {feature_str}")
        print(f"     Score    : {result['score']:.4f}")
        print(f"     Label    : {result['label']}")

    print("\n" + "=" * 60)
    print("  PIPELINE COMPLETE")
    print(f"  Output artefacts → {os.path.abspath(output_dir)}")
    print("=" * 60 + "\n")


if __name__ == "__main__":
    # Output directory for all saved files (same folder as this script)
    OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
    main(output_dir=OUTPUT_DIR)
shap_bar_plot.png
ADDED

shap_dot_plot.png
ADDED (stored via Git LFS)

simulation_output.txt
ADDED (binary file, 5.09 kB)

synthetic_pothole_data.csv
ADDED (diff too large to render)