"""
hmc_learner.py – Hamiltonian Monte Carlo offline learner for ARF.
Trains a hierarchical Bayesian model on historical incidents to produce:
- Posterior coefficients for interpretable risk factors
- A fast approximator (logistic regression) for real-time use
- Feature importance for weighting semantic embeddings
"""

import json
import os
import sqlite3
from datetime import datetime
from typing import Any, Dict, Optional, Tuple

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm

def fetch_incident_data(db_path: str) -> pd.DataFrame:
    """Query the incidents table and return evaluated rows as a DataFrame.

    Args:
        db_path: Path to the SQLite database holding the ``incidents`` table.

    Returns:
        DataFrame with one row per incident whose ``allowed`` flag is set
        (un-evaluated incidents, where ``allowed`` is NULL, are excluded).
    """
    # Ensure we select the new columns; if missing, they will be NULL
    query = """
        SELECT action, risk_score, risk_level, confidence, allowed,
               environment, user_role, requires_human, rollback_feasible,
               hour_of_day, action_category, timestamp
        FROM incidents
        WHERE allowed IS NOT NULL  -- only include evaluated incidents
    """
    conn = sqlite3.connect(db_path)
    try:
        # try/finally guarantees the connection is closed even when the
        # query raises (the original leaked the handle on error).
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()

def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[int, str]]:
    """Convert categorical variables to numeric codes and create features.

    Args:
        df: Raw incidents frame as returned by :func:`fetch_incident_data`.

    Returns:
        A ``(frame, mapping)`` tuple: the engineered feature frame and a
        ``{code: category_name}`` mapping for interpreting the hierarchical
        intercepts. (The original annotation claimed a bare DataFrame.)
    """
    # Work on a copy so the caller's DataFrame is not mutated in place.
    df = df.copy()

    # Ensure allowed is integer (0/1)
    df['allowed'] = df['allowed'].astype(int)

    # Map environment to numeric (production=1, everything else=0)
    df['env_prod'] = (df['environment'] == 'production').astype(int)

    # Map user_role to numeric (junior=1, else 0)
    df['role_junior'] = (df['user_role'] == 'junior').astype(int)

    # action_category as categorical codes; convert once and reuse for the
    # code->name mapping (the original converted twice).
    action_cat = df['action_category'].astype('category')
    df['action_cat_code'] = action_cat.cat.codes
    cat_mapping = dict(enumerate(action_cat.cat.categories))

    # risk_score as is (continuous)
    df['risk_score'] = df['risk_score'].astype(float)

    # hour_of_day as cyclic? For simplicity, keep linear for now
    df['hour'] = df['hour_of_day'].astype(float)

    # confidence
    df['confidence'] = df['confidence'].astype(float)

    return df, cat_mapping

def build_model(df: pd.DataFrame):
    """Build hierarchical logistic regression model using PyMC.

    Models P(allowed=1) with a partially-pooled intercept per action
    category and fixed-effect slopes for environment, role, risk score,
    hour and confidence. Returns the ``(model, trace)`` pair produced by
    NUTS sampling.
    """
    coords = {
        # One intercept dimension entry per observed action-category code.
        "action_cat": np.unique(df['action_cat_code']),
    }
    with pm.Model(coords=coords) as model:
        # Hierarchical intercepts for action categories:
        # category intercepts shrink toward a shared mean α_mu with
        # learned spread α_sigma (partial pooling).
        α_mu = pm.Normal("α_mu", mu=0, sigma=1)
        α_sigma = pm.HalfNormal("α_sigma", sigma=1)
        α_cat = pm.Normal("α_cat", mu=α_mu, sigma=α_sigma, dims="action_cat")

        # Coefficients for fixed effects (weakly-informative N(0, 1) priors)
        β_env = pm.Normal("β_env", mu=0, sigma=1)
        β_role = pm.Normal("β_role", mu=0, sigma=1)
        β_risk = pm.Normal("β_risk", mu=0, sigma=1)
        β_hour = pm.Normal("β_hour", mu=0, sigma=1)
        β_conf = pm.Normal("β_conf", mu=0, sigma=1)

        # Linear predictor; continuous inputs are centered/scaled inline so
        # the intercepts are interpretable at "typical" feature values.
        logit_p = (α_cat[df['action_cat_code'].values] +
                   β_env * df['env_prod'].values +
                   β_role * df['role_junior'].values +
                   β_risk * (df['risk_score'].values - 0.5) +  # center risk around 0.5
                   β_hour * (df['hour'].values - 12) / 12 +    # normalize hour to [-1,1]
                   β_conf * (df['confidence'].values - 0.5))   # center confidence

        # Likelihood: observed allow/deny outcomes as Bernoulli trials.
        pm.Bernoulli("success", logit_p=logit_p, observed=df['allowed'].values)

        # Sample using NUTS (HMC); 4 chains x 1000 draws after 1000 tuning steps.
        trace = pm.sample(1000, tune=1000, chains=4, cores=4, return_inferencedata=True, progressbar=False)
    return model, trace

def extract_coefficients(trace, cat_mapping: Dict) -> Dict[str, Any]:
    """Extract posterior means and 95% HDI bounds as JSON-safe floats.

    Args:
        trace: InferenceData returned by :func:`build_model`.
        cat_mapping: ``{code: category_name}`` mapping from preprocessing.

    Returns:
        Dict keyed by variable name with ``mean``/``sd``/``hdi_low``/
        ``hdi_high`` entries, plus an ``action_cat_mapping`` key.
    """
    summary = az.summary(trace, hdi_prob=0.95)
    coeffs: Dict[str, Any] = {}
    for var in summary.index:
        # Cast numpy scalars to builtin floats: json.dump() in save_model()
        # raises TypeError on np.float64, so the original output was
        # unserializable as written.
        coeffs[var] = {
            "mean": float(summary.loc[var, "mean"]),
            "sd": float(summary.loc[var, "sd"]),
            "hdi_low": float(summary.loc[var, "hdi_2.5%"]),
            "hdi_high": float(summary.loc[var, "hdi_97.5%"]),
        }
    # Also store action category mapping
    coeffs["action_cat_mapping"] = cat_mapping
    return coeffs

def save_model(coeffs: Dict, output_dir: str) -> str:
    """Save coefficients to ``<output_dir>/hmc_model.json``.

    Args:
        coeffs: Coefficient dictionary from :func:`extract_coefficients`.
        output_dir: Directory for the model file; created if missing.

    Returns:
        The path of the written JSON file.
    """
    # Create the output directory if needed (the original crashed with
    # FileNotFoundError when it did not exist).
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, "hmc_model.json")
    with open(path, 'w') as f:
        # default=float is a safety net for stray numpy scalars, which the
        # stdlib json encoder cannot serialize directly.
        json.dump(coeffs, f, indent=2, default=float)
    return path

def train_hmc_model(db_path: str, output_dir: str) -> Dict[str, Any]:
    """Run the full offline training pipeline.

    Fetches incident history from the database, fits the hierarchical
    HMC model, persists the posterior coefficients to disk, and returns
    them.

    Raises:
        ValueError: If fewer than 10 evaluated incidents are available.
    """
    incidents = fetch_incident_data(db_path)

    # Guard clause: a Bayesian fit on a handful of rows is meaningless.
    if len(incidents) < 10:
        raise ValueError("Insufficient data for training (need at least 10 incidents)")

    processed, mapping = preprocess_data(incidents)
    _, posterior = build_model(processed)
    coefficients = extract_coefficients(posterior, mapping)
    save_model(coefficients, output_dir)
    return coefficients