Delete hmc_learner.py
Browse files- hmc_learner.py +0 -130
hmc_learner.py
DELETED
|
@@ -1,130 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
hmc_learner.py – Hamiltonian Monte Carlo offline learner for ARF.
|
| 3 |
-
Trains a hierarchical Bayesian model on historical incidents to produce:
|
| 4 |
-
- Posterior coefficients for interpretable risk factors
|
| 5 |
-
- A fast approximator (logistic regression) for real-time use
|
| 6 |
-
- Feature importance for weighting semantic embeddings
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import json
import os
import sqlite3
from datetime import datetime
from typing import Any, Dict, Optional, Tuple

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
|
| 18 |
-
|
| 19 |
-
def fetch_incident_data(db_path: str) -> pd.DataFrame:
    """Query the incidents table and return it as a DataFrame.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database holding the ``incidents`` table.

    Returns
    -------
    pd.DataFrame
        One row per evaluated incident (rows with NULL ``allowed`` are
        excluded). Columns absent from older schemas come back as NULL.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Ensure we select the new columns; if missing, they will be NULL
        query = """
            SELECT action, risk_score, risk_level, confidence, allowed,
                   environment, user_role, requires_human, rollback_feasible,
                   hour_of_day, action_category, timestamp
            FROM incidents
            WHERE allowed IS NOT NULL -- only include evaluated incidents
        """
        return pd.read_sql_query(query, conn)
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
|
| 33 |
-
|
| 34 |
-
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[int, str]]:
    """Convert categorical variables to numeric codes and create features.

    NOTE: mutates *df* in place (adds feature columns) and returns it.

    Parameters
    ----------
    df : pd.DataFrame
        Raw incident rows as produced by :func:`fetch_incident_data`.

    Returns
    -------
    (df, cat_mapping)
        The feature-augmented frame and a mapping from action-category
        code to its original label, for later interpretation.
        (The original annotation claimed ``-> pd.DataFrame`` but the
        function has always returned this tuple.)
    """
    # Ensure allowed is integer (0/1)
    df['allowed'] = df['allowed'].astype(int)

    # Binary indicator: production environment vs everything else
    df['env_prod'] = (df['environment'] == 'production').astype(int)

    # Binary indicator: junior user vs everything else
    df['role_junior'] = (df['user_role'] == 'junior').astype(int)

    # action_category as categorical codes; build the categorical once and
    # keep the code -> label mapping for interpretation.
    categories = df['action_category'].astype('category')
    df['action_cat_code'] = categories.cat.codes
    cat_mapping = dict(enumerate(categories.cat.categories))

    # risk_score as is (continuous)
    df['risk_score'] = df['risk_score'].astype(float)

    # Cyclical encoding of hour so 23:00 and 00:00 end up close in
    # feature space instead of 23 units apart.
    hours = df['hour_of_day'].values
    df['hour_sin'] = np.sin(2 * np.pi * hours / 24)
    df['hour_cos'] = np.cos(2 * np.pi * hours / 24)

    # confidence
    df['confidence'] = df['confidence'].astype(float)

    return df, cat_mapping
|
| 62 |
-
|
| 63 |
-
def build_model(df: pd.DataFrame):
    """Build hierarchical logistic regression model using PyMC.

    One intercept per action category (partially pooled), plus fixed
    effects for environment, role, risk score, hour-of-day, and
    confidence. Returns the (model, trace) pair after NUTS sampling.
    """
    model_coords = {
        "action_cat": np.unique(df['action_cat_code']),
    }
    cat_idx = df['action_cat_code'].values

    with pm.Model(coords=model_coords) as model:
        # Partial pooling: category intercepts share a population prior.
        α_mu = pm.Normal("α_mu", mu=0, sigma=1)
        α_sigma = pm.HalfNormal("α_sigma", sigma=1)
        α_cat = pm.Normal("α_cat", mu=α_mu, sigma=α_sigma, dims="action_cat")

        # Fixed-effect coefficients with standard-normal priors.
        β_env = pm.Normal("β_env", mu=0, sigma=1)
        β_role = pm.Normal("β_role", mu=0, sigma=1)
        β_risk = pm.Normal("β_risk", mu=0, sigma=1)
        β_hour_sin = pm.Normal("β_hour_sin", mu=0, sigma=1)
        β_hour_cos = pm.Normal("β_hour_cos", mu=0, sigma=1)
        β_conf = pm.Normal("β_conf", mu=0, sigma=1)

        # Linear predictor on the logit scale; risk_score and confidence
        # are centered at 0.5 so the intercepts stay interpretable.
        logit_p = α_cat[cat_idx]
        logit_p = logit_p + β_env * df['env_prod'].values
        logit_p = logit_p + β_role * df['role_junior'].values
        logit_p = logit_p + β_risk * (df['risk_score'].values - 0.5)
        logit_p = logit_p + β_hour_sin * df['hour_sin'].values
        logit_p = logit_p + β_hour_cos * df['hour_cos'].values
        logit_p = logit_p + β_conf * (df['confidence'].values - 0.5)

        # Bernoulli likelihood over the observed allow/deny outcomes.
        pm.Bernoulli("success", logit_p=logit_p, observed=df['allowed'].values)

        # Sample with NUTS (adaptive HMC).
        trace = pm.sample(1000, tune=1000, chains=4, cores=4,
                          return_inferencedata=True, progressbar=False)

    return model, trace
|
| 97 |
-
|
| 98 |
-
def extract_coefficients(trace, cat_mapping: Dict) -> Dict[str, Any]:
    """Extract posterior means and 95% credible intervals from *trace*.

    Parameters
    ----------
    trace
        InferenceData returned by ``pm.sample``.
    cat_mapping : Dict
        Action-category code -> label mapping from :func:`preprocess_data`.

    Returns
    -------
    Dict[str, Any]
        Per-variable {mean, sd, hdi_low, hdi_high} plus the category
        mapping under "action_cat_mapping". Values are cast to plain
        Python floats because az.summary yields numpy scalars, which
        json.dump (used by save_model) rejects with a TypeError.
    """
    summary = az.summary(trace, hdi_prob=0.95)
    coeffs: Dict[str, Any] = {}
    for var in summary.index:
        coeffs[var] = {
            "mean": float(summary.loc[var, "mean"]),
            "sd": float(summary.loc[var, "sd"]),
            "hdi_low": float(summary.loc[var, "hdi_2.5%"]),
            "hdi_high": float(summary.loc[var, "hdi_97.5%"]),
        }
    # Also store action category mapping for interpretation
    coeffs["action_cat_mapping"] = cat_mapping
    return coeffs
|
| 112 |
-
|
| 113 |
-
def save_model(coeffs: Dict, output_dir: str) -> str:
    """Save coefficients to ``hmc_model.json`` inside *output_dir*.

    Creates *output_dir* (and parents) if it does not exist yet, so a
    fresh deployment does not crash with FileNotFoundError.

    Returns
    -------
    str
        Path of the written JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, "hmc_model.json")
    with open(path, 'w') as f:
        json.dump(coeffs, f, indent=2)
    return path
|
| 119 |
-
|
| 120 |
-
def train_hmc_model(db_path: str, output_dir: str) -> Dict[str, Any]:
    """Main training routine: fetch data, build model, save coefficients.

    Parameters
    ----------
    db_path : str
        SQLite database with the ``incidents`` table.
    output_dir : str
        Directory where the coefficient JSON is written.

    Raises
    ------
    ValueError
        When fewer than 10 evaluated incidents are available.
    """
    raw = fetch_incident_data(db_path)
    if len(raw) < 10:
        raise ValueError("Insufficient data for training (need at least 10 incidents)")

    features, cat_mapping = preprocess_data(raw)
    _, trace = build_model(features)
    coeffs = extract_coefficients(trace, cat_mapping)
    save_model(coeffs, output_dir)
    return coeffs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|