petter2025 committed on
Commit
f836001
·
verified ·
1 Parent(s): 9460a0e

Create hmc_learner.py

Browse files
Files changed (1) hide show
  1. hmc_learner.py +126 -0
hmc_learner.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ hmc_learner.py – Hamiltonian Monte Carlo offline learner for ARF.
3
+ Trains a hierarchical Bayesian model on historical incidents to produce:
4
+ - Posterior coefficients for interpretable risk factors
5
+ - A fast approximator (logistic regression) for real-time use
6
+ - Feature importance for weighting semantic embeddings
7
+ """
8
+
9
+ import json
10
+ import os
11
+ import sqlite3
12
+ import numpy as np
13
+ import pandas as pd
14
+ import pymc as pm
15
+ import arviz as az
16
+ from datetime import datetime
17
+ from typing import Dict, Any, Optional
18
+
19
+ def fetch_incident_data(db_path: str) -> pd.DataFrame:
20
+ """Query the incidents table and return as DataFrame with all columns."""
21
+ conn = sqlite3.connect(db_path)
22
+ # Ensure we select the new columns; if missing, they will be NULL
23
+ query = """
24
+ SELECT action, risk_score, risk_level, confidence, allowed,
25
+ environment, user_role, requires_human, rollback_feasible,
26
+ hour_of_day, action_category, timestamp
27
+ FROM incidents
28
+ WHERE allowed IS NOT NULL -- only include evaluated incidents
29
+ """
30
+ df = pd.read_sql_query(query, conn)
31
+ conn.close()
32
+ return df
33
+
34
+ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
35
+ """Convert categorical variables to numeric codes and create features."""
36
+ # Ensure allowed is integer (0/1)
37
+ df['allowed'] = df['allowed'].astype(int)
38
+
39
+ # Map environment to numeric (e.g., production=1, staging=0, dev=0)
40
+ df['env_prod'] = (df['environment'] == 'production').astype(int)
41
+
42
+ # Map user_role to numeric (junior=1, else 0)
43
+ df['role_junior'] = (df['user_role'] == 'junior').astype(int)
44
+
45
+ # action_category as categorical codes
46
+ df['action_cat_code'] = df['action_category'].astype('category').cat.codes
47
+ # Keep mapping for interpretation
48
+ cat_mapping = dict(enumerate(df['action_category'].astype('category').cat.categories))
49
+
50
+ # risk_score as is (continuous)
51
+ df['risk_score'] = df['risk_score'].astype(float)
52
+
53
+ # hour_of_day as cyclic? For simplicity, keep linear for now
54
+ df['hour'] = df['hour_of_day'].astype(float)
55
+
56
+ # confidence
57
+ df['confidence'] = df['confidence'].astype(float)
58
+
59
+ return df, cat_mapping
60
+
61
+ def build_model(df: pd.DataFrame):
62
+ """Build hierarchical logistic regression model using PyMC."""
63
+ coords = {
64
+ "action_cat": np.unique(df['action_cat_code']),
65
+ }
66
+ with pm.Model(coords=coords) as model:
67
+ # Hierarchical intercepts for action categories
68
+ α_mu = pm.Normal("α_mu", mu=0, sigma=1)
69
+ α_sigma = pm.HalfNormal("α_sigma", sigma=1)
70
+ α_cat = pm.Normal("α_cat", mu=α_mu, sigma=α_sigma, dims="action_cat")
71
+
72
+ # Coefficients for fixed effects
73
+ β_env = pm.Normal("β_env", mu=0, sigma=1)
74
+ β_role = pm.Normal("β_role", mu=0, sigma=1)
75
+ β_risk = pm.Normal("β_risk", mu=0, sigma=1)
76
+ β_hour = pm.Normal("β_hour", mu=0, sigma=1)
77
+ β_conf = pm.Normal("β_conf", mu=0, sigma=1)
78
+
79
+ # Linear predictor
80
+ logit_p = (α_cat[df['action_cat_code'].values] +
81
+ β_env * df['env_prod'].values +
82
+ β_role * df['role_junior'].values +
83
+ β_risk * (df['risk_score'].values - 0.5) + # center risk around 0.5
84
+ β_hour * (df['hour'].values - 12) / 12 + # normalize hour to [-1,1]
85
+ β_conf * (df['confidence'].values - 0.5)) # center confidence
86
+
87
+ # Likelihood
88
+ pm.Bernoulli("success", logit_p=logit_p, observed=df['allowed'].values)
89
+
90
+ # Sample using NUTS (HMC)
91
+ trace = pm.sample(1000, tune=1000, chains=4, cores=4, return_inferencedata=True, progressbar=False)
92
+ return model, trace
93
+
94
+ def extract_coefficients(trace, cat_mapping: Dict) -> Dict[str, Any]:
95
+ """Extract posterior means and 95% intervals."""
96
+ summary = az.summary(trace, hdi_prob=0.95)
97
+ coeffs = {}
98
+ for var in summary.index:
99
+ coeffs[var] = {
100
+ "mean": summary.loc[var, "mean"],
101
+ "sd": summary.loc[var, "sd"],
102
+ "hdi_low": summary.loc[var, "hdi_2.5%"],
103
+ "hdi_high": summary.loc[var, "hdi_97.5%"],
104
+ }
105
+ # Also store action category mapping
106
+ coeffs["action_cat_mapping"] = cat_mapping
107
+ return coeffs
108
+
109
+ def save_model(coeffs: Dict, output_dir: str):
110
+ """Save coefficients to JSON file."""
111
+ path = os.path.join(output_dir, "hmc_model.json")
112
+ with open(path, 'w') as f:
113
+ json.dump(coeffs, f, indent=2)
114
+ return path
115
+
116
+ def train_hmc_model(db_path: str, output_dir: str) -> Dict[str, Any]:
117
+ """Main training routine: fetch data, build model, save coefficients."""
118
+ df = fetch_incident_data(db_path)
119
+ if len(df) < 10:
120
+ raise ValueError("Insufficient data for training (need at least 10 incidents)")
121
+
122
+ df_proc, cat_mapping = preprocess_data(df)
123
+ model, trace = build_model(df_proc)
124
+ coeffs = extract_coefficients(trace, cat_mapping)
125
+ save_model(coeffs, output_dir)
126
+ return coeffs