petter2025 committed on
Commit
8bb8b1e
·
verified ·
1 Parent(s): 0f5f4d1

Delete hmc_learner.py

Browse files
Files changed (1) hide show
  1. hmc_learner.py +0 -130
hmc_learner.py DELETED
@@ -1,130 +0,0 @@
1
- """
2
- hmc_learner.py – Hamiltonian Monte Carlo offline learner for ARF.
3
- Trains a hierarchical Bayesian model on historical incidents to produce:
4
- - Posterior coefficients for interpretable risk factors
5
- - A fast approximator (logistic regression) for real-time use
6
- - Feature importance for weighting semantic embeddings
7
- """
8
-
9
- import json
10
- import os
11
- import sqlite3
12
- import numpy as np
13
- import pandas as pd
14
- import pymc as pm
15
- import arviz as az
16
- from datetime import datetime
17
- from typing import Dict, Any, Optional
18
-
19
def fetch_incident_data(db_path: str) -> pd.DataFrame:
    """Load evaluated incidents from the SQLite database.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file containing the ``incidents`` table.

    Returns
    -------
    pd.DataFrame
        One row per evaluated incident (``allowed IS NOT NULL``). Columns
        absent from older schemas come back as NULL/NaN rather than erroring.
    """
    query = """
    SELECT action, risk_score, risk_level, confidence, allowed,
           environment, user_role, requires_human, rollback_feasible,
           hour_of_day, action_category, timestamp
    FROM incidents
    WHERE allowed IS NOT NULL -- only include evaluated incidents
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close even when the query fails; the original leaked the
        # connection on any read_sql_query exception.
        conn.close()
33
-
34
def preprocess_data(df: pd.DataFrame) -> tuple[pd.DataFrame, dict[int, str]]:
    """Convert categorical variables to numeric codes and engineer features.

    Parameters
    ----------
    df : pd.DataFrame
        Raw incident rows as returned by ``fetch_incident_data``.

    Returns
    -------
    tuple[pd.DataFrame, dict[int, str]]
        The feature-augmented frame and a mapping from action-category code
        to category name (used to interpret posterior coefficients).
        NOTE: the original annotation claimed ``pd.DataFrame`` but the
        function has always returned this 2-tuple.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # Outcome must be 0/1 integers for the Bernoulli likelihood.
    df['allowed'] = df['allowed'].astype(int)

    # Binary indicator: production environment vs. everything else.
    df['env_prod'] = (df['environment'] == 'production').astype(int)

    # Binary indicator: junior operator vs. everything else.
    df['role_junior'] = (df['user_role'] == 'junior').astype(int)

    # Compute the categorical once so the codes and the interpretation
    # mapping are guaranteed to agree (original recomputed it twice).
    action_cat = df['action_category'].astype('category')
    df['action_cat_code'] = action_cat.cat.codes
    cat_mapping = dict(enumerate(action_cat.cat.categories))

    # Continuous predictors, coerced to float.
    df['risk_score'] = df['risk_score'].astype(float)
    df['confidence'] = df['confidence'].astype(float)

    # Cyclical encoding so hour 23 and hour 0 end up close in feature space.
    hours = df['hour_of_day'].values
    df['hour_sin'] = np.sin(2 * np.pi * hours / 24)
    df['hour_cos'] = np.cos(2 * np.pi * hours / 24)

    return df, cat_mapping
62
-
63
def build_model(df: pd.DataFrame, draws: int = 1000, tune: int = 1000,
                chains: int = 4, cores: int = 4):
    """Build and sample a hierarchical logistic regression model with PyMC.

    Action categories get partially-pooled intercepts; environment, role,
    risk score, confidence and a cyclical hour encoding enter as fixed
    effects. Sampling uses NUTS (HMC).

    Parameters
    ----------
    df : pd.DataFrame
        Preprocessed incidents (output of ``preprocess_data``).
    draws, tune, chains, cores : int
        NUTS sampler settings; defaults match the original hard-coded
        values (1000/1000/4/4) so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(model, trace)`` where ``trace`` is an ArviZ InferenceData.
    """
    coords = {
        "action_cat": np.unique(df['action_cat_code']),
    }
    with pm.Model(coords=coords) as model:
        # Hierarchical (partially pooled) intercepts per action category.
        α_mu = pm.Normal("α_mu", mu=0, sigma=1)
        α_sigma = pm.HalfNormal("α_sigma", sigma=1)
        α_cat = pm.Normal("α_cat", mu=α_mu, sigma=α_sigma, dims="action_cat")

        # Weakly-informative priors for the fixed effects.
        β_env = pm.Normal("β_env", mu=0, sigma=1)
        β_role = pm.Normal("β_role", mu=0, sigma=1)
        β_risk = pm.Normal("β_risk", mu=0, sigma=1)
        β_hour_sin = pm.Normal("β_hour_sin", mu=0, sigma=1)
        β_hour_cos = pm.Normal("β_hour_cos", mu=0, sigma=1)
        β_conf = pm.Normal("β_conf", mu=0, sigma=1)

        # Linear predictor; risk_score and confidence are centered at 0.5
        # so the intercepts describe a "typical" incident.
        logit_p = (α_cat[df['action_cat_code'].values] +
                   β_env * df['env_prod'].values +
                   β_role * df['role_junior'].values +
                   β_risk * (df['risk_score'].values - 0.5) +
                   β_hour_sin * df['hour_sin'].values +
                   β_hour_cos * df['hour_cos'].values +
                   β_conf * (df['confidence'].values - 0.5))

        # Bernoulli likelihood on the allow/deny outcome.
        pm.Bernoulli("success", logit_p=logit_p, observed=df['allowed'].values)

        # Sample using NUTS (HMC).
        trace = pm.sample(draws, tune=tune, chains=chains, cores=cores,
                          return_inferencedata=True, progressbar=False)
    return model, trace
97
-
98
def extract_coefficients(trace, cat_mapping: Dict) -> Dict[str, Any]:
    """Extract posterior means and 95% HDIs from a sampled trace.

    Parameters
    ----------
    trace
        ArviZ InferenceData returned by ``build_model``.
    cat_mapping : Dict
        Action-category code -> name mapping from ``preprocess_data``.

    Returns
    -------
    Dict[str, Any]
        Per-variable ``{mean, sd, hdi_low, hdi_high}`` as plain-Python
        floats, plus the ``action_cat_mapping`` entry.
    """
    # hdi_prob=0.95 makes arviz name the interval columns hdi_2.5%/hdi_97.5%.
    summary = az.summary(trace, hdi_prob=0.95)
    coeffs: Dict[str, Any] = {}
    for var in summary.index:
        # Cast to builtin float: pandas/numpy scalars are not
        # JSON-serializable, so save_model's json.dump would fail.
        coeffs[var] = {
            "mean": float(summary.loc[var, "mean"]),
            "sd": float(summary.loc[var, "sd"]),
            "hdi_low": float(summary.loc[var, "hdi_2.5%"]),
            "hdi_high": float(summary.loc[var, "hdi_97.5%"]),
        }
    # Also store action category mapping for downstream interpretation.
    coeffs["action_cat_mapping"] = cat_mapping
    return coeffs
112
-
113
def save_model(coeffs: Dict, output_dir: str) -> str:
    """Save coefficients to ``<output_dir>/hmc_model.json``.

    Parameters
    ----------
    coeffs : Dict
        JSON-serializable coefficient dictionary.
    output_dir : str
        Directory to write into; created if it does not exist (the
        original raised FileNotFoundError on a missing directory).

    Returns
    -------
    str
        Full path of the written JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, "hmc_model.json")
    with open(path, 'w') as f:
        json.dump(coeffs, f, indent=2)
    return path
119
-
120
def train_hmc_model(db_path: str, output_dir: str) -> Dict[str, Any]:
    """End-to-end training pipeline.

    Fetches evaluated incidents, fits the hierarchical HMC model, saves the
    posterior coefficients to disk, and returns them.

    Raises
    ------
    ValueError
        If fewer than 10 evaluated incidents are available.
    """
    incidents = fetch_incident_data(db_path)
    if len(incidents) < 10:
        raise ValueError("Insufficient data for training (need at least 10 incidents)")

    features, category_names = preprocess_data(incidents)
    _, trace = build_model(features)
    posterior = extract_coefficients(trace, category_names)
    save_model(posterior, output_dir)
    return posterior