import os
import tempfile

import pandas as pd
import joblib

from features.log_feature_extraction import run_pipeline

MODEL_PATH = "models/failure_model.pkl"
FEATURE_PATH = "models/feature_columns.pkl"

# Loaded once at import time so repeated predict_logs() calls reuse them.
model = joblib.load(MODEL_PATH)
feature_cols = joblib.load(FEATURE_PATH)


def predict_logs(log_file):
    """Extract features from *log_file* and score per-module failure risk.

    Runs the feature-extraction pipeline, scores each log row with the
    loaded model, then aggregates mean failure probability per module.

    Parameters
    ----------
    log_file : str
        Path to the raw log file to analyze.

    Returns
    -------
    dict
        {"summary": {"total_logs", "modules_analyzed"},
         "module_risk": [{"module", "failure_probability", "risk"}, ...]}
        sorted by descending mean failure probability.
        Risk buckets: HIGH > 0.75, MEDIUM > 0.4, LOW otherwise.
    """
    # Write features to a private temp file rather than a fixed shared
    # name ("temp_features.csv"), so concurrent calls cannot clobber
    # each other; always remove it afterwards.
    fd, tmp_path = tempfile.mkstemp(suffix=".csv")
    os.close(fd)  # run_pipeline/pd.read_csv open it by path
    try:
        run_pipeline(log_file, tmp_path)
        df = pd.read_csv(tmp_path)
    finally:
        os.remove(tmp_path)

    # Align to the training feature set in one operation: missing columns
    # are filled with 0, extras dropped, and column order matches
    # feature_cols exactly (avoids per-column insertion/fragmentation).
    X = df.reindex(columns=feature_cols, fill_value=0)

    # Probability of the positive (failure) class for each log row.
    df["failure_probability"] = model.predict_proba(X)[:, 1]

    # Mean failure probability per module, riskiest first.
    module_risk = (
        df.groupby("module")["failure_probability"]
        .mean()
        .sort_values(ascending=False)
    )

    results = []
    for module, prob in module_risk.items():
        if prob > 0.75:
            risk = "HIGH"
        elif prob > 0.4:
            risk = "MEDIUM"
        else:
            risk = "LOW"
        results.append({
            "module": module,
            "failure_probability": float(prob),
            "risk": risk,
        })

    return {
        "summary": {
            "total_logs": int(len(df)),
            "modules_analyzed": int(len(results)),
        },
        "module_risk": results,
    }