"""Streamlit app: software bug-risk prediction for an uploaded Python file.

Combines two signals:
1. An ML model (trained on NASA KC1-style metrics; loaded from
   ``bug_predictor_model.pkl`` / ``scaler.pkl``) applied to heuristic
   code metrics extracted from the uploaded file.
2. Simple rule-based static checks that flag individually suspicious lines.

The file is treated as "defect likely" if EITHER signal fires.
"""

import streamlit as st
import joblib
import numpy as np


# 1. Load model and scaler (once, cached across Streamlit reruns)
@st.cache_resource
def load_artifacts():
    """Load the trained model and feature scaler from disk (cached by Streamlit)."""
    model = joblib.load("bug_predictor_model.pkl")
    scaler = joblib.load("scaler.pkl")
    return model, scaler


model, scaler = load_artifacts()

# 2. Feature names in the same order as used during training
feature_names = [
    'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
    'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]


# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """
    Approximate metrics from Python code.

    This is a heuristic approximation, not exact NASA KC1 metrics.
    Returns a list of 21 numbers in the same order as ``feature_names``.
    """
    lines = code.splitlines()

    # Non-empty lines of code
    loc = len([l for l in lines if l.strip()])

    # Basic branching structures (rough cyclomatic-complexity proxy)
    branch_keywords = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(any(kw in l for kw in branch_keywords) for l in lines)

    # Rough proxies for the rest (just to fill the feature vector)
    v_g = branch_count  # cyclomatic complexity approx
    ev_g = max(1, branch_count // 2)
    iv_g = max(1, branch_count // 3)
    n = max(1, loc * 2)
    v = max(1, loc * 3)
    l_metric = 1.0
    d_metric = 1.0
    i_metric = 1.0
    e_metric = float(loc * 10)
    b_metric = float(branch_count)
    t_metric = max(1, loc // 10)
    lOCode = float(loc)
    lOComment = float(len([l for l in lines if l.strip().startswith("#")]))
    lOBlank = float(len([l for l in lines if not l.strip()]))
    locCodeAndComment = float(loc + lOComment)
    uniq_Op = 10.0
    uniq_Opnd = 10.0
    total_Op = float(loc * 2)
    total_Opnd = float(loc * 2)

    return [
        loc, v_g, ev_g, iv_g, n, v, l_metric, d_metric, i_metric,
        e_metric, b_metric, t_metric, lOCode, lOComment, lOBlank,
        locCodeAndComment, uniq_Op, uniq_Opnd, total_Op, total_Opnd,
        branch_count
    ]


# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """
    Rule-based suspicious patterns (NOT ML, just heuristics).

    Returns a list of ``(line_number, line_text, reason)`` tuples,
    with line numbers starting at 1.
    """
    suspicious = []
    lines = code.splitlines()

    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()

        # Bare except
        if stripped.startswith("except:"):
            suspicious.append((idx, line, "Bare 'except:' (too generic)"))

        # eval usage
        if "eval(" in stripped:
            suspicious.append((idx, line, "Use of eval() is risky"))

        # == None instead of is None
        if "== None" in stripped:
            suspicious.append((idx, line, "Use 'is None' instead of '== None'"))

        # TODO/FIXME comments
        if "# TODO" in stripped or "# FIXME" in stripped:
            suspicious.append((idx, line, "TODO/FIXME comment (pending work)"))

        # Very long conditional logic
        if len(stripped) > 100 and ("if " in stripped or "while " in stripped):
            suspicious.append((idx, line, "Very long condition (complex logic)"))

    return suspicious


# 5. Streamlit UI: ONLY Python file upload
st.title("🐍 Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the file; code_text stays None on decode failure
    code_bytes = uploaded_py.read()
    code_text = None
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("❌ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")

    if code_text:
        st.markdown("### 📄 Code Preview")
        st.code(code_text, language="python")

        # Extract approximate metrics from code and get ML prediction
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)
        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]
        if hasattr(model, "predict_proba"):
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines
        suspicious = find_suspicious_lines(code_text)

        # 🔴 HYBRID DECISION:
        # If ML says defect OR we found suspicious lines → treat as defect
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### 🔍 File-level Defect Prediction")
        if is_defect:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section.
        # NOTE: the original rendered this section twice verbatim; it is
        # deliberately rendered only once here.
        st.markdown("### 🧷 Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")