Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import joblib | |
| import numpy as np | |
# 1. Load model and scaler (once, cached).
@st.cache_resource
def load_artifacts():
    """Load the trained defect model and feature scaler from disk.

    Decorated with ``st.cache_resource`` so the joblib loads actually run
    once per server process — without the decorator (the original bug),
    both artifacts were re-read from disk on every Streamlit rerun,
    despite the "once, cached" comment.

    Returns:
        tuple: (model, scaler) as deserialized by joblib.
    """
    model = joblib.load("bug_predictor_model.pkl")
    scaler = joblib.load("scaler.pkl")
    return model, scaler


model, scaler = load_artifacts()
# 2. Feature names, in the same order the model was trained on
#    (NASA KC1 / Halstead + McCabe metric columns). Do not reorder:
#    extract_simple_metrics_from_code() returns values in this order.
feature_names = [
    'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
    'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]
# 3. Simple metric extraction from raw Python code.
def extract_simple_metrics_from_code(code: str) -> list:
    """Approximate the 21 NASA KC1-style metrics from Python source text.

    These are crude heuristics (line counts and fixed proxies), not real
    Halstead/McCabe measurements; they exist only to fill the feature
    vector the model expects, in the order of ``feature_names``.

    Args:
        code: Raw Python source text.

    Returns:
        list: 21 numeric values (mix of int and float) matching
        ``feature_names`` order.
    """
    lines = code.splitlines()
    # Non-empty lines count as lines of code.
    loc = len([l for l in lines if l.strip()])
    # Lines containing any branching construct — a rough cyclomatic proxy.
    # NOTE(review): substring match, so comments mentioning e.g. "if " also
    # count; acceptable for a heuristic.
    branch_keywords = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(any(kw in l for kw in branch_keywords) for l in lines)
    # Rough proxies for the remaining metrics (just to fill features).
    v_g = branch_count              # cyclomatic complexity approximation
    ev_g = max(1, branch_count // 2)
    iv_g = max(1, branch_count // 3)
    n = max(1, loc * 2)
    v = max(1, loc * 3)
    l_metric = 1.0
    d_metric = 1.0
    i_metric = 1.0
    e_metric = float(loc * 10)
    b_metric = float(branch_count)
    t_metric = max(1, loc // 10)
    lOCode = float(loc)
    lOComment = float(len([l for l in lines if l.strip().startswith("#")]))
    lOBlank = float(len([l for l in lines if not l.strip()]))
    locCodeAndComment = float(loc + lOComment)
    uniq_Op = 10.0                  # fixed placeholder, not measured
    uniq_Opnd = 10.0                # fixed placeholder, not measured
    total_Op = float(loc * 2)
    total_Opnd = float(loc * 2)
    return [
        loc, v_g, ev_g, iv_g, n, v, l_metric, d_metric, i_metric, e_metric,
        b_metric, t_metric, lOCode, lOComment, lOBlank, locCodeAndComment,
        uniq_Op, uniq_Opnd, total_Op, total_Opnd, branch_count
    ]
# 4. Simple rule-based suspicious line detector.
def find_suspicious_lines(code: str) -> list:
    """Flag lines matching simple static suspicion rules (NOT ML).

    Rules: bare ``except:``, ``eval(`` usage, ``== None`` comparison,
    TODO/FIXME comments, and very long (>100 char) if/while conditions.
    A single line can match several rules and appear more than once.

    Args:
        code: Raw Python source text.

    Returns:
        list: (1-based line number, original line text, reason) tuples,
        in source order.
    """
    suspicious = []
    lines = code.splitlines()
    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Bare except clause swallows every exception type.
        if stripped.startswith("except:"):
            suspicious.append((idx, line, "Bare 'except:' (too generic)"))
        # eval() on arbitrary input is a code-injection risk.
        # NOTE(review): substring match also hits e.g. literal_eval(.
        if "eval(" in stripped:
            suspicious.append((idx, line, "Use of eval() is risky"))
        # PEP 8: identity check against None, not equality.
        if "== None" in stripped:
            suspicious.append((idx, line, "Use 'is None' instead of '== None'"))
        # Pending-work markers.
        if "# TODO" in stripped or "# FIXME" in stripped:
            suspicious.append((idx, line, "TODO/FIXME comment (pending work)"))
        # Overly long conditional logic is hard to reason about.
        if len(stripped) > 100 and ("if " in stripped or "while " in stripped):
            suspicious.append((idx, line, "Very long condition (complex logic)"))
    return suspicious
# 5. Streamlit UI: ONLY Python file upload.
st.title("π Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the uploaded file; reject anything not UTF-8 text.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("β Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text:
        st.markdown("### π Code Preview")
        st.code(code_text, language="python")

        # Approximate metrics -> scaled feature vector -> ML prediction.
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)
        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]
        if hasattr(model, "predict_proba"):
            # Probability of the positive (defective) class.
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines (static heuristics, not ML).
        suspicious = find_suspicious_lines(code_text)

        # HYBRID DECISION: ML predicts defect OR any suspicious line found
        # => treat the file as defective.
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### π File-level Defect Prediction")
        if is_defect:
            st.error("β οΈ Defect Likely")
        else:
            st.success("β No Defect Predicted")
        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section — rendered exactly once. (The original
        # pasted source contained this block twice back-to-back, so every
        # result was displayed twice; the duplicate is removed here.)
        st.markdown("### π§· Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** β {reason}")
                st.code(line_text, language="python")