# app.py — Software Bug Risk Predictor (Streamlit)
# NOTE: removed non-Python Hugging Face page header ("Update app.py", commit d1bbf06)
import streamlit as st
import joblib
import numpy as np
# 1. Load model and scaler (once, cached)
@st.cache_resource
def load_artifacts():
    """Load the trained defect model and its feature scaler from disk.

    Cached by Streamlit so the pickles are read only once per session.
    """
    return (
        joblib.load("bug_predictor_model.pkl"),
        joblib.load("scaler.pkl"),
    )

model, scaler = load_artifacts()
# 2. Feature names in same order as training
# NASA KC1 metric columns; the order here MUST match the column order the
# scaler/model were fit on, and extract_simple_metrics_from_code() returns
# its values in this exact order.
feature_names = [
'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]
# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """Approximate the 21 NASA KC1 static metrics from raw Python source.

    These are rough heuristics (line counts and branch-keyword counts),
    not exact Halstead/McCabe values. The result is a 21-element list
    ordered exactly like ``feature_names``.
    """
    src_lines = code.splitlines()

    # Non-empty lines count as lines of code (comments included)
    loc = sum(1 for ln in src_lines if ln.strip())

    # Lines containing a basic branching construct — cyclomatic proxy
    branch_keywords = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(
        1 for ln in src_lines if any(kw in ln for kw in branch_keywords)
    )

    comment_lines = float(sum(1 for ln in src_lines if ln.strip().startswith("#")))
    blank_lines = float(sum(1 for ln in src_lines if not ln.strip()))

    # Rough proxies for the remaining Halstead-style features,
    # returned in feature_names order.
    return [
        loc,                          # loc
        branch_count,                 # v(g) — cyclomatic complexity approx
        max(1, branch_count // 2),    # ev(g)
        max(1, branch_count // 3),    # iv(g)
        max(1, loc * 2),              # n
        max(1, loc * 3),              # v
        1.0,                          # l
        1.0,                          # d
        1.0,                          # i
        float(loc * 10),              # e
        float(branch_count),          # b
        max(1, loc // 10),            # t
        float(loc),                   # lOCode
        comment_lines,                # lOComment
        blank_lines,                  # lOBlank
        float(loc + comment_lines),   # locCodeAndComment
        10.0,                         # uniq_Op (fixed placeholder)
        10.0,                         # uniq_Opnd (fixed placeholder)
        float(loc * 2),               # total_Op
        float(loc * 2),               # total_Opnd
        branch_count,                 # branchCount
    ]
# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """Flag lines that match simple static heuristics (NOT ML).

    Returns a list of ``(line_number, line_text, reason)`` tuples, in
    source order; one line may be reported more than once if it trips
    several rules.
    """
    findings = []
    for line_no, raw in enumerate(code.splitlines(), start=1):
        text = raw.strip()

        # Bare except clause swallows every exception type
        if text.startswith("except:"):
            findings.append((line_no, raw, "Bare 'except:' (too generic)"))
        # eval() on arbitrary input is a classic injection risk
        if "eval(" in text:
            findings.append((line_no, raw, "Use of eval() is risky"))
        # PEP 8: compare to None with identity, not equality
        if "== None" in text:
            findings.append((line_no, raw, "Use 'is None' instead of '== None'"))
        # Unfinished-work markers
        if "# TODO" in text or "# FIXME" in text:
            findings.append((line_no, raw, "TODO/FIXME comment (pending work)"))
        # Overly long conditionals tend to hide complex logic
        if len(text) > 100 and ("if " in text or "while " in text):
            findings.append((line_no, raw, "Very long condition (complex logic)"))
    return findings
# 5. Streamlit UI: ONLY Python file upload
st.title("🐍 Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the file; reject anything that is not UTF-8 text.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("❌ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text:
        st.markdown("### 📄 Code Preview")
        st.code(code_text, language="python")

        # Extract approximate metrics from code and get ML prediction
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)
        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]
        # Probability of the positive (defective) class, when the model supports it
        if hasattr(model, "predict_proba"):
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines
        suspicious = find_suspicious_lines(code_text)

        # HYBRID DECISION:
        # If ML says defect OR we found suspicious lines -> treat as defect
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### 🔍 File-level Defect Prediction")
        if is_defect:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section
        # (fixed: the original rendered this whole section twice back-to-back)
        st.markdown("### 🧷 Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")