# app.py — Software Bug Risk Predictor (Streamlit)
# NOTE: removed non-Python Hugging Face page header ("Update app.py", commit d1bbf06)
import streamlit as st
import joblib
import numpy as np
# 1. Load model and scaler (once, cached)
@st.cache_resource
def load_artifacts():
    """Load the trained defect model and its feature scaler from disk.

    Cached by Streamlit so the pickles are read only once per session.
    """
    return (
        joblib.load("bug_predictor_model.pkl"),
        joblib.load("scaler.pkl"),
    )

model, scaler = load_artifacts()
# 2. Feature names in same order as training
# NASA KC1 metric columns; the order here MUST match the column order the
# scaler/model were fit on, and extract_simple_metrics_from_code() returns
# its values in this exact order.
feature_names = [
'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]
# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """Approximate the 21 NASA KC1 static metrics from raw Python source.

    These are rough heuristics (line counts and branch-keyword counts),
    not exact Halstead/McCabe values. The result is a 21-element list
    ordered exactly like ``feature_names``.
    """
    src_lines = code.splitlines()

    # Non-empty lines count as lines of code (comments included)
    loc = sum(1 for ln in src_lines if ln.strip())

    # Lines containing a basic branching construct — cyclomatic proxy
    branch_keywords = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(
        1 for ln in src_lines if any(kw in ln for kw in branch_keywords)
    )

    comment_lines = float(sum(1 for ln in src_lines if ln.strip().startswith("#")))
    blank_lines = float(sum(1 for ln in src_lines if not ln.strip()))

    # Rough proxies for the remaining Halstead-style features,
    # returned in feature_names order.
    return [
        loc,                          # loc
        branch_count,                 # v(g) — cyclomatic complexity approx
        max(1, branch_count // 2),    # ev(g)
        max(1, branch_count // 3),    # iv(g)
        max(1, loc * 2),              # n
        max(1, loc * 3),              # v
        1.0,                          # l
        1.0,                          # d
        1.0,                          # i
        float(loc * 10),              # e
        float(branch_count),          # b
        max(1, loc // 10),            # t
        float(loc),                   # lOCode
        comment_lines,                # lOComment
        blank_lines,                  # lOBlank
        float(loc + comment_lines),   # locCodeAndComment
        10.0,                         # uniq_Op (fixed placeholder)
        10.0,                         # uniq_Opnd (fixed placeholder)
        float(loc * 2),               # total_Op
        float(loc * 2),               # total_Opnd
        branch_count,                 # branchCount
    ]
# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """Flag lines that match simple static heuristics (NOT ML).

    Returns a list of ``(line_number, line_text, reason)`` tuples, in
    source order; one line may be reported more than once if it trips
    several rules.
    """
    findings = []
    for line_no, raw in enumerate(code.splitlines(), start=1):
        text = raw.strip()

        # Bare except clause swallows every exception type
        if text.startswith("except:"):
            findings.append((line_no, raw, "Bare 'except:' (too generic)"))
        # eval() on arbitrary input is a classic injection risk
        if "eval(" in text:
            findings.append((line_no, raw, "Use of eval() is risky"))
        # PEP 8: compare to None with identity, not equality
        if "== None" in text:
            findings.append((line_no, raw, "Use 'is None' instead of '== None'"))
        # Unfinished-work markers
        if "# TODO" in text or "# FIXME" in text:
            findings.append((line_no, raw, "TODO/FIXME comment (pending work)"))
        # Overly long conditionals tend to hide complex logic
        if len(text) > 100 and ("if " in text or "while " in text):
            findings.append((line_no, raw, "Very long condition (complex logic)"))
    return findings
# 5. Streamlit UI: ONLY Python file upload
st.title("🐍 Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the file; reject anything that is not UTF-8 text.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("❌ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text:
        st.markdown("### 📄 Code Preview")
        st.code(code_text, language="python")

        # Extract approximate metrics from code and get ML prediction
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)
        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]
        # Probability of the positive (defective) class, when the model supports it
        if hasattr(model, "predict_proba"):
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines
        suspicious = find_suspicious_lines(code_text)

        # HYBRID DECISION:
        # If ML says defect OR we found suspicious lines -> treat as defect
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### 🔍 File-level Defect Prediction")
        if is_defect:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section
        # (fixed: the original rendered this whole section twice back-to-back)
        st.markdown("### 🧷 Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")