Spaces:

Phani1008
/

Software-Bug-Predictor

Sleeping

App Files Community

Phani1008 committed on Nov 27, 2025

Commit

09c969b

verified ·

1 Parent(s): 62b36ef

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -98

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import streamlit as st
 import joblib
 import numpy as np
-import pandas as pd
-# Load model and scaler once, cached for performance
 @st.cache_resource
 def load_artifacts():
     model = joblib.load("bug_predictor_model.pkl")
@@ -12,119 +11,144 @@ def load_artifacts():
 model, scaler = load_artifacts()
-st.title("🔍 Software Bug Prediction System")
-st.write(
-    "Predict whether a software module is likely to be **defective** based on metrics "
-    "from the NASA KC1 dataset."
-)
-# List of feature names in the same order as training
 feature_names = [
     'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
     'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
     'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
 ]
-tab_single, tab_bulk = st.tabs(["🧮 Single module input", "📂 Bulk prediction via CSV"])
-# =========================
-# TAB 1: SINGLE ROW INPUT
-# =========================
-with tab_single:
-    st.subheader("📥 Enter Module Metrics Manually")
-    inputs = []
-    cols = st.columns(3)  # 3-column layout for nicer UI
-    for idx, name in enumerate(feature_names):
-        with cols[idx % 3]:
-            val = st.number_input(name, value=0.0)
-            inputs.append(val)
-    if st.button("Predict Defect Risk", key="single_predict"):
-        # Convert inputs to 2D array
-        input_array = np.array(inputs).reshape(1, -1)
-        # Scale using same scaler from training
-        scaled = scaler.transform(input_array)
-        # Predict with loaded model
         pred = model.predict(scaled)[0]
-        # Probability of defect (if supported)
-        proba = model.predict_proba(scaled)[0][1] if hasattr(model, "predict_proba") else None
         if pred == 1:
             st.error("⚠️ Defect Likely")
         else:
             st.success("✅ No Defect Predicted")
         if proba is not None:
-            st.write(f"Probability of Defect: **{proba:.2f}**")
-# =========================
-# TAB 2: BULK CSV PREDICTION
-# =========================
-with tab_bulk:
-    st.subheader("📂 Upload CSV for Bulk Prediction")
-    st.write(
-        "Upload a CSV file containing the following columns (no target column needed):"
-    )
-    st.code(", ".join(feature_names), language="text")
-    uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-    if uploaded_file is not None:
-        try:
-            df = pd.read_csv(uploaded_file)
-            st.write("📄 Preview of uploaded data:")
-            st.dataframe(df.head())
-            # Check if all required columns exist
-            missing_cols = [col for col in feature_names if col not in df.columns]
-            if missing_cols:
-                st.error(
-                    "The following required columns are missing from the uploaded file:\n"
-                    + ", ".join(missing_cols)
-                )
-            else:
-                # Keep only the required columns in correct order
-                X = df[feature_names].copy()
-                # Scale features
-                X_scaled = scaler.transform(X)
-                # Predict
-                preds = model.predict(X_scaled)
-                # Probabilities (if available)
-                if hasattr(model, "predict_proba"):
-                    probas = model.predict_proba(X_scaled)[:, 1]
-                else:
-                    probas = None
-                # Add predictions to dataframe
-                df["Defect_Prediction"] = np.where(
-                    preds == 1, "Defect Likely", "No Defect Predicted"
-                )
-                if probas is not None:
-                    df["Defect_Probability"] = probas
-                st.success("✅ Predictions generated!")
-                st.write("📊 Results:")
-                st.dataframe(df.head())
-                # Allow user to download results
-                csv_data = df.to_csv(index=False).encode("utf-8")
-                st.download_button(
-                    label="⬇️ Download Predictions as CSV",
-                    data=csv_data,
-                    file_name="bug_predictions.csv",
-                    mime="text/csv",
-                )
-        except Exception as e:
-            st.error(f"❌ Error reading file: {e}")

 import streamlit as st
 import joblib
 import numpy as np
+# 1. Load model and scaler (once, cached)
 @st.cache_resource
 def load_artifacts():
     model = joblib.load("bug_predictor_model.pkl")
 model, scaler = load_artifacts()
+# 2. Feature names in same order as training
 feature_names = [
     'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
     'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
     'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
 ]
+# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """
    Approximate the 21 model features from raw Python source text.

    This is a heuristic approximation, not exact NASA KC1 metrics: only the
    line counts and the branch count are measured from ``code``; every other
    feature is a fixed constant or a simple multiple of those two numbers.

    Args:
        code: Full text of a Python source file (may be empty).

    Returns:
        A list of 21 numbers, ordered as: loc, v(g), ev(g), iv(g), n, v, l,
        d, i, e, b, t, lOCode, lOComment, lOBlank, locCodeAndComment,
        uniq_Op, uniq_Opnd, total_Op, total_Opnd, branchCount.
    """
    import re  # local import: ``re`` is not among the file's top-level imports

    # Whole-word keyword match so identifiers such as ``notif`` or
    # ``exceptions`` are not mistaken for control-flow statements.
    branch_pattern = re.compile(
        r"\b(?:if|elif|for|while|with)\b|\btry\s*:|\bexcept\b"
    )

    lines = code.splitlines()
    stripped_lines = [raw.strip() for raw in lines]

    # Non-empty lines (comment-only lines are included in this count).
    loc = sum(1 for text in stripped_lines if text)
    comment_lines = sum(1 for text in stripped_lines if text.startswith("#"))
    blank_lines = len(lines) - loc

    # Count each code line at most once; comment-only lines never branch.
    branch_count = sum(
        1
        for text in stripped_lines
        if text and not text.startswith("#") and branch_pattern.search(text)
    )

    # Cyclomatic complexity is decision points + 1, so it is never below 1
    # and never smaller than the ev(g) / iv(g) proxies derived from it.
    v_g = branch_count + 1
    ev_g = max(1, branch_count // 2)
    iv_g = max(1, branch_count // 3)

    # Rough proxies for the Halstead-style features (just to fill features).
    n = max(1, loc * 2)
    v = max(1, loc * 3)
    l_metric = 1.0
    d_metric = 1.0
    i_metric = 1.0
    e_metric = float(loc * 10)
    b_metric = float(branch_count)
    t_metric = max(1, loc // 10)

    lOCode = float(loc)
    lOComment = float(comment_lines)
    lOBlank = float(blank_lines)
    # NOTE(review): comment-only lines are already part of ``loc``, so they
    # contribute twice here — confirm this matches what the model expects.
    locCodeAndComment = float(loc + lOComment)

    uniq_Op = 10.0
    uniq_Opnd = 10.0
    total_Op = float(loc * 2)
    total_Opnd = float(loc * 2)

    return [
        loc, v_g, ev_g, iv_g, n, v, l_metric, d_metric, i_metric, e_metric,
        b_metric, t_metric, lOCode, lOComment, lOBlank, locCodeAndComment,
        uniq_Op, uniq_Opnd, total_Op, total_Opnd, branch_count,
    ]
+# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """
    Flag lines that match simple rule-based patterns (NOT ML, just heuristics).

    The rules are purely textual: a pattern that appears inside a string
    literal or a trailing comment is still reported. Comment-only lines are
    checked for TODO/FIXME markers only.

    Args:
        code: Full text of a Python source file (may be empty).

    Returns:
        A list of ``(line_number, original_line, reason)`` tuples in file
        order, with 1-based line numbers. A line that matches several rules
        appears once per matching rule.
    """
    import re  # local import: ``re`` is not among the file's top-level imports

    bare_except = re.compile(r"except\s*:")
    # Not preceded by a word character or dot, so ``ast.literal_eval(...)``
    # and method calls such as ``model.eval()`` are not reported.
    eval_call = re.compile(r"(?<![\w.])eval\s*\(")
    equals_none = re.compile(r"==\s*None\b")
    not_equals_none = re.compile(r"!=\s*None\b")
    todo_marker = re.compile(r"#\s*(?:TODO|FIXME)\b")
    condition_keyword = re.compile(r"\b(?:if|while)\b")

    suspicious = []
    for idx, line in enumerate(code.splitlines(), start=1):
        stripped = line.strip()
        if not stripped:
            continue

        if not stripped.startswith("#"):
            # Bare except
            if bare_except.match(stripped):
                suspicious.append((idx, line, "Bare 'except:' (too generic)"))

            # eval usage
            if eval_call.search(stripped):
                suspicious.append((idx, line, "Use of eval() is risky"))

            # Equality comparison against None instead of identity
            if equals_none.search(stripped):
                suspicious.append(
                    (idx, line, "Use 'is None' instead of '== None'")
                )
            if not_equals_none.search(stripped):
                suspicious.append(
                    (idx, line, "Use 'is not None' instead of '!= None'")
                )

        # TODO/FIXME comments (with or without a space after '#')
        if todo_marker.search(stripped):
            suspicious.append((idx, line, "TODO/FIXME comment (pending work)"))

        # Very long conditional logic
        if (
            len(stripped) > 100
            and not stripped.startswith("#")
            and condition_keyword.search(stripped)
        ):
            suspicious.append((idx, line, "Very long condition (complex logic)"))

    return suspicious
+# 5. Streamlit UI: ONLY Python file upload
# Page header and short description of what the app does.
st.title("🐍 Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the file; anything that is not valid UTF-8 is rejected.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("❌ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text is not None and not code_text.strip():
        # An empty (or whitespace-only) upload has nothing to analyse; say so
        # explicitly instead of rendering nothing at all.
        st.warning("⚠️ The uploaded file is empty. Please upload a .py file that contains code.")
    elif code_text:
        st.markdown("### 📄 Code Preview")
        st.code(code_text, language="python")

        # Build a single-row feature matrix from the approximate metrics.
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)

        # Scale with the loaded scaler, then predict with the loaded model.
        scaled = scaler.transform(metrics_array)
        pred = model.predict(scaled)[0]

        # Probability of the positive class, when the model exposes it.
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(scaled)[0][1]
        else:
            proba = None

        st.markdown("### 🔍 File-level Defect Prediction")
        if pred == 1:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if proba is not None:
            st.write(f"Estimated probability of defect: **{proba:.2f}**")

        # Suspicious lines section
        st.markdown("### 🧷 Suspicious Lines (Rule-Based Heuristics)")
        suspicious = find_suspicious_lines(code_text)

        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")