# NOTE: page-scrape residue from the hosting platform (Space status, file
# size, git revision gutter) converted to comments so the file stays valid.
# Space status: Sleeping
# File size: 5,697 Bytes
import streamlit as st
import joblib
import numpy as np
# 1. Load model and scaler (once, cached)
@st.cache_resource
def load_artifacts():
    """Load the trained model and feature scaler from disk.

    Cached by Streamlit so the pickles are read only once per process,
    not on every script rerun.
    """
    predictor = joblib.load("bug_predictor_model.pkl")
    feature_scaler = joblib.load("scaler.pkl")
    return predictor, feature_scaler


model, scaler = load_artifacts()
# 2. Feature names in same order as training
# Column order must match exactly what the model/scaler were fit on
# (presumably NASA KC1 McCabe/Halstead metric names — confirm against the
# training notebook).
feature_names = [
    'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
    'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]
# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """Approximate the 21 KC1-style metrics from raw Python source.

    These are heuristic proxies, not exact NASA KC1 Halstead/McCabe
    values; they exist only to fill the feature vector the model expects,
    in the same order as ``feature_names``.
    """
    source_lines = code.splitlines()

    # Non-empty (code-bearing) lines.
    loc = sum(1 for ln in source_lines if ln.strip())

    # Lines containing at least one basic branching construct
    # (each line counts once, no matter how many keywords it holds).
    branch_keywords = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(
        1 for ln in source_lines if any(kw in ln for kw in branch_keywords)
    )

    # Comment-only and blank line tallies.
    comment_lines = float(sum(1 for ln in source_lines if ln.strip().startswith("#")))
    blank_lines = float(sum(1 for ln in source_lines if not ln.strip()))

    # Rough proxies for the remaining metrics (just to fill features).
    return [
        loc,                           # loc
        branch_count,                  # v(g): cyclomatic complexity approx
        max(1, branch_count // 2),     # ev(g)
        max(1, branch_count // 3),     # iv(g)
        max(1, loc * 2),               # n
        max(1, loc * 3),               # v
        1.0,                           # l
        1.0,                           # d
        1.0,                           # i
        float(loc * 10),               # e
        float(branch_count),           # b
        max(1, loc // 10),             # t
        float(loc),                    # lOCode
        comment_lines,                 # lOComment
        blank_lines,                   # lOBlank
        float(loc) + comment_lines,    # locCodeAndComment
        10.0,                          # uniq_Op (fixed placeholder)
        10.0,                          # uniq_Opnd (fixed placeholder)
        float(loc * 2),                # total_Op
        float(loc * 2),                # total_Opnd
        branch_count,                  # branchCount
    ]
# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """Flag lines matching simple static heuristics (NOT ML).

    Returns a list of ``(line_number, line_text, reason)`` tuples, in
    source order; a single line can be flagged for several reasons.
    """
    findings = []
    for line_no, raw in enumerate(code.splitlines(), start=1):
        text = raw.strip()
        # Bare except clause swallows everything, including SystemExit.
        if text.startswith("except:"):
            findings.append((line_no, raw, "Bare 'except:' (too generic)"))
        # eval() on anything is a code-injection hazard.
        if "eval(" in text:
            findings.append((line_no, raw, "Use of eval() is risky"))
        # Identity comparison is the idiomatic None check.
        if "== None" in text:
            findings.append((line_no, raw, "Use 'is None' instead of '== None'"))
        # Leftover work markers.
        if "# TODO" in text or "# FIXME" in text:
            findings.append((line_no, raw, "TODO/FIXME comment (pending work)"))
        # Extremely long conditional lines suggest tangled logic.
        if len(text) > 100 and ("if " in text or "while " in text):
            findings.append((line_no, raw, "Very long condition (complex logic)"))
    return findings
# 5. Streamlit UI: ONLY Python file upload
#
# Fixes vs. the original:
#  - The "Suspicious Lines" section was rendered twice (the whole
#    if/else display block was duplicated verbatim); it now renders once.
#  - The success-message string literal was broken across two source
#    lines (a syntax error); repaired into a single literal.
#  - Mojibake in user-facing strings repaired (original emoji inferred
#    from context — confirm against the deployed app).
st.title("Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the uploaded file; reject non-UTF-8 uploads cleanly.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text:
        st.markdown("### Code Preview")
        st.code(code_text, language="python")

        # Extract approximate metrics from code and get the ML prediction.
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)
        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]
        if hasattr(model, "predict_proba"):
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines (heuristics, independent of the model).
        suspicious = find_suspicious_lines(code_text)

        # HYBRID DECISION:
        # If ML says defect OR we found suspicious lines -> treat as defect.
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### File-level Defect Prediction")
        if is_defect:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section (rendered exactly once).
        st.markdown("### Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")