File size: 5,697 Bytes
f9e44c7
 
 
 
09c969b
f9e44c7
 
 
 
 
 
 
 
09c969b
f9e44c7
 
 
 
 
 
09c969b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9e44c7
09c969b
f9e44c7
09c969b
 
 
 
 
 
 
 
f9e44c7
09c969b
 
 
f9e44c7
d1bbf06
09c969b
 
62b36ef
09c969b
d1bbf06
62b36ef
09c969b
d1bbf06
09c969b
d1bbf06
 
 
 
 
 
 
 
62b36ef
09c969b
d1bbf06
62b36ef
 
 
 
d1bbf06
 
09c969b
 
 
d1bbf06
 
 
 
 
 
 
09c969b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import streamlit as st
import joblib
import numpy as np

# 1. Load model and scaler (once, cached)
@st.cache_resource
def load_artifacts():
    """Load the persisted model and feature scaler from disk.

    Decorated with ``st.cache_resource`` so the joblib loads happen once
    per server process instead of on every Streamlit script rerun.

    Returns:
        tuple: ``(model, scaler)`` — the objects stored in
        ``bug_predictor_model.pkl`` and ``scaler.pkl`` in the working
        directory.
    """
    model = joblib.load("bug_predictor_model.pkl")
    scaler = joblib.load("scaler.pkl")
    return model, scaler

# Loaded at import time; every rerun reuses the cached artifacts.
model, scaler = load_artifacts()

# 2. Feature names in same order as training
# NOTE(review): this order must match the column order the scaler/model
# were fitted with (NASA KC1 metric names) — confirm against the
# training script; extract_simple_metrics_from_code() returns values in
# this same order.
feature_names = [
    'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
    'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]

# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """Derive a rough 21-element metric vector from raw Python source.

    Only the line counts (LOC, comments, blanks, branch-bearing lines)
    are actually measured; the Halstead-style features are filled with
    crude proxies so the model receives a complete vector. This is a
    heuristic approximation, not the exact NASA KC1 metrics.
    """
    source_lines = code.splitlines()

    # Measured counts.
    loc = len([ln for ln in source_lines if ln.strip()])
    comment_lines = sum(1 for ln in source_lines if ln.strip().startswith("#"))
    blank_lines = len(source_lines) - loc

    # A line counts as a branch if it contains any branching construct
    # (plain substring match — deliberately crude).
    branch_markers = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branches = 0
    for ln in source_lines:
        if any(marker in ln for marker in branch_markers):
            branches += 1

    # Vector in the same order as `feature_names`; everything past the
    # measured counts is a placeholder proxy.
    return [
        loc,                          # loc
        branches,                     # v(g): cyclomatic complexity approx
        max(1, branches // 2),        # ev(g)
        max(1, branches // 3),        # iv(g)
        max(1, loc * 2),              # n
        max(1, loc * 3),              # v
        1.0,                          # l
        1.0,                          # d
        1.0,                          # i
        float(loc * 10),              # e
        float(branches),              # b
        max(1, loc // 10),            # t
        float(loc),                   # lOCode
        float(comment_lines),         # lOComment
        float(blank_lines),           # lOBlank
        float(loc + comment_lines),   # locCodeAndComment
        10.0,                         # uniq_Op
        10.0,                         # uniq_Opnd
        float(loc * 2),               # total_Op
        float(loc * 2),               # total_Opnd
        branches,                     # branchCount
    ]


# 4. Simple rule-based suspicious line detector
def find_suspicious_lines(code: str):
    """Flag lines that match simple static heuristics (no ML involved).

    Returns a list of ``(line_number, original_line, reason)`` tuples,
    ordered by line number and then by rule order. Each rule tests the
    stripped line but reports the original, un-stripped text.
    """
    # (predicate over the stripped line, reason reported to the user)
    rules = (
        (lambda s: s.startswith("except:"),
         "Bare 'except:' (too generic)"),
        (lambda s: "eval(" in s,
         "Use of eval() is risky"),
        (lambda s: "== None" in s,
         "Use 'is None' instead of '== None'"),
        (lambda s: "# TODO" in s or "# FIXME" in s,
         "TODO/FIXME comment (pending work)"),
        (lambda s: len(s) > 100 and ("if " in s or "while " in s),
         "Very long condition (complex logic)"),
    )

    findings = []
    for number, raw_line in enumerate(code.splitlines(), start=1):
        stripped = raw_line.strip()
        for matches, reason in rules:
            if matches(stripped):
                findings.append((number, raw_line, reason))
    return findings


# 5. Streamlit UI: ONLY Python file upload
#
# Fixes vs. original: the "Suspicious Lines" section was rendered twice
# (the whole if/else block was duplicated verbatim), and several UI
# strings contained mojibake (mis-encoded emoji / em-dash characters).
st.title("🐍 Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

uploaded_py = st.file_uploader("Choose a Python file", type=["py"])

if uploaded_py is not None:
    # Read and decode the file; reject anything that isn't UTF-8 text.
    code_bytes = uploaded_py.read()
    try:
        code_text = code_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("❌ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")
        code_text = None

    if code_text:
        st.markdown("### 📄 Code Preview")
        st.code(code_text, language="python")

        # Extract approximate metrics from code and get ML prediction.
        metrics_vector = extract_simple_metrics_from_code(code_text)
        metrics_array = np.array(metrics_vector).reshape(1, -1)

        scaled = scaler.transform(metrics_array)
        ml_pred = model.predict(scaled)[0]

        # Probability of the positive (defect) class, when available.
        if hasattr(model, "predict_proba"):
            ml_proba = model.predict_proba(scaled)[0][1]
        else:
            ml_proba = None

        # Rule-based suspicious lines (static heuristics, not ML).
        suspicious = find_suspicious_lines(code_text)

        # 🔴 HYBRID DECISION:
        # If ML says defect OR we found suspicious lines → treat as defect.
        is_defect = (ml_pred == 1) or (len(suspicious) > 0)

        st.markdown("### 🔍 File-level Defect Prediction")
        if is_defect:
            st.error("⚠️ Defect Likely")
        else:
            st.success("✅ No Defect Predicted")

        if ml_proba is not None:
            st.write(f"Estimated probability from ML model: **{ml_proba:.2f}**")

        # Suspicious lines section — rendered exactly once.
        st.markdown("### 🧷 Suspicious Lines (Rule-Based Heuristics)")
        if not suspicious:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for line_no, line_text, reason in suspicious:
                st.write(f"**Line {line_no}** — {reason}")
                st.code(line_text, language="python")