Phani1008 committed on
Commit
09c969b
·
verified ·
1 Parent(s): 62b36ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -98
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import streamlit as st
2
  import joblib
3
  import numpy as np
4
- import pandas as pd
5
 
6
- # Load model and scaler once, cached for performance
7
  @st.cache_resource
8
  def load_artifacts():
9
  model = joblib.load("bug_predictor_model.pkl")
@@ -12,119 +11,144 @@ def load_artifacts():
12
 
13
  model, scaler = load_artifacts()
14
 
15
- st.title("๐Ÿ” Software Bug Prediction System")
16
- st.write(
17
- "Predict whether a software module is likely to be **defective** based on metrics "
18
- "from the NASA KC1 dataset."
19
- )
20
-
21
- # List of feature names in the same order as training
22
  feature_names = [
23
  'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
24
  'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
25
  'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
26
  ]
27
 
28
- tab_single, tab_bulk = st.tabs(["๐Ÿงฎ Single module input", "๐Ÿ“‚ Bulk prediction via CSV"])
29
-
30
- # =========================
31
- # TAB 1: SINGLE ROW INPUT
32
- # =========================
33
- with tab_single:
34
- st.subheader("๐Ÿ“ฅ Enter Module Metrics Manually")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- inputs = []
37
- cols = st.columns(3) # 3-column layout for nicer UI
38
 
39
- for idx, name in enumerate(feature_names):
40
- with cols[idx % 3]:
41
- val = st.number_input(name, value=0.0)
42
- inputs.append(val)
 
 
 
 
43
 
44
- if st.button("Predict Defect Risk", key="single_predict"):
45
- # Convert inputs to 2D array
46
- input_array = np.array(inputs).reshape(1, -1)
47
 
48
- # Scale using same scaler from training
49
- scaled = scaler.transform(input_array)
 
50
 
51
- # Predict with loaded model
 
52
  pred = model.predict(scaled)[0]
53
 
54
- # Probability of defect (if supported)
55
- proba = model.predict_proba(scaled)[0][1] if hasattr(model, "predict_proba") else None
 
 
56
 
 
57
  if pred == 1:
58
  st.error("โš ๏ธ Defect Likely")
59
  else:
60
  st.success("โœ… No Defect Predicted")
61
 
62
  if proba is not None:
63
- st.write(f"Probability of Defect: **{proba:.2f}**")
64
-
65
-
66
- # =========================
67
- # TAB 2: BULK CSV PREDICTION
68
- # =========================
69
- with tab_bulk:
70
- st.subheader("๐Ÿ“‚ Upload CSV for Bulk Prediction")
71
- st.write(
72
- "Upload a CSV file containing the following columns (no target column needed):"
73
- )
74
- st.code(", ".join(feature_names), language="text")
75
-
76
- uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
77
-
78
- if uploaded_file is not None:
79
- try:
80
- df = pd.read_csv(uploaded_file)
81
-
82
- st.write("๐Ÿ“„ Preview of uploaded data:")
83
- st.dataframe(df.head())
84
-
85
- # Check if all required columns exist
86
- missing_cols = [col for col in feature_names if col not in df.columns]
87
- if missing_cols:
88
- st.error(
89
- "The following required columns are missing from the uploaded file:\n"
90
- + ", ".join(missing_cols)
91
- )
92
- else:
93
- # Keep only the required columns in correct order
94
- X = df[feature_names].copy()
95
-
96
- # Scale features
97
- X_scaled = scaler.transform(X)
98
-
99
- # Predict
100
- preds = model.predict(X_scaled)
101
-
102
- # Probabilities (if available)
103
- if hasattr(model, "predict_proba"):
104
- probas = model.predict_proba(X_scaled)[:, 1]
105
- else:
106
- probas = None
107
-
108
- # Add predictions to dataframe
109
- df["Defect_Prediction"] = np.where(
110
- preds == 1, "Defect Likely", "No Defect Predicted"
111
- )
112
-
113
- if probas is not None:
114
- df["Defect_Probability"] = probas
115
-
116
- st.success("โœ… Predictions generated!")
117
- st.write("๐Ÿ“Š Results:")
118
- st.dataframe(df.head())
119
-
120
- # Allow user to download results
121
- csv_data = df.to_csv(index=False).encode("utf-8")
122
- st.download_button(
123
- label="โฌ‡๏ธ Download Predictions as CSV",
124
- data=csv_data,
125
- file_name="bug_predictions.csv",
126
- mime="text/csv",
127
- )
128
-
129
- except Exception as e:
130
- st.error(f"โŒ Error reading file: {e}")
 
1
import re

import joblib
import numpy as np
import streamlit as st
 
4
 
5
+ # 1. Load model and scaler (once, cached)
6
  @st.cache_resource
7
  def load_artifacts():
8
  model = joblib.load("bug_predictor_model.pkl")
 
11
 
12
  model, scaler = load_artifacts()
13
 
14
# 2. Feature names in same order as training
# NASA KC1 static-code metrics (McCabe complexity, Halstead measures, and
# line counts). The order here must match the column order the scaler and
# model were fitted on — do not reorder.
feature_names = [
    'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
    'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'
]
20
 
21
# 3. Simple metric extraction from raw Python code
def extract_simple_metrics_from_code(code: str):
    """
    Build an approximate KC1-style feature vector for *code*.

    These are heuristic stand-ins (line counts plus rough proxies), not the
    exact Halstead/McCabe metrics the NASA KC1 dataset was built from.
    Returns the 21 values in the same order as ``feature_names``.
    """
    src_lines = code.splitlines()

    # Line-count metrics.
    loc = sum(1 for ln in src_lines if ln.strip())
    comment_lines = float(sum(1 for ln in src_lines if ln.strip().startswith("#")))
    blank_lines = float(sum(1 for ln in src_lines if not ln.strip()))

    # Lines containing any basic branching construct (True sums as 1).
    branching = ("if ", "elif ", "for ", "while ", "try:", "except", "with ")
    branch_count = sum(any(kw in ln for kw in branching) for ln in src_lines)

    # Rough proxies for the remaining features, in training column order.
    return [
        loc,                          # loc
        branch_count,                 # v(g)  ~ cyclomatic complexity approx
        max(1, branch_count // 2),    # ev(g)
        max(1, branch_count // 3),    # iv(g)
        max(1, loc * 2),              # n
        max(1, loc * 3),              # v
        1.0,                          # l
        1.0,                          # d
        1.0,                          # i
        float(loc * 10),              # e
        float(branch_count),          # b
        max(1, loc // 10),            # t
        float(loc),                   # lOCode
        comment_lines,                # lOComment
        blank_lines,                  # lOBlank
        float(loc) + comment_lines,   # locCodeAndComment
        10.0,                         # uniq_Op (fixed placeholder)
        10.0,                         # uniq_Opnd (fixed placeholder)
        float(loc * 2),               # total_Op
        float(loc * 2),               # total_Opnd
        branch_count,                 # branchCount
    ]
64
+
65
+
66
# 4. Simple rule-based suspicious line detector
# Pre-compiled pattern: a real eval(...) call. The word boundary prevents
# false positives on identifiers that merely end in "eval" (e.g. my_eval().
_EVAL_CALL_RE = re.compile(r"\beval\s*\(")


def find_suspicious_lines(code: str):
    """
    Scan *code* line by line for simple risk patterns.

    Rule-based suspicious patterns (NOT ML, just heuristics).

    Returns a list of ``(line_number, original_line, reason)`` tuples with
    line numbers starting at 1; a line can be reported more than once if it
    matches several rules.
    """
    suspicious = []

    for idx, line in enumerate(code.splitlines(), start=1):
        stripped = line.strip()

        # Bare except
        if stripped.startswith("except:"):
            suspicious.append((idx, line, "Bare 'except:' (too generic)"))

        # eval usage — bug fix: the old substring test ("eval(" in stripped)
        # also flagged calls like my_eval(); match on a word boundary instead.
        if _EVAL_CALL_RE.search(stripped):
            suspicious.append((idx, line, "Use of eval() is risky"))

        # == None instead of is None
        if "== None" in stripped:
            suspicious.append((idx, line, "Use 'is None' instead of '== None'"))

        # TODO/FIXME comments
        if "# TODO" in stripped or "# FIXME" in stripped:
            suspicious.append((idx, line, "TODO/FIXME comment (pending work)"))

        # Very long conditional logic
        if len(stripped) > 100 and ("if " in stripped or "while " in stripped):
            suspicious.append((idx, line, "Very long condition (complex logic)"))

    return suspicious
98
+
99
+
100
# 5. Streamlit UI: accepts a single uploaded Python file only
st.title("๐Ÿ Software Bug Risk Predictor from Python File")
st.write(
    "Upload a `.py` file. The app will:\n"
    "1. Estimate code metrics and predict defect risk using an XGBoost model trained on NASA KC1.\n"
    "2. Highlight lines that look suspicious based on simple static rules (not ML)."
)

py_upload = st.file_uploader("Choose a Python file", type=["py"])

if py_upload is not None:
    # Decode the uploaded bytes; only UTF-8 text is accepted.
    source_text = None
    try:
        source_text = py_upload.read().decode("utf-8")
    except UnicodeDecodeError:
        st.error("โŒ Could not decode file as UTF-8 text. Please upload a UTF-8 encoded .py file.")

    if source_text:
        st.markdown("### ๐Ÿ“„ Code Preview")
        st.code(source_text, language="python")

        # Approximate KC1-style metrics from the raw code, then scale with
        # the training-time scaler and run the model on the single row.
        features = np.array(extract_simple_metrics_from_code(source_text)).reshape(1, -1)
        features_scaled = scaler.transform(features)
        pred = model.predict(features_scaled)[0]

        # Defect probability, when the model exposes predict_proba.
        proba = (
            model.predict_proba(features_scaled)[0][1]
            if hasattr(model, "predict_proba")
            else None
        )

        st.markdown("### ๐Ÿ” File-level Defect Prediction")
        if pred == 1:
            st.error("โš ๏ธ Defect Likely")
        else:
            st.success("โœ… No Defect Predicted")

        if proba is not None:
            st.write(f"Estimated probability of defect: **{proba:.2f}**")

        # Rule-based static checks, reported line by line.
        st.markdown("### ๐Ÿงท Suspicious Lines (Rule-Based Heuristics)")
        flagged = find_suspicious_lines(source_text)

        if not flagged:
            st.info("No suspicious patterns found by the simple rules.")
        else:
            for lineno, line_text, reason in flagged:
                st.write(f"**Line {lineno}** โ€” {reason}")
                st.code(line_text, language="python")