hieu3636 committed on
Commit
1d2f225
·
verified ·
1 Parent(s): 4eaae24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -81
app.py CHANGED
@@ -1,81 +1,110 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import numpy as np
4
- import joblib
5
-
6
- # =========================
7
- # LOAD MODEL
8
- # =========================
9
# Deserialize the trained stacking artifact once, at import time.
# NOTE: joblib.load unpickles the file, so "stacking_model.pkl" must come
# from a trusted source.
artifact = joblib.load("stacking_model.pkl")

# Unpack the pieces the prediction function relies on.
base_models, meta_model, FEATURE_NAMES = (
    artifact["base_models"],  # list of (name, model)
    artifact["meta_model"],
    artifact["features"],
)

# Number of input features listed in the artifact.
N_FEATURES = len(FEATURE_NAMES)
16
-
17
- # =========================
18
- # PREDICTION FUNCTION
19
- # =========================
20
def predict_csv(file):
    """Score every row of an uploaded CSV with the two-level stacking model.

    Args:
        file: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        A ``DataFrame``. On success it is the uploaded data (label columns
        dropped) with a leading ``row_id`` column and the trailing columns
        ``probability_malware``, ``prediction`` and ``prediction_label``.
        When required features are absent it is a one-row frame with a single
        ``error`` column describing what is missing.
    """
    frame = pd.read_csv(file)

    # Normalize column names so stray whitespace or a BOM cannot hide a feature.
    frame.columns = frame.columns.str.strip().str.replace("\ufeff", "")

    # Ground-truth columns, if present, are not model inputs.
    frame = frame.drop(
        columns=["Label", "label", "class", "Class"], errors="ignore"
    )

    absent = [feat for feat in FEATURE_NAMES if feat not in frame.columns]
    if absent:
        # Report the problem as a DataFrame (not a string) to fit the output widget.
        message = f"Missing required features: {absent}"
        return pd.DataFrame({"error": [message]})

    # Keep only the required features, in the order stored with the model.
    features = frame[FEATURE_NAMES].astype(float)

    # Level 1: one column of positive-class probabilities per base model.
    level_one = np.column_stack(
        [estimator.predict_proba(features)[:, 1] for _, estimator in base_models]
    )

    # Level 2: the meta model turns the stacked probabilities into a final score.
    malware_prob = meta_model.predict_proba(level_one)[:, 1]
    hard_label = (malware_prob > 0.5).astype(int)

    # Assemble the output table.
    scored = frame.copy()
    scored.insert(0, "row_id", range(1, len(frame) + 1))
    scored["probability_malware"] = malware_prob
    scored["prediction"] = hard_label
    scored["prediction_label"] = scored["prediction"].map(
        {1: "malware", 0: "benign"}
    )
    return scored
69
-
70
- # =========================
71
- # GRADIO INTERFACE
72
- # =========================
73
# Build the input/output widgets first, then wire them to the predictor.
csv_upload = gr.File(label="Upload CSV file")
prediction_table = gr.Dataframe(label="Prediction Result")

demo = gr.Interface(
    fn=predict_csv,
    inputs=csv_upload,
    outputs=prediction_table,
    title="Stacking-based Malware Detection",
    description="ExtraTrees + RandomForest + LightGBM + LogisticRegression → XGBoost",
)

# Start the web app as soon as the module is executed.
demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import pandas as pd
3
+ import gradio as gr
4
+ import numpy as np
5
+ import re
6
+
7
+ # ======================
8
+ # LOAD MODEL
9
+ # ======================
10
# Deserialize the trained stacking artifact once, at import time.
# NOTE: joblib.load unpickles the file, so "stacking_model.pkl" must come
# from a trusted source.
artifact = joblib.load("stacking_model.pkl")

# Unpack the pieces the prediction function relies on.
base_models, meta_model, feature_names = (
    artifact["base_models"],  # list of (name, model)
    artifact["meta_model"],
    artifact["features"],
)
15
+
16
+ # ======================
17
+ # CLEAN FUNCTION (same as training)
18
+ # ======================
19
def clean_numeric(val):
    """Parse a loosely formatted numeric cell into a ``float``.

    All whitespace is removed, then ``.`` and ``,`` are interpreted as either
    grouping or decimal separators:

    * a separator that occurs more than once is a grouping mark and is
      dropped (``"1.234.567"``, ``"1,234,567"``);
    * when one ``.`` and one ``,`` remain, the right-most one is the decimal
      mark (``"1,234.56"`` and ``"1.234,56"`` both give ``1234.56``);
    * a lone ``,`` is a decimal comma (``"3,14"`` gives ``3.14``).

    Scientific notation needs no special case because ``float`` accepts it.

    Args:
        val: Raw cell value (string, number, or a missing marker).

    Returns:
        The parsed ``float``, or ``None`` when ``val`` is missing according
        to ``pd.isna`` or cannot be read as a number.
    """
    # NOTE(review): the surrounding file labels this cleaner "same as
    # training". Mixed-separator values that used to come back as None now
    # parse to numbers; confirm the training-side cleaner agrees.
    if pd.isna(val):
        return None

    # Dropping every whitespace run also covers leading/trailing padding.
    text = re.sub(r"\s+", "", str(val))

    # Repeated dots can only be thousands grouping.
    if text.count(".") > 1:
        text = text.replace(".", "")

    if text.count(",") > 1:
        # Repeated commas can only be thousands grouping.
        text = text.replace(",", "")
    elif "," in text:
        if text.rfind(".") > text.rfind(","):
            # "1,234.56": the dot is the decimal mark, the comma is grouping.
            text = text.replace(",", "")
        else:
            # "1.234,56" or "3,14": the comma is the decimal mark.
            text = text.replace(".", "").replace(",", ".")

    try:
        return float(text)
    except ValueError:
        return None
42
+
43
+
44
+ # ======================
45
+ # PREDICTION FUNCTION
46
+ # ======================
47
def predict_malware_csv(file):
    """Score every row of an uploaded CSV with the two-level stacking model.

    Args:
        file: The upload handed over by the Gradio ``File`` input. It may be
            an object exposing the path as ``.name``, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(status, result)`` tuple. ``status`` is a short message for the
        status textbox. ``result`` is the uploaded data with ``Prediction``
        and ``Malware_Probability`` columns appended, or ``None`` when the
        request could not be scored (no upload or missing features).
    """
    # Submitting without an upload passes None; report it instead of
    # failing on the attribute access below.
    if file is None:
        return "❌ No file uploaded", None

    # Accept both a file wrapper (path in ``.name``) and a bare path.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)

    # List missing features in model order so the message is deterministic.
    present = set(df.columns)
    missing = [feat for feat in feature_names if feat not in present]
    if missing:
        return f"❌ Missing features: {missing}", None

    # Keep only the needed features and clean every cell. clean_numeric
    # yields floats or None, so the cast cannot fail and None becomes NaN.
    X = df[feature_names].copy()
    for col in feature_names:
        X[col] = X[col].apply(clean_numeric).astype(float)

    # Optional: fill NaN if needed
    # X = X.fillna(0)

    # Level 1: one column of positive-class probabilities per base model.
    meta_X = np.column_stack(
        [model.predict_proba(X)[:, 1] for _, model in base_models]
    )

    # Level 2: the meta model gives both the hard label and the probability.
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    # Append the results to a copy of the uploaded data.
    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df
83
+
84
+
85
+ # ======================
86
+ # UI
87
+ # ======================
88
# Upload widget: restricted to CSV files.
inputs = gr.File(
    file_types=[".csv"],
    label="Upload CSV file (features only)",
)

# Output widgets, in the order the prediction function returns its values:
# a status message first, then the results table.
status_box = gr.Textbox(label="Status")
results_table = gr.Dataframe(label="Prediction Results")
outputs = [status_box, results_table]

# Description shown under the title; lists every feature the model expects.
interface_description = (
    "Upload a CSV file.\n\n"
    f"Required features: {', '.join(feature_names)}"
)

app = gr.Interface(
    fn=predict_malware_csv,
    inputs=inputs,
    outputs=outputs,
    title="Stacking-based Malware Detection",
    description=interface_description,
)

# Launch only when run as a script, so importing the module has no UI side effect.
if __name__ == "__main__":
    app.launch()