dyo3112 committed on
Commit
4eaae24
·
verified ·
1 Parent(s): 0b9a7c3

Upload 3 files

Browse files
Files changed (2) hide show
  1. app.py +81 -110
  2. requirements.txt +7 -9
app.py CHANGED
@@ -1,110 +1,81 @@
1
- import joblib
2
- import pandas as pd
3
- import gradio as gr
4
- import numpy as np
5
- import re
6
-
7
- # ======================
8
- # LOAD MODEL
9
- # ======================
10
- artifact = joblib.load("stacking_model.pkl")
11
-
12
- base_models = artifact["base_models"] # list of (name, model)
13
- meta_model = artifact["meta_model"]
14
- feature_names = artifact["features"]
15
-
16
- # ======================
17
- # CLEAN FUNCTION (same as training)
18
- # ======================
19
- def clean_numeric(val):
20
- if pd.isna(val):
21
- return None
22
-
23
- val = str(val).strip()
24
- val = re.sub(r'\s+', '', val)
25
-
26
- # scientific notation
27
- if re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', val):
28
- return float(val)
29
-
30
- # remove thousand separators
31
- if val.count('.') > 1:
32
- val = val.replace('.', '')
33
-
34
- # comma decimal -> dot
35
- if ',' in val and '.' not in val:
36
- val = val.replace(',', '.')
37
-
38
- try:
39
- return float(val)
40
- except ValueError:
41
- return None
42
-
43
-
44
- # ======================
45
- # PREDICTION FUNCTION
46
- # ======================
47
- def predict_malware_csv(file):
48
- df = pd.read_csv(file.name)
49
-
50
- # Check missing features
51
- missing = set(feature_names) - set(df.columns)
52
- if missing:
53
- return f"❌ Missing features: {list(missing)}", None
54
-
55
- # Keep only needed features
56
- X = df[feature_names].copy()
57
-
58
- # 🔥 CLEAN NUMERIC FEATURES
59
- for col in feature_names:
60
- X[col] = X[col].apply(clean_numeric)
61
-
62
- # Optional: fill NaN if needed
63
- # X = X.fillna(0)
64
-
65
- # Level-1 predictions
66
- meta_inputs = []
67
- for name, model in base_models:
68
- prob = model.predict_proba(X)[:, 1]
69
- meta_inputs.append(prob)
70
-
71
- meta_X = np.column_stack(meta_inputs)
72
-
73
- # Meta prediction
74
- preds = meta_model.predict(meta_X)
75
- probs = meta_model.predict_proba(meta_X)[:, 1]
76
-
77
- # Append results
78
- result_df = df.copy()
79
- result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
80
- result_df["Malware_Probability"] = probs
81
-
82
- return "✅ Prediction completed", result_df
83
-
84
-
85
- # ======================
86
- # UI
87
- # ======================
88
- inputs = gr.File(
89
- label="Upload CSV file (features only)",
90
- file_types=[".csv"]
91
- )
92
-
93
- outputs = [
94
- gr.Textbox(label="Status"),
95
- gr.Dataframe(label="Prediction Results")
96
- ]
97
-
98
- app = gr.Interface(
99
- fn=predict_malware_csv,
100
- inputs=inputs,
101
- outputs=outputs,
102
- title="Stacking-based Malware Detection",
103
- description=(
104
- "Upload a CSV file.\n\n"
105
-
106
- )
107
- )
108
-
109
- if __name__ == "__main__":
110
- app.launch()
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+
6
+ # =========================
7
+ # LOAD MODEL
8
+ # =========================
9
+ artifact = joblib.load("stacking_model.pkl")
10
+
11
+ base_models = artifact["base_models"] # list of (name, model)
12
+ meta_model = artifact["meta_model"]
13
+ FEATURE_NAMES = artifact["features"]
14
+
15
+ N_FEATURES = len(FEATURE_NAMES)
16
+
17
+ # =========================
18
+ # PREDICTION FUNCTION
19
+ # =========================
20
+ def predict_csv(file):
21
+ df = pd.read_csv(file)
22
+
23
+ # Normalize column names to avoid BOM/whitespace issues
24
+ df.columns = df.columns.str.strip()
25
+ df.columns = df.columns.str.replace("\ufeff", "")
26
+
27
+ # Drop label columns if exist
28
+ df = df.drop(columns=["Label", "label", "class", "Class"], errors="ignore")
29
+
30
+ # Check missing features
31
+ missing_features = [f for f in FEATURE_NAMES if f not in df.columns]
32
+ if missing_features:
33
+ # Return an error DataFrame instead of a string
34
+ return pd.DataFrame({
35
+ "error": [f"Missing required features: {missing_features}"]
36
+ })
37
+
38
+ # Keep only required features & correct order
39
+ X = df[FEATURE_NAMES].astype(float)
40
+
41
+ # =========================
42
+ # LEVEL-1 (BASE MODELS)
43
+ # =========================
44
+ meta_inputs = []
45
+ for name, model in base_models:
46
+ probs = model.predict_proba(X)[:, 1]
47
+ meta_inputs.append(probs)
48
+
49
+ meta_X = np.column_stack(meta_inputs)
50
+
51
+ # =========================
52
+ # META MODEL
53
+ # =========================
54
+ final_probs = meta_model.predict_proba(meta_X)[:, 1]
55
+ final_preds = (final_probs > 0.5).astype(int)
56
+
57
+ # =========================
58
+ # BUILD OUTPUT
59
+ # =========================
60
+ result = df.copy()
61
+ result.insert(0, "row_id", range(1, len(df) + 1))
62
+ result["probability_malware"] = final_probs
63
+ result["prediction"] = final_preds
64
+ result["prediction_label"] = result["prediction"].map(
65
+ {1: "malware", 0: "benign"}
66
+ )
67
+
68
+ return result
69
+
70
+ # =========================
71
+ # GRADIO INTERFACE
72
+ # =========================
73
+ demo = gr.Interface(
74
+ fn=predict_csv,
75
+ inputs=gr.File(label="Upload CSV file"),
76
+ outputs=gr.Dataframe(label="Prediction Result"),
77
+ title="Stacking-based Malware Detection",
78
+ description="ExtraTrees + RandomForest + LightGBM + LogisticRegression → XGBoost"
79
+ )
80
+
81
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,9 +1,7 @@
1
- numpy
2
- pandas
3
- scikit-learn
4
- xgboost
5
- lightgbm
6
- gradio
7
- joblib
8
- huggingface_hub
9
- tensorflow
 
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ xgboost
5
+ lightgbm
6
+ gradio
7
+ joblib