hieu3636 committed on
Commit
1d72177
·
verified ·
1 Parent(s): 63ab7eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -5
app.py CHANGED
@@ -2,6 +2,7 @@ import joblib
2
  import pandas as pd
3
  import gradio as gr
4
  import numpy as np
 
5
 
6
  # ======================
7
  # LOAD MODEL
@@ -12,11 +13,38 @@ base_models = artifact["base_models"] # list of (name, model)
12
  meta_model = artifact["meta_model"]
13
  feature_names = artifact["features"]
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # ======================
16
  # PREDICTION FUNCTION
17
  # ======================
18
  def predict_malware_csv(file):
19
- # Read CSV
20
  df = pd.read_csv(file.name)
21
 
22
  # Check missing features
@@ -24,7 +52,15 @@ def predict_malware_csv(file):
24
  if missing:
25
  return f"❌ Missing features: {list(missing)}", None
26
 
27
- X = df[feature_names]
 
 
 
 
 
 
 
 
28
 
29
  # Level-1 predictions
30
  meta_inputs = []
@@ -65,9 +101,8 @@ app = gr.Interface(
65
  outputs=outputs,
66
  title="Stacking-based Malware Detection",
67
  description=(
68
- "Upload a CSV file containing malware features.\n\n"
69
- "Model: ExtraTrees + RandomForest + LightGBM + LogisticRegression → XGBoost\n"
70
- f"Required features: {', '.join(feature_names)}"
71
  )
72
  )
73
 
 
2
  import pandas as pd
3
  import gradio as gr
4
  import numpy as np
5
+ import re
6
 
7
  # ======================
8
  # LOAD MODEL
 
13
  meta_model = artifact["meta_model"]
14
  feature_names = artifact["features"]
15
 
16
+ # ======================
17
+ # CLEAN FUNCTION (same as training)
18
+ # ======================
19
def clean_numeric(val):
    """Coerce a raw CSV cell into a float, or return None if it cannot be parsed.

    Steps, in order:
      1. Missing values (as judged by ``pd.isna``) yield None.
      2. The value is stringified and all whitespace is removed.
      3. Plain scientific notation (e.g. ``-1.5e-3``) is converted as-is.
      4. Otherwise, if the text holds more than one ``.``, every dot is
         dropped (dots treated as thousand separators).
      5. If the text then holds a ``,`` but no ``.``, commas become dots
         (comma treated as the decimal mark).
      6. Whatever remains is passed to ``float``; a ValueError yields None.

    Args:
        val: A single cell value (string, number, or missing marker).

    Returns:
        The parsed float, or None when the value is missing or unparseable.
    """
    if pd.isna(val):
        return None

    text = re.sub(r'\s+', '', str(val).strip())

    # Scientific notation must bypass the separator rewrites below.
    looks_scientific = re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', text) is not None
    if not looks_scientific:
        # Several dots cannot be a decimal point, so strip them all.
        if text.count('.') > 1:
            text = text.replace('.', '')

        # A comma with no dot left over is read as the decimal mark.
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')

    try:
        number = float(text)
    except ValueError:
        number = None
    return number
42
+
43
+
44
  # ======================
45
  # PREDICTION FUNCTION
46
  # ======================
47
  def predict_malware_csv(file):
 
48
  df = pd.read_csv(file.name)
49
 
50
  # Check missing features
 
52
  if missing:
53
  return f"❌ Missing features: {list(missing)}", None
54
 
55
+ # Keep only needed features
56
+ X = df[feature_names].copy()
57
+
58
+ # 🔥 CLEAN NUMERIC FEATURES
59
+ for col in feature_names:
60
+ X[col] = X[col].apply(clean_numeric)
61
+
62
+ # Optional: fill NaN if needed
63
+ # X = X.fillna(0)
64
 
65
  # Level-1 predictions
66
  meta_inputs = []
 
101
  outputs=outputs,
102
  title="Stacking-based Malware Detection",
103
  description=(
104
+ "Upload a CSV file.\n\n"
105
+
 
106
  )
107
  )
108