File size: 2,934 Bytes
1d2f225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff5d94a
9f19f6e
 
ff5d94a
9f19f6e
 
 
1d2f225
ff5d94a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d2f225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import joblib
import pandas as pd
import gradio as gr
import numpy as np
import re

# ======================
# LOAD MODEL
# ======================
# Deserialize the saved stacking bundle once, at import time.
# NOTE: joblib.load unpickles arbitrary objects, so the file must be trusted.
artifact = joblib.load("stacking_model.pkl")

# Pull out the three entries the prediction code reads from the bundle:
#   base_models   -> list of (name, model) pairs
#   meta_model    -> the model fed with the base models' outputs
#   feature_names -> stored under the "features" key
base_models, meta_model, feature_names = (
    artifact[key] for key in ("base_models", "meta_model", "features")
)

# ======================
# CLEAN FUNCTION (same as training)
# ======================
def clean_numeric(val):
    """Convert a raw cell value to ``float``, or ``None`` if it cannot be parsed.

    The value is stringified and all whitespace is removed. Text in
    scientific notation is converted directly. Otherwise, when the text
    contains more than one dot the dots are dropped (treated as thousand
    separators), and when it contains a comma but no dot the commas are
    replaced with dots (treated as a decimal comma).

    Args:
        val: Any scalar; values for which ``pd.isna`` is true yield ``None``.

    Returns:
        The parsed ``float``, or ``None`` for missing or unparseable input.
    """
    if pd.isna(val):
        return None

    # Collapse the value to text with no whitespace anywhere.
    text = re.sub(r'\s+', '', str(val).strip())

    looks_scientific = re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', text) is not None
    if not looks_scientific:
        # Several dots can only be grouping marks, never a decimal point.
        if text.count('.') > 1:
            text = text.replace('.', '')

        # A comma with no dot left in the text is read as the decimal mark.
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')

    try:
        number = float(text)
    except ValueError:
        number = None
    return number

def load_and_clean_csv(file):
    """Read a CSV into a DataFrame with normalised headers and parsed numbers.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path as ``file.name`` (for example an
            uploaded-file wrapper).

    Returns:
        A DataFrame whose column names have had all whitespace removed and
        whose columns, except ``Label`` and ``file_name``, have been passed
        through ``clean_numeric`` cell by cell (cells it cannot parse come
        back from it as ``None``).
    """
    import os  # local import: only needed for the path-type check below

    # Accept plain paths as well as upload objects. The isinstance check has
    # to come first: pathlib paths also have a ``.name`` attribute, but it
    # holds only the final path component, not the full path.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    # sep=None lets the python engine sniff the delimiter; dtype=str keeps
    # every cell as raw text so clean_numeric sees the original formatting.
    df = pd.read_csv(
        path,
        sep=None,
        engine='python',
        dtype=str
    )

    # Clean header: drop surrounding and embedded whitespace.
    df.columns = (
        df.columns
          .astype(str)
          .str.strip()
          .str.replace(r'\s+', '', regex=True)
    )

    # Clean numeric values; the two non-numeric columns are left as text.
    for col in df.columns:
        if col not in ('Label', 'file_name'):
            df[col] = df[col].apply(clean_numeric)

    return df

# ======================
# PREDICTION FUNCTION
# ======================
def predict_malware_csv(file):
    """Classify every row of an uploaded CSV with the stacked ensemble.

    Args:
        file: The uploaded file as handed over by the UI, or ``None`` when
            nothing was uploaded. It is forwarded to ``load_and_clean_csv``.

    Returns:
        A ``(status, table)`` tuple. On success ``table`` is the cleaned
        input DataFrame with two extra columns, ``Prediction`` ("Malware"
        where the meta model predicts 1, otherwise "Benign") and
        ``Malware_Probability`` (column 1 of the meta model's
        ``predict_proba``). When the input cannot be scored, ``status``
        describes the problem and ``table`` is ``None``.
    """
    # Submitting the form without a file would otherwise fail on file.name.
    if file is None:
        return "❌ No file uploaded", None

    df = load_and_clean_csv(file)

    # Check missing features. Listing them in the model's feature order keeps
    # the message deterministic (a set difference has arbitrary order).
    available = set(df.columns)
    missing = [name for name in feature_names if name not in available]
    if missing:
        return f"❌ Missing features: {missing}", None

    # A header-only file has nothing to score; report it instead of passing
    # an empty matrix to the models.
    if df.empty:
        return "❌ CSV file contains no data rows", None

    # Keep only needed features
    X = df[feature_names].copy()

    # NOTE(review): load_and_clean_csv already converts every column except
    # 'Label' and 'file_name', so this pass only matters if one of those is a
    # model feature. Kept to preserve the existing behaviour.
    for col in feature_names:
        X[col] = X[col].apply(clean_numeric)

    # Optional: fill NaN if needed
    # X = X.fillna(0)

    # Level-1 predictions: one column per base model, holding the
    # probability at class index 1 (assumed to be the malware class —
    # TODO confirm against the training code).
    meta_X = np.column_stack(
        [model.predict_proba(X)[:, 1] for _, model in base_models]
    )

    # Meta prediction
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    # Append results to a copy so the loaded frame is not mutated.
    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df


# ======================
# UI
# ======================
# Upload widget; the picker is limited to .csv files.
inputs = gr.File(file_types=[".csv"], label="Upload CSV file (features only)")

# One component per element of the (status, table) tuple returned by
# predict_malware_csv.
outputs = [gr.Textbox(label="Status"), gr.Dataframe(label="Prediction Results")]

# Wire the prediction function to the widgets above.
app = gr.Interface(
    predict_malware_csv,
    inputs,
    outputs,
    title="Stacking-based Malware Detection",
    description="Upload a CSV file.\n\n",
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    app.launch()