File size: 3,721 Bytes
1a43fb5
 
 
 
 
de31167
1a43fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96f75f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a43fb5
96f75f0
1a43fb5
 
 
 
96f75f0
1a43fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
import re
# =========================
# LOAD MODEL & SCALER
# =========================
# Artifact file names; relative paths, so they are looked up from the
# process's current working directory.
MODEL_PATH = "mlp_malware.keras"
SCALER_PATH = "scaler.pkl"

# Both artifacts are loaded once at import time and reused by predict_csv.
# NOTE: joblib.load unpickles its input, so scaler.pkl must be a trusted file.
model = tf.keras.models.load_model(MODEL_PATH)
scaler = joblib.load(SCALER_PATH)

# =========================
# 30 SELECTED FEATURES
# =========================

# Column names the uploaded CSV must provide. The order is significant:
# predict_csv selects the columns in exactly this order before handing the
# values to the scaler and the model.
SELECTED_FEATURES = [
    "filesize", "E_file", "E_text",
    "E_data", "AddressOfEntryPoint", "NumberOfSections",
    "SizeOfInitializedData", "SizeOfImage", "SizeOfOptionalHeader",
    "SizeOfCode", "DirectoryEntryImportSize", "ImageBase",
    "CheckSum", "Magic", "MinorLinkerVersion",
    "MajorSubsystemVersion", "e_lfanew", "sus_sections",
    "PointerToSymbolTable", "SectionsLength", "SizeOfStackReserve",
    "MajorOperatingSystemVersion", "non_sus_sections", "Characteristics",
    "NumberOfSymbols", "BaseOfData", "MajorImageVersion",
    "FH_char5", "FH_char8", "OH_DLLchar5",
]

# Number of model input features, derived from the list above.
N_FEATURES = len(SELECTED_FEATURES)
# =========================
# CLEAN NUMERIC (same as training)
# =========================
def clean_numeric(val):
    """Parse one raw cell into a float, or return None when it cannot be parsed.

    Steps, in order:
      * NA values (as judged by ``pd.isna``) yield None.
      * The value is turned into a string and all whitespace is removed.
      * Text in scientific notation (e.g. ``1e3``) is converted as-is.
      * Otherwise, when the text holds more than one ``.``, every dot is
        dropped (they are taken to be thousands separators); after that, a
        text containing ``,`` but no ``.`` has its commas turned into dots
        (decimal comma).
      * The result goes through ``float``; a ValueError yields None.

    Note that a value with several commas and no dot (``1,234,567``) ends up
    with several dots and therefore yields None.
    """
    if pd.isna(val):
        return None

    text = re.sub(r"\s+", "", str(val).strip())

    # Scientific notation needs no separator normalisation.
    is_scientific = re.match(r"^-?\d+(\.\d+)?[eE][+-]?\d+$", text) is not None
    if not is_scientific:
        # Several dots: treat them as thousands separators.
        if text.count(".") > 1:
            text = text.replace(".", "")
        # Comma without any dot: treat it as the decimal mark.
        if "," in text and "." not in text:
            text = text.replace(",", ".")

    try:
        number = float(text)
    except ValueError:
        return None
    return number


# =========================
# LOAD & PREPROCESS CSV
# =========================
def load_and_clean_csv(file):
    """Read an uploaded CSV and return it with every cell parsed as a number.

    Args:
        file: Object whose ``name`` attribute is the path of the CSV to read.

    Returns:
        A DataFrame whose column names have all whitespace removed, without
        the label / file-name columns, and whose cells are the result of
        ``clean_numeric`` (a float, or None where parsing failed).
    """
    # Read everything as text; sep=None lets pandas sniff the delimiter,
    # which requires the python engine.
    raw = pd.read_csv(file.name, sep=None, engine="python", dtype=str)

    # Normalise the header: trim, then remove any inner whitespace.
    trimmed_header = raw.columns.astype(str).str.strip()
    raw.columns = trimmed_header.str.replace(r"\s+", "", regex=True)

    # Columns that are not model inputs; absent ones are silently skipped.
    non_feature_columns = ["Label", "label", "class", "Class", "file_name"]
    cleaned = raw.drop(columns=non_feature_columns, errors="ignore")

    # Convert the remaining text cells column by column.
    for column_name in cleaned.columns:
        cleaned[column_name] = cleaned[column_name].apply(clean_numeric)

    return cleaned
# =========================
# PREDICTION FUNCTION
# =========================
def predict_csv(file):
    """Score every row of an uploaded CSV with the loaded model.

    Args:
        file: Value delivered by the Gradio file input; passed unchanged to
            ``load_and_clean_csv``. ``None`` when nothing was uploaded.

    Returns:
        A DataFrame holding the cleaned input columns, preceded by a 1-based
        ``row_id`` and followed by ``probability_malware`` (model output),
        ``prediction`` (1 when the probability exceeds 0.5, else 0) and
        ``prediction_label`` ("malware" / "benign").

    Raises:
        gr.Error: When no file was uploaded, when required feature columns
            are missing, when the file has no data rows, or when a required
            feature holds a missing, non-numeric or non-finite value.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    df = load_and_clean_csv(file)

    # Drop label columns if exist
    df = df.drop(columns=["Label", "label", "class", "Class"], errors="ignore")

    # Check missing features. The result is rendered by a Dataframe output,
    # so the problem is reported through gr.Error instead of returning text.
    missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
    if missing_features:
        raise gr.Error(f"Missing required features: {missing_features}")

    # A header-only file has nothing to score.
    if df.empty:
        raise gr.Error("The uploaded CSV contains no data rows.")

    # Keep only selected features & correct order, as a float matrix.
    # Cells that clean_numeric could not parse are None and become NaN here.
    X = df[SELECTED_FEATURES].values.astype(float)

    # Reject NaN / inf up front: a NaN probability compares False against the
    # 0.5 threshold and would be reported as "benign" without any warning.
    invalid = ~np.isfinite(X)
    if invalid.any():
        bad_columns = [
            name
            for name, has_bad in zip(SELECTED_FEATURES, invalid.any(axis=0))
            if has_bad
        ]
        raise gr.Error(
            f"Missing or non-numeric values in required features: {bad_columns}"
        )

    # Scale
    X_scaled = scaler.transform(X)

    # Predict; flatten the model output to one probability per row.
    probs = model.predict(X_scaled).reshape(-1)
    preds = (probs > 0.5).astype(int)

    # Build output dataframe
    result = df.copy()
    result.insert(0, "row_id", range(1, len(df) + 1))
    result["probability_malware"] = probs
    result["prediction"] = preds
    result["prediction_label"] = result["prediction"].map(
        {1: "malware", 0: "benign"}
    )

    return result

# =========================
# GRADIO INTERFACE
# =========================
# Single-function UI: a file upload feeds predict_csv and the returned
# DataFrame is shown in a table.
demo = gr.Interface(
    fn=predict_csv,
    inputs=gr.File(label="Upload CSV file"),
    outputs=gr.Dataframe(label="Prediction Result"),
    title="Malware Detection",
    description=(
        "Upload a CSV file containing PE features. "
    ),
)

# Start the server only when this file is run as a script, so that importing
# the module (e.g. to reuse predict_csv) does not block on a running server.
if __name__ == "__main__":
    demo.launch()