Spaces:
Sleeping
Sleeping
File size: 2,934 Bytes
1d2f225 ff5d94a 9f19f6e ff5d94a 9f19f6e 1d2f225 ff5d94a 1d2f225 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | import joblib
import pandas as pd
import gradio as gr
import numpy as np
import re
# ======================
# LOAD MODEL
# ======================
# The pickle bundles the level-1 models, the level-2 model and the feature list.
artifact = joblib.load("stacking_model.pkl")

# base_models is a list of (name, model) pairs; feature_names is the column
# list the handler selects from the uploaded CSV before predicting.
base_models, meta_model, feature_names = (
    artifact[key] for key in ("base_models", "meta_model", "features")
)
# ======================
# CLEAN FUNCTION (same as training)
# ======================
def clean_numeric(val):
    """Convert one raw CSV cell to ``float``, or ``None`` when it cannot be parsed.

    Handling, in order:
      * missing values (``pd.isna``) give ``None``;
      * all whitespace is removed;
      * scientific notation such as ``1e-3`` is converted directly;
      * when the text has more than one ``.``, every ``.`` is dropped
        (treated as thousand separators);
      * when the text has a ``,`` and no ``.``, every ``,`` becomes ``.``
        (decimal comma).

    Text that still is not a valid float afterwards -- including values that
    mix ``,`` with a single ``.`` such as ``1,234.56`` -- yields ``None``.
    """
    if pd.isna(val):
        return None

    text = re.sub(r'\s+', '', str(val).strip())

    looks_scientific = re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', text) is not None
    if not looks_scientific:
        # Several dots can only be thousand separators: drop them all.
        if text.count('.') > 1:
            text = text.replace('.', '')
        # A comma with no dot left is read as the decimal mark.
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')

    try:
        number = float(text)
    except ValueError:
        number = None
    return number
def load_and_clean_csv(file):
    """Read a CSV as text, normalise its header, and clean its numeric cells.

    Args:
        file: Either a filesystem path given as ``str``, or an object that
            exposes the path through a ``name`` attribute (the form used by
            the upload handler in this file).

    Returns:
        A ``pandas.DataFrame`` whose column names have all whitespace
        removed and whose columns -- except ``Label`` and ``file_name`` --
        have been passed cell by cell through ``clean_numeric``.
    """
    # Plain path strings are used directly; anything else is expected to
    # carry the path in `.name`, as before.
    path = file if isinstance(file, str) else file.name

    # dtype=str keeps every cell as raw text so clean_numeric sees the
    # original formatting; sep=None asks the python engine to detect the
    # delimiter itself.
    df = pd.read_csv(path, sep=None, engine='python', dtype=str)

    # clean header
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r'\s+', '', regex=True)
    )

    # clean numeric values (identifier/label columns stay as text)
    for col in df.columns:
        if col not in ('Label', 'file_name'):
            df[col] = df[col].apply(clean_numeric)

    return df
# ======================
# PREDICTION FUNCTION
# ======================
def predict_malware_csv(file):
    """Classify every row of an uploaded CSV with the stacking ensemble.

    Args:
        file: The upload passed in by the UI, or ``None`` when the form was
            submitted without a file.

    Returns:
        A ``(status, result)`` tuple. On success ``status`` is a confirmation
        message and ``result`` is the cleaned input DataFrame with two extra
        columns: ``Prediction`` ("Malware" where the meta-model predicts 1,
        otherwise "Benign") and ``Malware_Probability`` (second column of
        the meta-model's ``predict_proba``). On a handled problem ``status``
        describes it and ``result`` is ``None``.
    """
    # Submitting the form with no upload passes None; report it instead of
    # failing on the attribute access inside the loader.
    if file is None:
        return "❌ No file uploaded", None

    df = load_and_clean_csv(file)

    # Check missing features (listed in the model's own feature order so the
    # message is deterministic).
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        return f"❌ Missing features: {missing}", None

    # A header-only CSV would hand zero samples to the models.
    if df.empty:
        return "❌ The uploaded CSV contains no data rows", None

    # Keep only needed features
    X = df[feature_names].copy()

    # The loader skips columns named 'Label' / 'file_name'; this pass makes
    # sure every feature column is cleaned regardless of its name.
    for col in feature_names:
        X[col] = X[col].apply(clean_numeric)

    # NOTE(review): cells that could not be parsed stay missing here --
    # confirm the base models accept that, or impute the same way as in
    # training.

    # Level-1 predictions: one positive-class probability column per model.
    meta_X = np.column_stack(
        [model.predict_proba(X)[:, 1] for _name, model in base_models]
    )

    # Meta prediction
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    # Append results to a copy so the cleaned input frame is left untouched.
    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df
# ======================
# UI
# ======================
# File picker restricted to CSV uploads.
inputs = gr.File(file_types=[".csv"], label="Upload CSV file (features only)")

# Status text first, results table second -- the same order as the tuple
# returned by predict_malware_csv.
outputs = [gr.Textbox(label="Status"), gr.Dataframe(label="Prediction Results")]

app = gr.Interface(
    title="Stacking-based Malware Detection",
    description="Upload a CSV file.\n\n",
    fn=predict_malware_csv,
    inputs=inputs,
    outputs=outputs,
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
|