# UI_stacking / app.py
# hieu3636's picture
# Update app.py
# ff5d94a verified
import joblib
import pandas as pd
import gradio as gr
import numpy as np
import re
# ======================
# LOAD MODEL
# ======================
# Load the persisted stacking ensemble once, at import time.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only point this at a trusted artifact.
artifact = joblib.load("stacking_model.pkl")

# base_models is a list of (name, model) pairs, meta_model is the level-2
# estimator, and feature_names lists the columns the models are fed.
base_models, meta_model, feature_names = (
    artifact[key] for key in ("base_models", "meta_model", "features")
)
# ======================
# CLEAN FUNCTION (same as training)
# ======================
# Compiled once: clean_numeric is applied to every cell of the uploaded table.
_WHITESPACE_RE = re.compile(r'\s+')
_SCI_NOTATION_RE = re.compile(r'^-?\d+(\.\d+)?[eE][+-]?\d+$')


def clean_numeric(val):
    """Parse a loosely formatted numeric cell into a float.

    Whitespace is removed first. The following notations are understood:

    * plain and scientific notation (``"12.5"``, ``"1.5e3"``)
    * comma as decimal mark (``"3,14"`` -> 3.14)
    * dots as thousands separators (``"1.234.567"``, ``"1.234.567,89"``)
    * mixed separators, where the right-most separator is taken as the
      decimal mark (``"1.234,56"`` and ``"1,234.56"`` -> 1234.56)
    * several commas and no dot, read as thousands separators
      (``"1,234,567"`` -> 1234567.0)

    The last two cases were previously rejected (returned ``None``); every
    input that parsed before still yields the same value.

    NOTE(review): the section header says this mirrors the training-time
    cleaner. Values that used to be dropped are now parsed, so confirm the
    training pipeline applies the same rules.

    Args:
        val: Raw cell value (string, number, ``None`` or NaN).

    Returns:
        The parsed float, or ``None`` when the value is missing or cannot be
        interpreted as a number.
    """
    if pd.isna(val):
        return None

    text = _WHITESPACE_RE.sub('', str(val))

    # Scientific notation is already in a form float() accepts.
    if _SCI_NOTATION_RE.match(text):
        return float(text)

    dots = text.count('.')
    commas = text.count(',')

    if dots > 1:
        # Several dots can only be thousands separators; a comma, if any,
        # is then the decimal mark.
        text = text.replace('.', '').replace(',', '.')
    elif dots == 1 and commas:
        # Both separators present: the right-most one is the decimal mark.
        if text.rfind(',') > text.rfind('.'):
            text = text.replace('.', '').replace(',', '.')
        else:
            text = text.replace(',', '')
    elif commas > 1:
        # Several commas and no dot: thousands separators.
        text = text.replace(',', '')
    elif commas == 1:
        # A single comma and no dot: decimal comma.
        text = text.replace(',', '.')

    try:
        return float(text)
    except ValueError:
        return None
def load_and_clean_csv(file):
    """Read an uploaded CSV and normalise its header and numeric cells.

    The file is read with every column as text (``dtype=str``) so that
    ``clean_numeric`` sees the raw cell content, and with ``sep=None`` on the
    python engine so pandas detects the delimiter itself.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object exposing the path through a ``.name`` attribute (the form
            the original code required).

    Returns:
        A DataFrame whose column names have all whitespace removed and whose
        columns, except ``Label`` and ``file_name``, have been passed through
        ``clean_numeric``.
    """
    # Paths are used directly. pathlib.Path also has a ``.name`` attribute,
    # but it is only the basename, so path-like objects must be checked first.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name

    df = pd.read_csv(
        path,
        sep=None,
        engine='python',
        dtype=str,
    )

    # Remove every whitespace character from the header names.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r'\s+', '', regex=True)
    )

    # Label / file_name are kept as raw text; everything else is parsed.
    for col in df.columns:
        if col not in ['Label', 'file_name']:
            df[col] = df[col].apply(clean_numeric)

    return df
# ======================
# PREDICTION FUNCTION
# ======================
def predict_malware_csv(file):
    """Classify every row of an uploaded CSV with the stacking ensemble.

    Each base model's positive-class probability becomes one column of the
    meta-model input; the meta-model then produces the final label and
    probability.

    Args:
        file: The uploaded file as handed over by the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(status, table)`` tuple. On success ``table`` is the cleaned
        input with two extra columns, ``Prediction`` ("Malware" / "Benign")
        and ``Malware_Probability``. On failure ``table`` is ``None`` and
        ``status`` explains what went wrong.
    """
    # Submitting the form without a file would otherwise crash on file.name.
    if file is None:
        return "❌ No file uploaded", None

    df = load_and_clean_csv(file)

    # Report missing columns in model feature order so the message is stable.
    present = set(df.columns)
    missing = [name for name in feature_names if name not in present]
    if missing:
        return f"❌ Missing features: {missing}", None

    # Nothing to score; the estimators would be handed an empty matrix.
    if df.empty:
        return "❌ CSV file contains no data rows", None

    # Keep only the columns the models were trained on, in training order.
    X = df[feature_names].copy()

    # load_and_clean_csv skips columns named 'Label' / 'file_name'; this pass
    # makes sure every model input column has been through clean_numeric.
    for col in feature_names:
        X[col] = X[col].apply(clean_numeric)

    # Cells that could not be parsed are missing values at this point and are
    # passed on unchanged.
    # NOTE(review): whether the stored estimators accept NaN is not visible
    # here - confirm, or impute (e.g. X.fillna(0)) to match training.

    # Level 1: one positive-class probability column per base model.
    meta_X = np.column_stack(
        [model.predict_proba(X)[:, 1] for _name, model in base_models]
    )

    # Level 2: final decision and probability from the meta-model.
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df
# ======================
# UI
# ======================
# Single upload widget, restricted to .csv files.
inputs = gr.File(label="Upload CSV file (features only)", file_types=[".csv"])

# predict_malware_csv returns (status, table): one output component for each.
outputs = [
    gr.Textbox(label="Status"),
    gr.Dataframe(label="Prediction Results"),
]

# Wire the prediction function to the widgets defined above.
app = gr.Interface(
    title="Stacking-based Malware Detection",
    description="Upload a CSV file.\n\n",
    fn=predict_malware_csv,
    inputs=inputs,
    outputs=outputs,
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    app.launch()