# mlp_csv / app.py
# hieu3636's picture
# Update app.py
# de31167 verified
import gradio as gr
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
import re
# =========================
# LOAD MODEL & SCALER
# =========================
# Both artifacts are read once at import time, relative to the working
# directory, and are then shared by every prediction request.
# NOTE: joblib.load unpickles the file, so scaler.pkl must come from a
# trusted source.
_MODEL_PATH = "mlp_malware.keras"
_SCALER_PATH = "scaler.pkl"

model = tf.keras.models.load_model(_MODEL_PATH)
scaler = joblib.load(_SCALER_PATH)
# =========================
# 30 SELECTED FEATURES
# =========================
# Column names that predict_csv requires in the uploaded CSV.
# The order of this list is significant: predict_csv indexes the cleaned
# frame with it, which fixes the column order of the matrix handed to the
# scaler and the model.
# NOTE(review): presumably this has to match the feature order used when
# the scaler and model were fitted -- confirm against the training code.
SELECTED_FEATURES = [
"filesize",
"E_file",
"E_text",
"E_data",
"AddressOfEntryPoint",
"NumberOfSections",
"SizeOfInitializedData",
"SizeOfImage",
"SizeOfOptionalHeader",
"SizeOfCode",
"DirectoryEntryImportSize",
"ImageBase",
"CheckSum",
"Magic",
"MinorLinkerVersion",
"MajorSubsystemVersion",
"e_lfanew",
"sus_sections",
"PointerToSymbolTable",
"SectionsLength",
"SizeOfStackReserve",
"MajorOperatingSystemVersion",
"non_sus_sections",
"Characteristics",
"NumberOfSymbols",
"BaseOfData",
"MajorImageVersion",
"FH_char5",
"FH_char8",
"OH_DLLchar5"
]
# Number of entries in SELECTED_FEATURES (30 for the list above).
N_FEATURES = len(SELECTED_FEATURES)
# =========================
# CLEAN NUMERIC (same as training)
# =========================
def clean_numeric(val):
    """Parse one raw CSV cell into a float, tolerating locale formatting.

    Handled forms (after all whitespace is removed):
      * plain and scientific notation: "42", "-1.5", "1.5e3"
      * decimal comma: "3,14" -> 3.14
      * repeated thousands separators: "1.234.567" / "1,234,567"
      * mixed separators, where the separator that occurs last is taken
        as the decimal mark: "1.234,56" and "1,234.56" -> 1234.56

    Args:
        val: Cell value; anything pandas considers missing yields None.

    Returns:
        The parsed float, or None when the value is missing or cannot be
        interpreted as a number.

    NOTE(review): the section header says this mirrors the training-time
    cleaning. Every value that parsed before still parses to the same
    number; only mixed-separator and multi-comma values, which used to
    come back as None, are newly accepted -- confirm the training
    pipeline treats them the same way.
    """
    if pd.isna(val):
        return None

    # Removing every whitespace run also covers leading/trailing blanks
    # and space-grouped digits such as "1 000".
    text = re.sub(r"\s+", "", str(val))

    # Scientific notation goes straight to float() so the separator
    # heuristics below can never touch its decimal point.
    if re.match(r"^-?\d+(\.\d+)?[eE][+-]?\d+$", text):
        return float(text)

    # More than one dot can only be thousands grouping.
    if text.count(".") > 1:
        text = text.replace(".", "")

    if "," in text:
        if "." in text:
            # Exactly one dot is left next to at least one comma: the
            # right-most separator is the decimal mark, the other one
            # groups thousands.
            if text.rfind(",") > text.rfind("."):
                text = text.replace(".", "").replace(",", ".")
            else:
                text = text.replace(",", "")
        elif text.count(",") > 1:
            # Several commas and no dot: thousands grouping.
            text = text.replace(",", "")
        else:
            # A lone comma is read as a decimal comma.
            text = text.replace(",", ".")

    try:
        return float(text)
    except ValueError:
        return None
# =========================
# LOAD & PREPROCESS CSV
# =========================
def load_and_clean_csv(file):
    """Read an uploaded CSV and coerce every remaining cell to a float.

    Steps:
      1. Read the file with delimiter sniffing, keeping all cells as text
         so that clean_numeric sees the raw formatting.
      2. Strip all whitespace from the column names.
      3. Drop label/identifier columns if they are present.
      4. Run clean_numeric over every remaining cell.

    Args:
        file: Either an object exposing the file path as ``.name`` (the
            only form the function accepted before) or a plain path
            string / PathLike.

    Returns:
        pandas.DataFrame whose cells are floats, or missing where
        clean_numeric could not parse the value.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable, empty or
        malformed file.
    """
    # Fall back to the argument itself when there is no ``.name``, so a
    # bare path works as well as an upload wrapper.
    path = getattr(file, "name", file)

    # sep=None asks pandas to sniff the delimiter, which needs the
    # python engine.
    df = pd.read_csv(path, sep=None, engine="python", dtype=str)

    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", "", regex=True)
    )

    df = df.drop(
        columns=["Label", "label", "class", "Class", "file_name"],
        errors="ignore",
    )

    for col in df.columns:
        df[col] = df[col].apply(clean_numeric)

    return df
# =========================
# PREDICTION FUNCTION
# =========================
def predict_csv(file):
    """Score every row of an uploaded PE-feature CSV with the loaded model.

    Args:
        file: The value delivered by the ``gr.File`` input; None when the
            user submitted without choosing a file.

    Returns:
        pandas.DataFrame containing the cleaned input columns, preceded
        by a 1-based ``row_id`` and followed by ``probability_malware``,
        ``prediction`` (1 or 0, thresholded at 0.5) and
        ``prediction_label`` ("malware" / "benign").

    Raises:
        gr.Error: if no file was uploaded, the CSV cannot be parsed,
            required feature columns are missing, the file has no data
            rows, or any row has a missing / non-numeric / non-finite
            value in a required feature.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    try:
        df = load_and_clean_csv(file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        raise gr.Error(f"Could not read the uploaded CSV: {exc}") from exc

    # load_and_clean_csv has already removed the label columns, so no
    # second drop is needed here.
    missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
    if missing_features:
        # The output component is a Dataframe, so a returned string is
        # not shown as a message; gr.Error surfaces the text in the UI.
        raise gr.Error(f"Missing required features: {missing_features}")

    if df.empty:
        raise gr.Error("The uploaded CSV contains no data rows.")

    # Select the required columns in SELECTED_FEATURES order.
    X = df[SELECTED_FEATURES].to_numpy().astype(float)

    # Cells clean_numeric could not parse arrive here as NaN. A NaN
    # probability compares False against 0.5, so such a row would be
    # reported as "benign" without warning -- reject the upload instead.
    invalid_rows = np.flatnonzero(~np.isfinite(X).all(axis=1)) + 1
    if invalid_rows.size:
        shown = ", ".join(str(r) for r in invalid_rows[:10])
        suffix = ", ..." if invalid_rows.size > 10 else ""
        raise gr.Error(
            "Rows with missing or non-numeric feature values: "
            f"{shown}{suffix}"
        )

    X_scaled = scaler.transform(X)

    # Flatten the model output to one probability per row.
    probs = model.predict(X_scaled).reshape(-1)
    preds = (probs > 0.5).astype(int)

    result = df.copy()
    result.insert(0, "row_id", range(1, len(df) + 1))
    result["probability_malware"] = probs
    result["prediction"] = preds
    result["prediction_label"] = result["prediction"].map(
        {1: "malware", 0: "benign"}
    )
    return result
# =========================
# GRADIO INTERFACE
# =========================
# The input and output components are named first so the Interface call
# below reads as plain wiring: one CSV upload in, one result table out.
_csv_upload = gr.File(label="Upload CSV file")
_result_table = gr.Dataframe(label="Prediction Result")

demo = gr.Interface(
    fn=predict_csv,
    inputs=_csv_upload,
    outputs=_result_table,
    title="Malware Detection",
    description="Upload a CSV file containing PE features. ",
)

# Start the web server as soon as the module is executed.
demo.launch()