Spaces:

malware-USTH
/

mlp_csv

Sleeping

App Files Files Community

mlp_csv / app.py

hieu3636

Update app.py

de31167 verified 2 months ago

raw

history blame contribute delete

3.72 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import joblib
	import tensorflow as tf
	import re
	# =========================
	# LOAD MODEL & SCALER
	# =========================
	# Both artifacts are read once, when the module is executed, from paths
	# relative to the current working directory.
	MODEL_PATH = "mlp_malware.keras"
	SCALER_PATH = "scaler.pkl"

	model = tf.keras.models.load_model(MODEL_PATH)
	scaler = joblib.load(SCALER_PATH)

	# =========================
	# 30 SELECTED FEATURES
	# =========================
	# Column names, in a fixed order. This must stay a list: it is used
	# directly as a DataFrame column selector.
	SELECTED_FEATURES = [
	    "filesize", "E_file", "E_text", "E_data",
	    "AddressOfEntryPoint", "NumberOfSections",
	    "SizeOfInitializedData", "SizeOfImage",
	    "SizeOfOptionalHeader", "SizeOfCode",
	    "DirectoryEntryImportSize", "ImageBase",
	    "CheckSum", "Magic",
	    "MinorLinkerVersion", "MajorSubsystemVersion",
	    "e_lfanew", "sus_sections",
	    "PointerToSymbolTable", "SectionsLength",
	    "SizeOfStackReserve", "MajorOperatingSystemVersion",
	    "non_sus_sections", "Characteristics",
	    "NumberOfSymbols", "BaseOfData",
	    "MajorImageVersion",
	    "FH_char5", "FH_char8", "OH_DLLchar5",
	]

	# Number of entries in SELECTED_FEATURES.
	N_FEATURES = len(SELECTED_FEATURES)
	# =========================
	# CLEAN NUMERIC (same as training)
	# =========================
	# Exponent notation is accepted as-is, before any separator handling.
	_SCI_NOTATION_RE = re.compile(r"^-?\d+(\.\d+)?[eE][+-]?\d+$")
	# "1,234,567" / "1,234.56": commas group thousands, optional dot decimal.
	_COMMA_GROUPED_RE = re.compile(r"^[+-]?\d{1,3}(,\d{3})+(\.\d+)?$")
	# "1.234,56": one dot groups thousands and the comma is the decimal mark.
	_DOT_GROUPED_RE = re.compile(r"^[+-]?\d{1,3}\.\d{3},\d+$")


	def clean_numeric(val):
	    """Convert one raw cell value to a float, or None if it is not numeric.

	    All whitespace is removed first. The following spellings are accepted:

	    * anything ``float()`` parses directly, including exponent notation;
	    * several dots used as thousand separators ("1.234.567,89");
	    * a single comma used as the decimal mark ("3,14");
	    * strictly formatted comma-grouped thousands ("1,234,567",
	      "1,234.56") and dot-grouped thousands with a decimal comma
	      ("1.234,56").

	    Every string the previous implementation could parse yields the same
	    float; only the grouped spellings in the last bullet, which used to
	    come back as None, are new. A lone comma without a dot ("1,234") is
	    still read as a decimal mark (1.234).
	    NOTE(review): the banner says this mirrors the training-time cleaning;
	    confirm the training pipeline treats grouped numbers the same way.

	    Args:
	        val: The raw cell (typically a string, or a pandas missing value).

	    Returns:
	        The parsed float, or None when ``val`` is missing or unparseable.
	    """
	    if pd.isna(val):
	        return None

	    text = re.sub(r"\s+", "", str(val))

	    if _SCI_NOTATION_RE.match(text):
	        return float(text)

	    # More than one dot can only mean thousand separators.
	    if text.count(".") > 1:
	        text = text.replace(".", "")

	    comma_count = text.count(",")
	    if comma_count:
	        has_dot = "." in text
	        if (has_dot or comma_count > 1) and _COMMA_GROUPED_RE.match(text):
	            # Commas are grouping only: drop them, keep the decimal dot.
	            text = text.replace(",", "")
	        elif has_dot and _DOT_GROUPED_RE.match(text):
	            # Dot is grouping, comma is the decimal mark.
	            text = text.replace(".", "").replace(",", ".")
	        elif not has_dot:
	            # Comma used as the decimal mark.
	            text = text.replace(",", ".")

	    try:
	        return float(text)
	    except ValueError:
	        return None


	# =========================
	# LOAD & PREPROCESS CSV
	# =========================
	def load_and_clean_csv(file):
	    """Read an uploaded CSV and return it with every cell made numeric.

	    Args:
	        file: The CSV to read: either a filesystem path (``str`` or any
	            path-like object) or an object exposing the path as ``.name``.

	    Returns:
	        A DataFrame whose header names have all whitespace removed, with
	        the label and ``file_name`` columns dropped, and with every
	        remaining cell replaced by the result of ``clean_numeric``.

	    Raises:
	        ValueError: If ``file`` is None, i.e. nothing was provided.
	    """
	    if file is None:
	        raise ValueError("No CSV file was provided.")

	    # A path is used directly; taking ``.name`` of a pathlib.Path would
	    # keep only the basename. Anything else is treated as a file wrapper.
	    if isinstance(file, str) or hasattr(file, "__fspath__"):
	        csv_path = file
	    else:
	        csv_path = file.name

	    # 1. Read CSV. sep=None lets the python engine sniff the delimiter;
	    #    dtype=str keeps the raw text so clean_numeric sees it unmodified.
	    df = pd.read_csv(csv_path, sep=None, engine="python", dtype=str)

	    # 2. Clean header: strip and remove any inner whitespace.
	    df.columns = (
	        df.columns
	        .astype(str)
	        .str.strip()
	        .str.replace(r"\s+", "", regex=True)
	    )

	    # 3. Drop label / identifier columns if they exist.
	    df = df.drop(
	        columns=["Label", "label", "class", "Class", "file_name"],
	        errors="ignore",
	    )

	    # 4. Clean numeric values, column by column.
	    for col in df.columns:
	        df[col] = df[col].apply(clean_numeric)

	    return df
	# =========================
	# PREDICTION FUNCTION
	# =========================
	def predict_csv(file):
	    """Score every row of an uploaded CSV with the loaded model.

	    Args:
	        file: The uploaded CSV, as accepted by ``load_and_clean_csv``.

	    Returns:
	        A DataFrame holding the cleaned input columns plus ``row_id``
	        (1-based, inserted as the first column), ``probability_malware``
	        (the model output), ``prediction`` (1 when the probability
	        exceeds 0.5, otherwise 0) and ``prediction_label``
	        ("malware" for 1, "benign" for 0).

	    Raises:
	        gr.Error: If no file was uploaded, a required feature column is
	            missing, the file has no data rows, or a feature cell is
	            missing / not a finite number.
	    """
	    if file is None:
	        raise gr.Error("Please upload a CSV file first.")

	    # Label columns are already removed by load_and_clean_csv.
	    df = load_and_clean_csv(file)

	    # Check missing features
	    missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
	    if missing_features:
	        # The output component is a Dataframe, so a message returned as a
	        # plain string cannot be displayed; raise a UI error instead.
	        raise gr.Error(f"Missing required features: {missing_features}")

	    if df.empty:
	        raise gr.Error("The uploaded CSV contains no data rows.")

	    # Keep only selected features, in the declared order, as floats.
	    X = df[SELECTED_FEATURES].values.astype(float)

	    # A NaN feature would propagate to a NaN probability, and NaN > 0.5 is
	    # False, so the row would be reported as "benign". Refuse instead.
	    bad_rows = np.flatnonzero(~np.isfinite(X).all(axis=1))
	    if bad_rows.size:
	        preview = ", ".join(str(i + 1) for i in bad_rows[:10])
	        suffix = ", ..." if bad_rows.size > 10 else ""
	        raise gr.Error(
	            f"{bad_rows.size} row(s) have missing or non-numeric feature "
	            f"values (row_id: {preview}{suffix})."
	        )

	    # Scale
	    X_scaled = scaler.transform(X)

	    # Predict; verbose=0 keeps the progress bar out of the server log.
	    probs = model.predict(X_scaled, verbose=0).reshape(-1)
	    preds = (probs > 0.5).astype(int)

	    # Build output dataframe
	    result = df.copy()
	    result.insert(0, "row_id", range(1, len(df) + 1))
	    result["probability_malware"] = probs
	    result["prediction"] = preds
	    result["prediction_label"] = result["prediction"].map(
	        {1: "malware", 0: "benign"}
	    )

	    return result

	# =========================
	# GRADIO INTERFACE
	# =========================
	# One file upload in, one table out; predict_csv is the callback.
	csv_upload = gr.File(label="Upload CSV file")
	result_table = gr.Dataframe(label="Prediction Result")

	demo = gr.Interface(
	    fn=predict_csv,
	    inputs=csv_upload,
	    outputs=result_table,
	    title="Malware Detection",
	    description="Upload a CSV file containing PE features. ",
	)

	# Launched unconditionally, as soon as this module is executed.
	demo.launch()