"""Gradio app: malware detection from PE-file features via a Keras MLP.

Pipeline: upload CSV -> clean headers/values exactly as during training ->
select the 30 training features -> scale -> predict probability of malware.
"""

import re

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

# =========================
# LOAD MODEL & SCALER
# =========================
model = tf.keras.models.load_model("mlp_malware.keras")
scaler = joblib.load("scaler.pkl")

# =========================
# 30 SELECTED FEATURES (column order must match the training pipeline)
# =========================
SELECTED_FEATURES = [
    "filesize", "E_file", "E_text", "E_data",
    "AddressOfEntryPoint", "NumberOfSections", "SizeOfInitializedData",
    "SizeOfImage", "SizeOfOptionalHeader", "SizeOfCode",
    "DirectoryEntryImportSize", "ImageBase", "CheckSum", "Magic",
    "MinorLinkerVersion", "MajorSubsystemVersion", "e_lfanew",
    "sus_sections", "PointerToSymbolTable", "SectionsLength",
    "SizeOfStackReserve", "MajorOperatingSystemVersion",
    "non_sus_sections", "Characteristics", "NumberOfSymbols",
    "BaseOfData", "MajorImageVersion", "FH_char5", "FH_char8",
    "OH_DLLchar5",
]
N_FEATURES = len(SELECTED_FEATURES)

# Label/metadata columns that must never reach the model.
_NON_FEATURE_COLUMNS = ["Label", "label", "class", "Class", "file_name"]

# Compiled once: scientific-notation literals are already valid floats.
_SCI_NOTATION_RE = re.compile(r"^-?\d+(\.\d+)?[eE][+-]?\d+$")


def clean_numeric(val):
    """Normalize one raw CSV cell to a float (same rules as training).

    Handles scientific notation, dot thousand-separators, and a comma
    used as the decimal mark. Returns None for missing or unparseable
    values.
    """
    if pd.isna(val):
        return None
    val = str(val).strip()
    val = re.sub(r"\s+", "", val)
    # Scientific notation: parse directly, before separator heuristics.
    if _SCI_NOTATION_RE.match(val):
        return float(val)
    # More than one dot -> dots are thousand separators, remove them.
    if val.count(".") > 1:
        val = val.replace(".", "")
    # Comma with no dot -> comma is the decimal mark.
    if "," in val and "." not in val:
        val = val.replace(",", ".")
    try:
        return float(val)
    except ValueError:
        return None


# =========================
# LOAD & PREPROCESS CSV
# =========================
def load_and_clean_csv(file):
    """Read an uploaded CSV and return a numerically-cleaned DataFrame.

    Headers are stripped of all whitespace, label/metadata columns are
    dropped, and every cell is converted with ``clean_numeric``.
    """
    # 1. Read everything as strings; sep=None lets pandas sniff the delimiter.
    df = pd.read_csv(file.name, sep=None, engine="python", dtype=str)

    # 2. Clean header: strip and remove internal whitespace.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", "", regex=True)
    )

    # 3. Drop label/metadata columns if present.
    df = df.drop(columns=_NON_FEATURE_COLUMNS, errors="ignore")

    # 4. Clean numeric values cell by cell.
    for col in df.columns:
        df[col] = df[col].apply(clean_numeric)
    return df


# =========================
# PREDICTION FUNCTION
# =========================
def predict_csv(file):
    """Predict a malware probability for each row of the uploaded CSV.

    Returns the cleaned DataFrame augmented with ``row_id``,
    ``probability_malware``, ``prediction`` and ``prediction_label``
    columns, or an error string when required features are missing.
    """
    # Label columns are already dropped by load_and_clean_csv.
    df = load_and_clean_csv(file)

    # All 30 training features must be present in the upload.
    missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
    if missing_features:
        return f"Missing required features: {missing_features}"

    # Keep only the selected features, in training order, as floats
    # (unparseable cells become NaN here).
    X = df[SELECTED_FEATURES].copy().values.astype(float)

    # Scale with the training-time scaler, then predict.
    X_scaled = scaler.transform(X)
    probs = model.predict(X_scaled).reshape(-1)
    preds = (probs > 0.5).astype(int)

    # Build the output table: original cleaned columns + predictions.
    result = df.copy()
    result.insert(0, "row_id", range(1, len(df) + 1))
    result["probability_malware"] = probs
    result["prediction"] = preds
    result["prediction_label"] = result["prediction"].map(
        {1: "malware", 0: "benign"}
    )
    return result


# =========================
# GRADIO INTERFACE
# =========================
demo = gr.Interface(
    fn=predict_csv,
    inputs=gr.File(label="Upload CSV file"),
    outputs=gr.Dataframe(label="Prediction Result"),
    title="Malware Detection",
    description=(
        "Upload a CSV file containing PE features. "
    ),
)

demo.launch()