Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import tensorflow as tf | |
| import re | |
| # ========================= | |
| # LOAD MODEL & SCALER | |
| # ========================= | |
# Artifacts are addressed by relative path, so they are resolved against the
# process's current working directory.
_MODEL_PATH = "mlp_malware.keras"
_SCALER_PATH = "scaler.pkl"

# Keras model used by predict_csv to score rows.
model = tf.keras.models.load_model(_MODEL_PATH)

# Feature scaler applied to the inputs before they reach the model.
# NOTE: joblib.load unpickles the file, so scaler.pkl must come from a
# trusted source.
scaler = joblib.load(_SCALER_PATH)
# =========================
# 30 SELECTED FEATURES
# =========================
# Column names of the PE features fed to the model. The order is significant:
# predict_csv selects the columns in exactly this order before scaling, so do
# not sort or regroup the entries.
SELECTED_FEATURES = [
    "filesize", "E_file", "E_text", "E_data",
    "AddressOfEntryPoint", "NumberOfSections",
    "SizeOfInitializedData", "SizeOfImage",
    "SizeOfOptionalHeader", "SizeOfCode",
    "DirectoryEntryImportSize", "ImageBase",
    "CheckSum", "Magic",
    "MinorLinkerVersion", "MajorSubsystemVersion",
    "e_lfanew", "sus_sections",
    "PointerToSymbolTable", "SectionsLength",
    "SizeOfStackReserve", "MajorOperatingSystemVersion",
    "non_sus_sections", "Characteristics",
    "NumberOfSymbols", "BaseOfData",
    "MajorImageVersion", "FH_char5",
    "FH_char8", "OH_DLLchar5",
]

# Number of model input features (30).
N_FEATURES = len(SELECTED_FEATURES)
# =========================
# CLEAN NUMERIC (same as training)
# =========================
def clean_numeric(val):
    """Convert one raw CSV cell to a float, or None when it is not numeric.

    Handles, in this order:
      * missing values (anything pd.isna reports) -> None
      * embedded whitespace, which is removed
      * scientific notation such as "1.5e-3"
      * repeated dots used as thousands separators ("1.234.567")
      * mixed separators, where the LAST separator is taken as the decimal
        mark ("1.234,56" and "1,234.56" both give 1234.56)
      * repeated commas used as thousands separators ("1,234,567")
      * a single comma used as the decimal mark ("3,5" -> 3.5)

    Every input the previous implementation could parse still yields the same
    value; the mixed-separator and repeated-comma cases used to return None.

    NOTE(review): the surrounding section is labelled "same as training" --
    confirm the training-side cleaner accepts the same formats, otherwise
    values that were missing during training are now parsed at inference.

    Args:
        val: Raw cell value (typically a str, or NaN for an empty cell).

    Returns:
        The parsed float, or None if the value is missing or unparseable.
    """
    if pd.isna(val):
        return None

    text = re.sub(r"\s+", "", str(val))

    # Plain scientific notation needs no separator clean-up.
    if re.match(r"^-?\d+(\.\d+)?[eE][+-]?\d+$", text):
        return float(text)

    # More than one dot can only be thousands grouping.
    if text.count(".") > 1:
        text = text.replace(".", "")

    if "," in text:
        if "." in text:
            # Both separators present: whichever comes last is the decimal
            # mark, the other one is grouping.
            if text.rfind(",") > text.rfind("."):
                text = text.replace(".", "").replace(",", ".")
            else:
                text = text.replace(",", "")
        elif text.count(",") > 1:
            # Several commas and no dot: grouping only.
            text = text.replace(",", "")
        else:
            # A lone comma is treated as a decimal comma.
            text = text.replace(",", ".")

    try:
        return float(text)
    except ValueError:
        return None
| # ========================= | |
| # LOAD & PREPROCESS CSV | |
| # ========================= | |
def load_and_clean_csv(file):
    """Read an uploaded CSV and return it as a numeric DataFrame.

    Steps:
      1. Read the file with every cell as a string; the delimiter is
         auto-detected (sep=None with the python engine).
      2. Strip all whitespace from the column names.
      3. Drop label / identifier columns if present.
      4. Convert every remaining cell with clean_numeric, so unparseable
         cells become missing values.

    Args:
        file: Either a filesystem path (str or os.PathLike) or an object
            exposing the path as ``.name`` (the original call style).

    Returns:
        pandas.DataFrame with cleaned headers and numeric-or-missing cells.

    Raises:
        ValueError: If ``file`` is None.
    """
    if file is None:
        raise ValueError("No CSV file was provided.")

    # Paths are used directly; note pathlib.Path also has a ``.name`` (the
    # basename only), so path-likes must be checked before falling back to
    # the ``.name`` attribute of an upload wrapper.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name

    # 1. Read CSV (auto-detect delimiter); keep raw strings so clean_numeric
    #    decides how each cell is parsed.
    df = pd.read_csv(path, sep=None, engine="python", dtype=str)

    # 2. Clean header: remove leading/trailing and embedded whitespace.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", "", regex=True)
    )

    # 3. Drop label / identifier columns if they exist.
    df = df.drop(
        columns=["Label", "label", "class", "Class", "file_name"],
        errors="ignore",
    )

    # 4. Clean numeric values column by column.
    for col in df.columns:
        df[col] = df[col].apply(clean_numeric)

    return df
| # ========================= | |
| # PREDICTION FUNCTION | |
| # ========================= | |
def predict_csv(file):
    """Score every row of an uploaded CSV and return the annotated table.

    The CSV is loaded with load_and_clean_csv, the SELECTED_FEATURES columns
    are taken in their defined order, scaled with ``scaler`` and passed to
    ``model``. A row is labelled "malware" when its predicted probability is
    above 0.5, otherwise "benign".

    Problems with the input are reported by raising gr.Error so the message
    is shown in the UI. The output component is a gr.Dataframe, which does
    not display a returned string as text, so errors must not be returned as
    plain strings.

    Args:
        file: The value delivered by the gr.File input (None when nothing
            was uploaded).

    Returns:
        pandas.DataFrame containing the cleaned input columns plus
        ``row_id`` (1-based), ``probability_malware``, ``prediction`` (0/1)
        and ``prediction_label``.

    Raises:
        gr.Error: If no file was uploaded, required feature columns are
            missing, the file has no data rows, or any required feature
            value is missing / non-numeric / non-finite.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    # load_and_clean_csv already drops the label columns.
    df = load_and_clean_csv(file)

    # Check missing features.
    missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
    if missing_features:
        raise gr.Error(f"Missing required features: {missing_features}")

    if df.empty:
        raise gr.Error("The uploaded CSV file contains no data rows.")

    # Keep only the selected features, in model order, as a float matrix.
    X = df[SELECTED_FEATURES].astype(float).to_numpy()

    # Cells that could not be parsed are NaN here. Left alone they would
    # either fail inside the scaler/model or come back as NaN probabilities,
    # which the 0.5 threshold below would silently turn into "benign".
    incomplete = ~np.isfinite(X).all(axis=1)
    if incomplete.any():
        row_ids = (np.flatnonzero(incomplete) + 1).tolist()
        shown = ", ".join(str(r) for r in row_ids[:10])
        suffix = ", ..." if len(row_ids) > 10 else ""
        raise gr.Error(
            "Missing or non-numeric feature values in row(s): "
            f"{shown}{suffix}"
        )

    # Scale, then predict; flatten the model output to one value per row.
    X_scaled = scaler.transform(X)
    probs = model.predict(X_scaled).reshape(-1)
    preds = (probs > 0.5).astype(int)

    # Build output dataframe: original (cleaned) columns plus the results.
    result = df.copy()
    result.insert(0, "row_id", range(1, len(df) + 1))
    result["probability_malware"] = probs
    result["prediction"] = preds
    result["prediction_label"] = result["prediction"].map(
        {1: "malware", 0: "benign"}
    )
    return result
| # ========================= | |
| # GRADIO INTERFACE | |
| # ========================= | |
# Input widget: the uploaded CSV is handed to predict_csv.
_csv_upload = gr.File(label="Upload CSV file")

# Output widget: renders the table returned by predict_csv.
_result_table = gr.Dataframe(label="Prediction Result")

# Single-function UI wiring the upload to the prediction table.
demo = gr.Interface(
    fn=predict_csv,
    inputs=_csv_upload,
    outputs=_result_table,
    title="Malware Detection",
    description="Upload a CSV file containing PE features. ",
)

# Start the web app as soon as this module is executed.
demo.launch()