"""Gradio front end for a stacking-based malware classifier.

The script loads a pre-trained stacking artifact (base models plus a meta
model), accepts an uploaded CSV of feature values, normalises the numeric
cells, and shows a per-row prediction together with the malware probability.
"""

import os
import re
from typing import Optional

import gradio as gr
import joblib
import numpy as np
import pandas as pd

# ======================
# LOAD MODEL
# ======================
# SECURITY NOTE: joblib.load unpickles arbitrary Python objects, so the
# artifact must come from a trusted source. Never point this at a file that
# an end user can upload or overwrite.
artifact = joblib.load("stacking_model.pkl")

base_models = artifact["base_models"]   # list of (name, model)
meta_model = artifact["meta_model"]
feature_names = artifact["features"]

# Columns that load_and_clean_csv leaves untouched (not numeric-cleaned).
_NON_NUMERIC_COLUMNS = ("Label", "file_name")

# Compiled once: clean_numeric runs for every cell of the uploaded CSV.
_WHITESPACE_RE = re.compile(r'\s+')
_SCI_NOTATION_RE = re.compile(r'^-?\d+(\.\d+)?[eE][+-]?\d+$')


# ======================
# CLEAN FUNCTION (same as training)
# ======================
def clean_numeric(val) -> Optional[float]:
    """Convert a raw CSV cell to ``float``, or ``None`` if it cannot be parsed.

    The parsing rules are intentionally unchanged (the original is marked
    "same as training"):

    * missing values (``pd.isna``) -> ``None``
    * all whitespace is removed
    * scientific notation (``1.5e-3``) is parsed directly
    * more than one ``.`` means dots are thousand separators and are removed
    * a ``,`` with no ``.`` present is treated as the decimal separator
    * anything ``float()`` still rejects -> ``None``

    Args:
        val: Cell value; usually a string, but any scalar is accepted.

    Returns:
        The parsed float, or ``None`` when the value is missing or unparsable.
    """
    if pd.isna(val):
        return None

    # Removing every whitespace run also covers leading/trailing whitespace.
    text = _WHITESPACE_RE.sub('', str(val))

    # scientific notation
    if _SCI_NOTATION_RE.match(text):
        return float(text)

    # remove thousand separators
    if text.count('.') > 1:
        text = text.replace('.', '')

    # comma decimal -> dot
    if ',' in text and '.' not in text:
        text = text.replace(',', '.')

    try:
        return float(text)
    except ValueError:
        return None


def load_and_clean_csv(file):
    """Read an uploaded CSV and normalise its header and numeric cells.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (e.g. a temp-file wrapper).
            Gradio hands over one or the other depending on version and
            component configuration.

    Returns:
        A DataFrame whose column names have all whitespace removed and whose
        columns, except ``Label`` and ``file_name``, have been passed through
        ``clean_numeric``.
    """
    # A pathlib.Path also has a ``.name`` attribute (the basename only), so
    # path-like inputs must be recognised before falling back to ``.name``.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    # sep=None lets the python engine sniff the delimiter; dtype=str keeps the
    # raw text so clean_numeric decides how each cell is interpreted.
    df = pd.read_csv(
        path,
        sep=None,
        engine='python',
        dtype=str
    )

    # clean header
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r'\s+', '', regex=True)
    )

    # clean numeric values
    for col in df.columns:
        if col not in _NON_NUMERIC_COLUMNS:
            df[col] = df[col].apply(clean_numeric)

    return df


# ======================
# PREDICTION FUNCTION
# ======================
def predict_malware_csv(file):
    """Run the stacking model on every row of an uploaded CSV.

    Args:
        file: The upload as delivered by the Gradio ``File`` component
            (path or file-like object), or ``None`` when nothing was uploaded.

    Returns:
        A ``(status, result)`` tuple. On success ``result`` is the cleaned
        input DataFrame with two extra columns, ``Prediction``
        ("Malware"/"Benign") and ``Malware_Probability``. On failure
        ``result`` is ``None`` and ``status`` describes the problem.
    """
    if file is None:
        return "❌ No file uploaded", None

    try:
        df = load_and_clean_csv(file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError) as exc:
        # Report unreadable uploads in the status box instead of a traceback.
        return f"❌ Could not read CSV: {exc}", None

    # Check missing features (sorted so the message is deterministic)
    missing = sorted(set(feature_names) - set(df.columns))
    if missing:
        return f"❌ Missing features: {missing}", None

    if df.empty:
        # A header-only file would otherwise fail inside the models.
        return "❌ CSV contains no data rows", None

    # Keep only needed features
    X = df[feature_names].copy()

    # load_and_clean_csv already cleaned every column except the ones in
    # _NON_NUMERIC_COLUMNS; only those still need cleaning if the model
    # happens to use them as features.
    for col in feature_names:
        if col in _NON_NUMERIC_COLUMNS:
            X[col] = X[col].apply(clean_numeric)

    # A column where every value failed to parse is left as object dtype
    # (all None); force float so unparsable cells become NaN uniformly.
    X = X.astype(float)

    # Optional: fill NaN if needed
    # X = X.fillna(0)

    # Level-1 predictions: positive-class probability from each base model
    meta_inputs = [model.predict_proba(X)[:, 1] for _name, model in base_models]
    meta_X = np.column_stack(meta_inputs)

    # Meta prediction
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    # Append results
    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df


# ======================
# UI
# ======================
inputs = gr.File(
    label="Upload CSV file (features only)",
    file_types=[".csv"]
)

outputs = [
    gr.Textbox(label="Status"),
    gr.Dataframe(label="Prediction Results")
]

app = gr.Interface(
    fn=predict_malware_csv,
    inputs=inputs,
    outputs=outputs,
    title="Stacking-based Malware Detection",
    description=(
        "Upload a CSV file.\n\n"
    )
)

if __name__ == "__main__":
    app.launch()