Spaces:

mallware
/

UI_stacking

Sleeping

File size: 2,934 Bytes

import joblib
import pandas as pd
import gradio as gr
import numpy as np
import re

# ======================
# LOAD MODEL
# ======================
# NOTE: joblib.load unpickles the file, so the artifact must come from a trusted source.
artifact = joblib.load("stacking_model.pkl")

# Pull the three entries the app needs out of the artifact in one step:
#   "base_models" -> list of (name, model) pairs used as level-1 learners
#   "meta_model"  -> model that consumes the level-1 probabilities
#   "features"    -> column names selected from the uploaded CSV
base_models, meta_model, feature_names = (
    artifact[key] for key in ("base_models", "meta_model", "features")
)

# ======================
# CLEAN FUNCTION (same as training)
# ======================
def clean_numeric(val):
    """Parse a loosely formatted numeric cell into a float.

    All whitespace is removed first, then grouping/decimal separators are
    normalised before handing the text to ``float()``:

    * several dots            -> dots are thousands separators ("1.234.567")
    * dot and comma together  -> the right-most one is the decimal mark
                                 ("1.234,56" and "1,234.56" -> 1234.56)
    * several commas, no dot  -> commas are thousands separators ("1,234,567")
    * a single comma, no dot  -> comma is the decimal mark ("3,14")

    Scientific notation needs no special case because ``float()`` already
    accepts it.

    NOTE(review): every input the previous version parsed still yields the
    same value; the mixed-separator and multi-comma forms used to come back
    as None and are now parsed. Confirm the training-time cleaner is kept in
    step if exact parity matters.

    Args:
        val: Raw cell value (typically a string, a float, or a missing value).

    Returns:
        The parsed float, or None when the value is missing or cannot be
        interpreted as a number.
    """
    if pd.isna(val):
        return None

    text = re.sub(r'\s+', '', str(val))

    # More than one dot cannot be a decimal point, so treat them as grouping.
    if text.count('.') > 1:
        text = text.replace('.', '')

    comma_count = text.count(',')
    if comma_count:
        if '.' in text:
            # Both separators present: whichever appears last is the decimal mark.
            if text.rfind('.') > text.rfind(','):
                text = text.replace(',', '')
            else:
                text = text.replace('.', '').replace(',', '.')
        elif comma_count > 1:
            # Mirror of the multi-dot rule: repeated commas are grouping.
            text = text.replace(',', '')
        else:
            # Lone comma without any dot: European-style decimal comma.
            text = text.replace(',', '.')

    try:
        return float(text)
    except ValueError:
        return None

def load_and_clean_csv(file):
    """Read an uploaded CSV and normalise its header and numeric cells.

    The delimiter is auto-detected (``sep=None`` with the python engine) and
    every cell is initially read as a string. All whitespace is stripped out
    of the column names, and every column except ``Label`` and ``file_name``
    is passed through ``clean_numeric``.

    Args:
        file: Either a filesystem path (``str`` or path-like) or an object
            whose ``.name`` attribute holds the path of the uploaded file.
            NOTE(review): which of the two Gradio delivers depends on its
            version/configuration - both are accepted here.

    Returns:
        A DataFrame with cleaned column names; unparseable numeric cells
        are left as missing values.
    """
    # Paths are used as-is. The check must come before looking at `.name`,
    # because pathlib paths also have a `.name` that is only the basename.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        csv_path = file
    else:
        csv_path = file.name

    df = pd.read_csv(
        csv_path,
        sep=None,
        engine='python',
        dtype=str,
    )

    # Header cells often carry stray spaces; remove all whitespace so the
    # names can be matched exactly later on.
    df.columns = (
        df.columns
          .astype(str)
          .str.strip()
          .str.replace(r'\s+', '', regex=True)
    )

    # These two columns hold text, not measurements, so they stay untouched.
    passthrough_columns = ('Label', 'file_name')
    for col in df.columns:
        if col not in passthrough_columns:
            df[col] = df[col].apply(clean_numeric)

    return df

# ======================
# PREDICTION FUNCTION
# ======================
def predict_malware_csv(file):
    """Classify every row of an uploaded CSV with the stacked model.

    The CSV is loaded through ``load_and_clean_csv``; the columns listed in
    ``feature_names`` are fed to each base model, and the base models'
    class-1 probabilities become the input columns of ``meta_model``.

    Args:
        file: The upload as accepted by ``load_and_clean_csv``, or ``None``
            when the form was submitted without a file.

    Returns:
        A ``(status, table)`` tuple. On success ``table`` is the cleaned
        input frame extended with a ``Prediction`` column
        ("Malware"/"Benign") and a ``Malware_Probability`` column. When no
        file was supplied or required feature columns are absent, ``table``
        is ``None`` and ``status`` describes the problem.
    """
    # Without this guard a missing upload would fail with an AttributeError
    # instead of a readable status message.
    if file is None:
        return "❌ No file uploaded", None

    df = load_and_clean_csv(file)

    # Report absent feature columns in the model's own feature order so the
    # message is stable from run to run.
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        return f"❌ Missing features: {list(missing)}", None

    # Keep only the columns the models consume.
    X = df[feature_names].copy()

    # Columns that are already numeric were cleaned during loading and
    # re-cleaning them would give the same values; only the remaining
    # (object-typed) columns still need the pass.
    for col in feature_names:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = X[col].apply(clean_numeric)

    # NOTE: cells that could not be parsed stay missing and are handed to the
    # models unchanged; impute here (e.g. X.fillna(0)) if the base models
    # cannot accept missing values.

    # Level 1: one column of class-1 probabilities per base model.
    meta_X = np.column_stack(
        [model.predict_proba(X)[:, 1] for _name, model in base_models]
    )

    # Level 2: the meta model turns those probabilities into the final call.
    preds = meta_model.predict(meta_X)
    probs = meta_model.predict_proba(meta_X)[:, 1]

    # Return the full cleaned input alongside the verdicts.
    result_df = df.copy()
    result_df["Prediction"] = np.where(preds == 1, "Malware", "Benign")
    result_df["Malware_Probability"] = probs

    return "✅ Prediction completed", result_df


# ======================
# UI
# ======================
# Upload widget; the file picker is limited to .csv files.
inputs = gr.File(file_types=[".csv"], label="Upload CSV file (features only)")

# Output widgets, in the order predict_malware_csv returns its values:
# first the status text, then the results table.
outputs = [gr.Textbox(label="Status"), gr.Dataframe(label="Prediction Results")]

# Wire the prediction function to the widgets above.
app = gr.Interface(
    title="Stacking-based Malware Detection",
    description="Upload a CSV file.\n\n",
    fn=predict_malware_csv,
    inputs=inputs,
    outputs=outputs,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()