"""Streamlit deployment application for Backdoor Android malware detection.

Principal investigator : RAMA ARIA MEGANTARA
Co-investigator        : DEWI PERGIWATI
Institution            : UNIVERSITAS DIAN NUSWANTORO
Grant                  : PENELITIAN DASAR PERGURUAN TINGGI 2025/2026 Semester Gasal
Dataset                : CCCS-CIC-AndMal-2020 (University of New Brunswick)
"""

import json
import os
import pathlib

import joblib
import numpy as np
import pandas as pd
import streamlit as st

# =============================================================================
# PAGE CONFIG -- must be first Streamlit call
# =============================================================================
st.set_page_config(
    page_title="Backdoor Malware Detection",
    layout="wide",
    initial_sidebar_state="expanded",
)

# =============================================================================
# CONSTANTS
# Paths are anchored to this script's directory so they resolve correctly
# regardless of the working directory set by the deployment environment.
# =============================================================================
BASE_DIR = pathlib.Path(__file__).parent
MODEL_DIR = BASE_DIR / "model"
FIGURES_DIR = BASE_DIR / "figures"

# Model artifacts and the tables shown on the SHAP page.
MODEL_PATH = MODEL_DIR / "best_model.joblib"
CONFIG_PATH = MODEL_DIR / "model_config.json"
PERM_PATH = MODEL_DIR / "top500_permissions.json"
SIG_PATH = MODEL_DIR / "table_backdoor_signature.csv"
NAMED_SIG_PATH = MODEL_DIR / "table_permission_signature_named.csv"

# Pre-rendered SHAP figures.
FIG_BAR_PATH = FIGURES_DIR / "fig_shap_bar_top50.png"
FIG_BEE_PATH = FIGURES_DIR / "fig_shap_beeswarm_top20.png"
FIG_GROUP_PATH = FIGURES_DIR / "fig_shap_group_bar.png"

# External resources referenced by the "about" page.
UDINUS_LOGO_URL = "https://dinus.ac.id/wp-content/uploads/2022/12/Logo-Udinus-Official-02-300x300-1.png"
JOURNAL_URL = "https://journal.universitasbumigora.ac.id/matrik/article/view/6198"
DATASET_URL = "https://www.unb.ca/cic/datasets/andmal2020.html"

# Upper bound on the number of CSV rows scored per upload.
MAX_ROWS = 100

# =============================================================================
# GLOBAL CSS
# =============================================================================
st.markdown(""" """, unsafe_allow_html=True)

# Width of the raw feature space: every uploaded CSV must provide columns
# named "0" .. "9502". Built once so the template generator and the upload
# validation share the same list.
N_FEATURES = 9503
FEATURE_COLUMNS = [str(i) for i in range(N_FEATURES)]


# =============================================================================
# CACHED RESOURCES
# =============================================================================
@st.cache_resource
def load_model():
    """Return the classifier deserialized from MODEL_PATH.

    NOTE(review): joblib.load unpickles the file, so MODEL_PATH must only
    ever point at a trusted artifact shipped with the application.
    """
    return joblib.load(MODEL_PATH)


@st.cache_data
def load_config():
    """Return the parsed JSON content of CONFIG_PATH."""
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        return json.load(f)


@st.cache_data
def load_sample_bytes():
    """Generate sample input CSV in memory. 9503 columns named 0..9502, one row of zeros."""
    df_sample = pd.DataFrame([[0] * N_FEATURES], columns=FEATURE_COLUMNS)
    return df_sample.to_csv(index=False).encode("utf-8")


@st.cache_data
def load_signature_table():
    """Return the CSV at SIG_PATH as a DataFrame indexed by its first column."""
    return pd.read_csv(SIG_PATH, index_col=0)


@st.cache_data
def load_named_signature_table():
    """Return the CSV at NAMED_SIG_PATH as a DataFrame indexed by its first column."""
    return pd.read_csv(NAMED_SIG_PATH, index_col=0)


def _show_figure(path):
    """Display the image at *path* at container width, or warn if it is missing."""
    if os.path.exists(path):
        # str() so the call also works on Streamlit releases whose st.image
        # only accepts string paths rather than pathlib.Path objects.
        st.image(str(path), use_container_width=True)
    else:
        st.warning("File gambar tidak ditemukan: {}".format(path))


# =============================================================================
# SIDEBAR NAVIGATION
# =============================================================================
st.sidebar.markdown("""
Penelitian Hibah DIKTI
Backdoor Android
Malware Detection
Universitas Dian Nuswantoro
""", unsafe_allow_html=True)
st.sidebar.divider()
st.sidebar.markdown("\nNavigasi\n", unsafe_allow_html=True)
page = st.sidebar.radio(
    label="Pilih halaman",
    options=["Prediksi", "SHAP Signature", "Tentang Penelitian"],
    label_visibility="collapsed",
)
st.sidebar.divider()
st.sidebar.markdown("""
Model
Random Forest + SMOTE

Dataset
CCCS-CIC-AndMal-2020

Fitur
9503 static features
Top-500 selected

Max input
100 baris per upload
""", unsafe_allow_html=True)


# =============================================================================
# PAGE 1: PREDIKSI
# =============================================================================
def _render_prediction_page():
    """Render the upload form and, once a CSV is uploaded, the predictions.

    The uploaded CSV is validated (readable, all expected columns present,
    at least one data row), truncated to MAX_ROWS, reduced to the feature
    columns listed under "top500_original_indices" in the model config, and
    scored with the cached model. Validation failures are reported with
    st.error followed by st.stop.
    """
    st.title("Prediksi Backdoor Android Malware")
    st.markdown("""
Upload file CSV yang berisi static features dari satu atau lebih Android APK. Model akan memprediksi apakah setiap APK termasuk kategori Backdoor atau Non-Backdoor, beserta nilai probabilitas dari Random Forest classifier.
""", unsafe_allow_html=True)

    # -- Template download
    st.subheader("Template Input CSV")
    col_dl, col_desc = st.columns([2, 3])
    with col_dl:
        st.download_button(
            label="Download sample_input.csv",
            data=load_sample_bytes(),
            file_name="sample_input.csv",
            mime="text/csv",
        )
    with col_desc:
        st.markdown("""
Template berisi 9503 kolom (kolom 0 hingga 9502), satu baris bernilai nol. Isi nilai kolom sesuai fitur APK Anda (0 atau 1), tambahkan baris baru untuk setiap APK tambahan, lalu upload di bawah.
""", unsafe_allow_html=True)
    st.divider()

    # -- Upload
    st.subheader("Upload File CSV")
    uploaded_file = st.file_uploader(
        label="Pilih file CSV (maks. {} baris)".format(MAX_ROWS),
        type=["csv"],
        help="File harus memiliki 9503 kolom bernama 0 sampai 9502.",
    )
    if uploaded_file is None:
        # Nothing uploaded yet: only the form is shown.
        return

    # -- Read
    try:
        df_input = pd.read_csv(uploaded_file, dtype=np.float32)
    except Exception as e:
        # Deliberately broad: this is the UI boundary for arbitrary user
        # files, and any parse/convert failure should become a message
        # instead of a traceback.
        st.error("Gagal membaca file CSV: {}".format(e))
        st.stop()

    # -- Validate columns
    df_input.columns = [str(c) for c in df_input.columns]
    present_cols = set(df_input.columns)
    missing_cols = [c for c in FEATURE_COLUMNS if c not in present_cols]
    if missing_cols:
        st.error(
            "File CSV tidak valid. Dibutuhkan 9503 kolom bernama 0 sampai 9502. "
            "Jumlah kolom yang hilang: {}. "
            "Gunakan template sample_input.csv di atas sebagai panduan.".format(len(missing_cols))
        )
        st.stop()

    # -- Row limit
    total_rows = len(df_input)
    if total_rows == 0:
        # A header-only CSV parses fine but leaves nothing to score; stop
        # here rather than handing an empty matrix to the model.
        st.error(
            "File CSV tidak berisi baris data. "
            "Tambahkan minimal satu baris fitur APK, lalu upload kembali."
        )
        st.stop()
    if total_rows > MAX_ROWS:
        st.warning(
            "File memiliki {} baris. Hanya {} baris pertama yang diproses "
            "(batas maksimal per upload).".format(total_rows, MAX_ROWS)
        )
        df_input = df_input.head(MAX_ROWS)
    n_samples = len(df_input)

    # -- Extract top-500 features
    config = load_config()
    top500_idx = config["top500_original_indices"]
    col_names = [str(i) for i in top500_idx]
    X = df_input[col_names].values.astype(np.float32)

    # -- Predict
    with st.spinner("Memproses {} sampel...".format(n_samples)):
        model = load_model()
        y_pred = model.predict(X)
        # Column 1 of predict_proba is taken as the Backdoor probability.
        y_proba = model.predict_proba(X)[:, 1]

    # -- Aggregate summary
    n_backdoor = int((y_pred == 1).sum())
    n_non_backdoor = int((y_pred == 0).sum())

    st.divider()
    st.subheader("Ringkasan Hasil")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""
Total Sampel
{}
""".format(n_samples), unsafe_allow_html=True)
    with col2:
        st.markdown("""
Backdoor
{}
""".format(n_backdoor), unsafe_allow_html=True)
    with col3:
        st.markdown("""
Non-Backdoor
{}
""".format(n_non_backdoor), unsafe_allow_html=True)

    # -- Per-row results
    st.divider()
    st.subheader("Hasil per Sampel")
    results_df = pd.DataFrame({
        "Sample": range(1, n_samples + 1),
        "Prediction": ["Backdoor" if p == 1 else "Non-Backdoor" for p in y_pred],
        "Probability (Backdoor)": [round(float(p), 4) for p in y_proba],
    })

    def _style_row(row):
        """Return one CSS declaration per cell: red tint for Backdoor rows, green otherwise."""
        if row["Prediction"] == "Backdoor":
            return ["background-color: #fce8e6; color: #7b1e1e"] * len(row)
        return ["background-color: #e8f5e9; color: #1b4d2a"] * len(row)

    styled = results_df.style.apply(_style_row, axis=1).format(
        {"Probability (Backdoor)": "{:.4f}"}
    )
    st.dataframe(styled, use_container_width=True, height=min(400, 40 + n_samples * 38))

    # -- Download per-row
    csv_out = results_df.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="Download hasil prediksi (.csv)",
        data=csv_out,
        file_name="hasil_prediksi.csv",
        mime="text/csv",
    )
    st.markdown("""
Kolom Probability (Backdoor) adalah probabilitas kelas positif (Backdoor) dari Random Forest classifier. Nilai mendekati 1.0 menunjukkan keyakinan tinggi bahwa APK adalah Backdoor.
""", unsafe_allow_html=True)


# =============================================================================
# PAGE 2: SHAP SIGNATURE
# =============================================================================
def _render_shap_page():
    """Render the three SHAP figures and the two signature tables."""
    st.title("SHAP-Based Backdoor Permission Signature")
    st.markdown("""
Halaman ini menampilkan hasil analisis SHAP (SHapley Additive exPlanations) terhadap model terbaik: Random Forest + SMOTE (RF-C2). SHAP digunakan untuk mengidentifikasi permission dan fitur yang paling berkontribusi terhadap deteksi Backdoor malware, serta mengekstrak behavioral permission signature yang membedakan Backdoor dari kategori malware lain.
""", unsafe_allow_html=True)

    # -- Figure 1: Bar Top-50
    st.subheader("SHAP Bar Plot: Top-50 Features")
    st.markdown("""
Mean absolute SHAP value untuk 50 fitur dengan kontribusi terbesar terhadap prediksi model pada test set. Semakin besar nilainya, semakin kuat pengaruh fitur tersebut terhadap keputusan classifier.
""", unsafe_allow_html=True)
    _show_figure(FIG_BAR_PATH)
    st.divider()

    # -- Figure 2: Beeswarm Top-20
    st.subheader("SHAP Beeswarm Plot: Top-20 Features")
    st.markdown("""
Distribusi SHAP value untuk 20 fitur teratas pada seluruh sampel test set. Warna menunjukkan nilai fitur (merah = tinggi, biru = rendah). Plot ini memperlihatkan arah dan besaran pengaruh setiap fitur.
""", unsafe_allow_html=True)
    _show_figure(FIG_BEE_PATH)
    st.divider()

    # -- Figure 3: Group bar
    st.subheader("Group-Level SHAP Contribution")
    st.markdown("""
Perbandingan mean absolute SHAP value antara kelompok fitur Permissions (indeks 0-3267) dan Non-Permissions (indeks 3268-9501), yang mencakup Services, Intent Actions, dan Intent Categories.
""", unsafe_allow_html=True)
    _show_figure(FIG_GROUP_PATH)
    st.divider()

    # -- Signature Table
    st.subheader("Backdoor Behavioral Signature (Top-50)")
    st.markdown("""
Fitur-fitur dengan mean SHAP positif pada sampel True Positive (APK yang secara benar terdeteksi sebagai Backdoor). Fitur-fitur ini membentuk behavioral signature Backdoor malware. Kolom feature_col_idx adalah indeks fitur dalam ruang fitur asli (0-9502).
""", unsafe_allow_html=True)
    df_sig = load_signature_table()
    st.dataframe(df_sig, use_container_width=True, height=400)
    st.divider()

    # -- Named Permission Signature
    st.subheader("Named Permission Signature")
    st.markdown("""
Permission-permission yang termasuk dalam Backdoor behavioral signature, beserta nama resminya dari Android permission list dan nilai mean SHAP pada True Positive samples.
""", unsafe_allow_html=True)
    df_named = load_named_signature_table()
    st.dataframe(df_named, use_container_width=True, height=350)


# =============================================================================
# PAGE 3: TENTANG PENELITIAN
# =============================================================================
def _render_about_page():
    """Render the static page describing the research project and its results."""
    st.title("Tentang Penelitian")

    # -- Header: logo + info
    col_logo, col_header = st.columns([1, 4])
    with col_logo:
        st.image(UDINUS_LOGO_URL, width=120)
    with col_header:
        st.markdown("""
Universitas Dian Nuswantoro
Semarang, Jawa Tengah, Indonesia
PENELITIAN DASAR PERGURUAN TINGGI 2025/2026 Semester Gasal
""", unsafe_allow_html=True)
    st.divider()

    # -- Research information
    st.subheader("Informasi Penelitian")
    st.markdown("""
Judul Backdoor Android Malware Detection under Extreme Class Imbalance using Ensemble Learning and SHAP-Based Permission Signature Analysis
Peneliti Utama Rama Aria Megantara
Co-Peneliti Dewi Pergiwati
Institusi Universitas Dian Nuswantoro
Jenis Hibah Penelitian Dasar Perguruan Tinggi (DIKTI)
Periode 2025/2026 Semester Gasal
Publikasi Matrik: Jurnal Manajemen, Teknik Informatika, dan Rekayasa Komputer
Dataset CCCS-CIC-AndMal-2020 (University of New Brunswick)
""".format(journal=JOURNAL_URL, dataset=DATASET_URL), unsafe_allow_html=True)
    st.divider()

    # -- Abstract
    st.subheader("Abstract")
    st.markdown("""
Backdoor malware pada platform Android merupakan ancaman serius karena memungkinkan akses tidak sah secara tersembunyi ke dalam sistem perangkat korban. Deteksi Backdoor menghadapi tantangan berupa extreme class imbalance yang parah pada dataset dunia nyata. Penelitian ini mengusulkan pipeline deteksi berbasis static features menggunakan dataset CCCS-CIC-AndMal-2020, dengan rasio ketidakseimbangan kelas 1:221.5 antara kelas Backdoor dan kelas lainnya. Pipeline dua tahap feature selection diterapkan: Variance Threshold mereduksi 9.503 fitur menjadi 1.433 fitur, diikuti seleksi berbasis Random Forest feature importance untuk memilih 500 fitur terbaik. Lima classifier dievaluasi: Decision Tree, Logistic Regression, Random Forest, XGBoost, dan LightGBM, masing-masing di bawah tiga kondisi penanganan imbalance: tanpa penanganan (C1), SMOTE (C2), dan SMOTE dengan cost-sensitive learning (C3), dengan 10 random seeds untuk validasi statistik via uji Wilcoxon Signed-Rank. Model terbaik berdasarkan composite ranking adalah Random Forest dengan kondisi SMOTE (RF-C2), yang mencapai F1-Score=0.9046, AUC-ROC=0.9917, dan G-Mean=0.9426. Analisis SHAP (SHapley Additive exPlanations) digunakan untuk mengekstraksi behavioral permission signature Backdoor malware, memberikan interpretabilitas terhadap keputusan model.
""", unsafe_allow_html=True)
    st.divider()

    # -- Model performance
    st.subheader("Performa Model Terbaik: Random Forest + SMOTE (RF-C2)")
    st.markdown("""
Rata-rata dan standar deviasi dari 10 random seeds pada test set. Evaluasi menggunakan metrik khusus untuk kelas minority (Backdoor).
""", unsafe_allow_html=True)
    # (metric label, mean, standard deviation), already formatted for display.
    perf_rows = [
        ("F1-Score (Backdoor class)", "0.9046", "0.0020"),
        ("AUC-ROC", "0.9917", "0.0010"),
        ("G-Mean", "0.9426", "0.0030"),
        ("Precision", "0.9133", "0.0035"),
        ("Recall", "0.8961", "0.0061"),
        ("MCC", "0.8952", "0.0021"),
        ("Training Time", "2.18 s", "0.04 s"),
        ("Inference Time per Sample", "0.0174 ms", "0.0004 ms"),
    ]
    rows_html = "".join(
        """ {metric} {mean} +/- {std} """.format(metric=metric, mean=mean_val, std=std_val)
        for metric, mean_val, std_val in perf_rows
    )
    st.markdown(""" {}
Metric Mean +/- Std
""".format(rows_html), unsafe_allow_html=True)
    st.divider()

    # -- Methodology overview
    st.subheader("Ringkasan Metodologi")
    st.markdown("""
DatasetCCCS-CIC-AndMal-2020 — 342.169 sampel, 18 kategori
TaskBinary classification: Backdoor (1) vs. semua kategori lain (0)
FiturStatic features — 9.503 kolom (permissions, services, intents, categories)
Feature SelectionVariance Threshold + Random Forest Importance (top-500)
Imbalance HandlingC1: None | C2: SMOTE | C3: SMOTE + cost-sensitive
ClassifiersDecision Tree, Logistic Regression, Random Forest, XGBoost, LightGBM
Validasi10 random seeds, Wilcoxon Signed-Rank test, effect size r
ExplainabilitySHAP TreeExplainer — behavioral permission signature
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("""
Universitas Dian Nuswantoro  |  Penelitian Dasar Perguruan Tinggi 2025/2026  |  Rama Aria Megantara & Dewi Pergiwati
""", unsafe_allow_html=True)


# =============================================================================
# PAGE DISPATCH
# =============================================================================
if page == "Prediksi":
    _render_prediction_page()
elif page == "SHAP Signature":
    _render_shap_page()
elif page == "Tentang Penelitian":
    _render_about_page()