"""
Backdoor Android Malware Detection
Streamlit deployment application.
Peneliti Utama : RAMA ARIA MEGANTARA
Co-Peneliti : DEWI PERGIWATI
Institusi : UNIVERSITAS DIAN NUSWANTORO
Hibah : PENELITIAN DASAR PERGURUAN TINGGI 2025/2026 Semester Gasal
Dataset : CCCS-CIC-AndMal-2020 (University of New Brunswick)
"""
import os
import json
import joblib
import pathlib
import numpy as np
import pandas as pd
import streamlit as st
# =============================================================================
# PAGE CONFIG -- must be first Streamlit call
# =============================================================================
# Page-level options, collected once and handed to Streamlit in one call.
# This call is kept ahead of every other st.* call in the script.
_PAGE_SETTINGS = {
    "page_title": "Backdoor Malware Detection",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# =============================================================================
# CONSTANTS
# Paths are anchored to this script's directory so they resolve correctly
# regardless of the working directory set by the deployment environment.
# =============================================================================
# Directory that contains this script; every asset path hangs off it.
BASE_DIR = pathlib.Path(__file__).parent
MODEL_DIR = BASE_DIR.joinpath("model")
FIGURES_DIR = BASE_DIR.joinpath("figures")

# Model artifacts and the tables stored next to them.
MODEL_PATH = MODEL_DIR.joinpath("best_model.joblib")
CONFIG_PATH = MODEL_DIR.joinpath("model_config.json")
PERM_PATH = MODEL_DIR.joinpath("top500_permissions.json")
SIG_PATH = MODEL_DIR.joinpath("table_backdoor_signature.csv")
NAMED_SIG_PATH = MODEL_DIR.joinpath("table_permission_signature_named.csv")

# Pre-rendered SHAP figures.
FIG_BAR_PATH = FIGURES_DIR.joinpath("fig_shap_bar_top50.png")
FIG_BEE_PATH = FIGURES_DIR.joinpath("fig_shap_beeswarm_top20.png")
FIG_GROUP_PATH = FIGURES_DIR.joinpath("fig_shap_group_bar.png")

# External links.
UDINUS_LOGO_URL = "https://dinus.ac.id/wp-content/uploads/2022/12/Logo-Udinus-Official-02-300x300-1.png"
JOURNAL_URL = "https://journal.universitasbumigora.ac.id/matrik/article/view/6198"
DATASET_URL = "https://www.unb.ca/cic/datasets/andmal2020.html"

# Upper bound on the number of CSV rows scored per upload.
MAX_ROWS = 100
# =============================================================================
# GLOBAL CSS
# =============================================================================
# Application-wide markup injected once at the top of the page.
# NOTE(review): the payload is a single newline here — confirm whether the
# stylesheet content was lost before this file reached review.
_GLOBAL_CSS = """
"""
st.markdown(_GLOBAL_CSS, unsafe_allow_html=True)
# =============================================================================
# CACHED RESOURCES
# =============================================================================
@st.cache_resource
def load_model():
    """Load the serialized model stored at ``MODEL_PATH``.

    Returns:
        The object produced by ``joblib.load``; callers in this file use
        its ``predict`` and ``predict_proba`` methods.
    """
    # joblib.load unpickles the file, so MODEL_PATH must only ever point at
    # an artifact shipped with the application.
    model = joblib.load(MODEL_PATH)
    return model
@st.cache_data
def load_config():
    """Parse the JSON document at ``CONFIG_PATH``.

    Returns:
        The decoded JSON value. The prediction page reads its
        ``"top500_original_indices"`` entry.
    """
    raw_text = CONFIG_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
@st.cache_data
def load_sample_bytes():
    """Build the downloadable CSV template in memory.

    The template has 9503 columns whose headers are the strings ``"0"``
    through ``"9502"`` and a single data row filled with zeros.

    Returns:
        bytes: The CSV text, written without an index column and encoded
        as UTF-8.
    """
    n_features = 9503
    header = list(map(str, range(n_features)))
    zero_row = [0] * n_features
    template = pd.DataFrame([zero_row], columns=header)
    csv_text = template.to_csv(index=False)
    return csv_text.encode("utf-8")
@st.cache_data
def load_signature_table():
    """Read the CSV at ``SIG_PATH`` into a DataFrame.

    The first CSV column is used as the DataFrame index.
    """
    signature = pd.read_csv(SIG_PATH, index_col=0)
    return signature
@st.cache_data
def load_named_signature_table():
    """Read the CSV at ``NAMED_SIG_PATH`` into a DataFrame.

    The first CSV column is used as the DataFrame index.
    """
    named_signature = pd.read_csv(NAMED_SIG_PATH, index_col=0)
    return named_signature
# =============================================================================
# SIDEBAR NAVIGATION
# =============================================================================
# Branding text at the top of the sidebar.
st.sidebar.markdown("""
Penelitian Hibah DIKTI
Backdoor Android
Malware Detection
Universitas Dian Nuswantoro
""", unsafe_allow_html=True)
st.sidebar.divider()

# Heading above the page selector. The literal used to be a plain
# double-quoted string broken across two physical lines, which Python rejects
# as an unterminated string; the line break is now an explicit "\n" so the
# rendered text is unchanged.
st.sidebar.markdown("Navigasi\n", unsafe_allow_html=True)

# `page` drives the page dispatch further down the script. The widget label is
# hidden (collapsed) because the heading above already introduces it.
page = st.sidebar.radio(
    label="Pilih halaman",
    options=["Prediksi", "SHAP Signature", "Tentang Penelitian"],
    label_visibility="collapsed",
)
st.sidebar.divider()

# Static summary of the model, dataset, feature set and upload limit.
st.sidebar.markdown("""
Model
Random Forest + SMOTE
Dataset
CCCS-CIC-AndMal-2020
Fitur
9503 static features
Top-500 selected
Max input
100 baris per upload
""", unsafe_allow_html=True)
# =============================================================================
# PAGE 1: PREDIKSI
# =============================================================================
def _render_prediction_page():
    """Render the "Prediksi" page: template download, CSV upload, scoring.

    The uploaded CSV is read as float32, checked for the 9503 expected
    column names, truncated to ``MAX_ROWS`` rows, reduced to the columns
    listed under ``"top500_original_indices"`` in the model config and passed
    to the model. Any validation failure shows a message and halts the
    script run via ``st.stop()``.
    """
    st.title("Prediksi Backdoor Android Malware")
    st.markdown("""
Upload file CSV yang berisi static features dari satu atau lebih Android APK.
Model akan memprediksi apakah setiap APK termasuk kategori
Backdoor atau Non-Backdoor,
beserta nilai probabilitas dari Random Forest classifier.
""", unsafe_allow_html=True)

    # -- Template download
    st.subheader("Template Input CSV")
    col_dl, col_desc = st.columns([2, 3])
    with col_dl:
        st.download_button(
            label="Download sample_input.csv",
            data=load_sample_bytes(),
            file_name="sample_input.csv",
            mime="text/csv",
        )
    with col_desc:
        st.markdown("""
Template berisi 9503 kolom
(kolom 0 hingga
9502), satu baris bernilai nol.
Isi nilai kolom sesuai fitur APK Anda (0 atau 1),
tambahkan baris baru untuk setiap APK tambahan, lalu upload di bawah.
""", unsafe_allow_html=True)
    st.divider()

    # -- Upload
    st.subheader("Upload File CSV")
    uploaded_file = st.file_uploader(
        label="Pilih file CSV (maks. {} baris)".format(MAX_ROWS),
        type=["csv"],
        help="File harus memiliki 9503 kolom bernama 0 sampai 9502.",
    )
    if uploaded_file is None:
        # Nothing uploaded yet: only the instructions above are shown.
        return

    # -- Read
    # Any parse failure (malformed CSV, non-numeric cell, empty file) is
    # reported to the user instead of surfacing as a traceback.
    try:
        df_input = pd.read_csv(uploaded_file, dtype=np.float32)
    except Exception as e:
        st.error("Gagal membaca file CSV: {}".format(e))
        st.stop()

    # -- Validate columns
    expected_cols = [str(i) for i in range(9503)]
    df_input.columns = [str(c) for c in df_input.columns]
    missing_cols = [c for c in expected_cols if c not in df_input.columns]
    if missing_cols:
        st.error(
            "File CSV tidak valid. Dibutuhkan 9503 kolom bernama 0 sampai 9502. "
            "Jumlah kolom yang hilang: {}. "
            "Gunakan template sample_input.csv di atas sebagai panduan.".format(len(missing_cols))
        )
        st.stop()

    # -- Empty file
    # A header-only CSV passes the column check but yields a matrix with zero
    # rows, which the model would reject with an exception. Stop early with
    # a readable message instead.
    total_rows = len(df_input)
    if total_rows == 0:
        st.error(
            "File CSV tidak berisi baris data. "
            "Tambahkan minimal satu baris fitur APK lalu upload ulang."
        )
        st.stop()

    # -- Row limit
    if total_rows > MAX_ROWS:
        st.warning(
            "File memiliki {} baris. Hanya {} baris pertama yang diproses "
            "(batas maksimal per upload).".format(total_rows, MAX_ROWS)
        )
        df_input = df_input.head(MAX_ROWS)
    n_samples = len(df_input)

    # -- Extract top-500 features
    # The config lists indices into the original 9503-column space; they are
    # turned into column names to select the model's input columns.
    config = load_config()
    top500_idx = config["top500_original_indices"]
    col_names = [str(i) for i in top500_idx]
    X = df_input[col_names].values.astype(np.float32)

    # -- Predict
    with st.spinner("Memproses {} sampel...".format(n_samples)):
        model = load_model()
        y_pred = model.predict(X)
        # NOTE(review): column 1 is taken as the Backdoor class; this assumes
        # the model's class order is [0, 1] — confirm against model.classes_.
        y_proba = model.predict_proba(X)[:, 1]

    # -- Aggregate summary
    n_backdoor = int((y_pred == 1).sum())
    n_non_backdoor = int((y_pred == 0).sum())
    st.divider()
    st.subheader("Ringkasan Hasil")
    # NOTE(review): the three card templates below contain no "{}"
    # placeholder, so the counts passed to .format() are not rendered —
    # confirm whether the card markup is missing from this file.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""
""".format(n_samples), unsafe_allow_html=True)
    with col2:
        st.markdown("""
""".format(n_backdoor), unsafe_allow_html=True)
    with col3:
        st.markdown("""
""".format(n_non_backdoor), unsafe_allow_html=True)

    # -- Per-row results
    st.divider()
    st.subheader("Hasil per Sampel")
    results_df = pd.DataFrame({
        "Sample": range(1, n_samples + 1),
        "Prediction": ["Backdoor" if p == 1 else "Non-Backdoor" for p in y_pred],
        "Probability (Backdoor)": [round(float(p), 4) for p in y_proba],
    })

    def _style_row(row):
        """Return one CSS string per cell: red tint for Backdoor, green otherwise."""
        if row["Prediction"] == "Backdoor":
            return ["background-color: #fce8e6; color: #7b1e1e"] * len(row)
        return ["background-color: #e8f5e9; color: #1b4d2a"] * len(row)

    styled = results_df.style.apply(_style_row, axis=1).format(
        {"Probability (Backdoor)": "{:.4f}"}
    )
    # Table height grows with the row count and is capped at 400 px.
    st.dataframe(styled, use_container_width=True, height=min(400, 40 + n_samples * 38))

    # -- Download per-row
    csv_out = results_df.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="Download hasil prediksi (.csv)",
        data=csv_out,
        file_name="hasil_prediksi.csv",
        mime="text/csv",
    )
    st.markdown("""
Kolom Probability (Backdoor) adalah
probabilitas kelas positif (Backdoor) dari Random Forest classifier.
Nilai mendekati 1.0 menunjukkan keyakinan tinggi bahwa APK adalah Backdoor.
""", unsafe_allow_html=True)


def _show_figure(heading, caption, fig_path):
    """Render one figure section: heading, caption, image (or warning), divider."""
    st.subheader(heading)
    st.markdown(caption, unsafe_allow_html=True)
    if os.path.exists(fig_path):
        st.image(fig_path, use_container_width=True)
    else:
        # A missing image is reported in place; the rest of the page still renders.
        st.warning("File gambar tidak ditemukan: {}".format(fig_path))
    st.divider()


def _render_shap_page():
    """Render the "SHAP Signature" page: three figures and two signature tables."""
    st.title("SHAP-Based Backdoor Permission Signature")
    st.markdown("""
Halaman ini menampilkan hasil analisis SHAP (SHapley Additive exPlanations)
terhadap model terbaik: Random Forest + SMOTE (RF-C2).
SHAP digunakan untuk mengidentifikasi permission dan fitur yang paling berkontribusi
terhadap deteksi Backdoor malware, serta mengekstrak
behavioral permission signature yang membedakan Backdoor dari kategori malware lain.
""", unsafe_allow_html=True)

    # -- Figure 1: Bar Top-50
    _show_figure("SHAP Bar Plot: Top-50 Features", """
Mean absolute SHAP value untuk 50 fitur dengan kontribusi terbesar
terhadap prediksi model pada test set.
Semakin besar nilainya, semakin kuat pengaruh fitur tersebut terhadap keputusan classifier.
""", FIG_BAR_PATH)

    # -- Figure 2: Beeswarm Top-20
    _show_figure("SHAP Beeswarm Plot: Top-20 Features", """
Distribusi SHAP value untuk 20 fitur teratas pada seluruh sampel test set.
Warna menunjukkan nilai fitur (merah = tinggi, biru = rendah).
Plot ini memperlihatkan arah dan besaran pengaruh setiap fitur.
""", FIG_BEE_PATH)

    # -- Figure 3: Group bar
    _show_figure("Group-Level SHAP Contribution", """
Perbandingan mean absolute SHAP value antara kelompok fitur
Permissions (indeks 0-3267) dan
Non-Permissions (indeks 3268-9501),
yang mencakup Services, Intent Actions, dan Intent Categories.
""", FIG_GROUP_PATH)

    # -- Signature Table
    st.subheader("Backdoor Behavioral Signature (Top-50)")
    st.markdown("""
Fitur-fitur dengan mean SHAP positif pada sampel True Positive
(APK yang secara benar terdeteksi sebagai Backdoor).
Fitur-fitur ini membentuk behavioral signature Backdoor malware.
Kolom feature_col_idx adalah indeks fitur
dalam ruang fitur asli (0-9502).
""", unsafe_allow_html=True)
    df_sig = load_signature_table()
    st.dataframe(df_sig, use_container_width=True, height=400)
    st.divider()

    # -- Named Permission Signature
    st.subheader("Named Permission Signature")
    st.markdown("""
Permission-permission yang termasuk dalam Backdoor behavioral signature,
beserta nama resminya dari Android permission list
dan nilai mean SHAP pada True Positive samples.
""", unsafe_allow_html=True)
    df_named = load_named_signature_table()
    st.dataframe(df_named, use_container_width=True, height=350)


def _render_about_page():
    """Render the "Tentang Penelitian" page: project info, abstract, metrics, method."""
    st.title("Tentang Penelitian")

    # -- Header: logo + info
    col_logo, col_header = st.columns([1, 4])
    with col_logo:
        st.image(UDINUS_LOGO_URL, width=120)
    with col_header:
        st.markdown("""
Universitas Dian Nuswantoro
Semarang, Jawa Tengah, Indonesia
PENELITIAN DASAR PERGURUAN TINGGI 2025/2026 Semester Gasal
""", unsafe_allow_html=True)
    st.divider()

    # -- Research information
    st.subheader("Informasi Penelitian")
    # NOTE(review): the template has no {journal}/{dataset} placeholders, so
    # neither URL is rendered — confirm whether the markup is missing.
    st.markdown("""
""".format(journal=JOURNAL_URL, dataset=DATASET_URL), unsafe_allow_html=True)
    st.divider()

    # -- Abstract
    st.subheader("Abstract")
    st.markdown("""
Backdoor malware pada platform Android merupakan ancaman serius karena memungkinkan
akses tidak sah secara tersembunyi ke dalam sistem perangkat korban.
Deteksi Backdoor menghadapi tantangan berupa extreme class imbalance
yang parah pada dataset dunia nyata.
Penelitian ini mengusulkan pipeline deteksi berbasis static features
menggunakan dataset CCCS-CIC-AndMal-2020,
dengan rasio ketidakseimbangan kelas 1:221.5 antara kelas Backdoor dan kelas lainnya.
Pipeline dua tahap feature selection diterapkan:
Variance Threshold mereduksi 9.503 fitur menjadi 1.433 fitur,
diikuti seleksi berbasis Random Forest feature importance
untuk memilih 500 fitur terbaik.
Lima classifier dievaluasi: Decision Tree, Logistic Regression, Random Forest,
XGBoost, dan LightGBM, masing-masing di bawah tiga kondisi penanganan imbalance:
tanpa penanganan (C1), SMOTE (C2), dan SMOTE dengan cost-sensitive learning (C3),
dengan 10 random seeds untuk validasi statistik via uji Wilcoxon Signed-Rank.
Model terbaik berdasarkan composite ranking adalah
Random Forest dengan kondisi SMOTE (RF-C2),
yang mencapai F1-Score=0.9046, AUC-ROC=0.9917, dan G-Mean=0.9426.
Analisis SHAP (SHapley Additive exPlanations) digunakan untuk mengekstraksi
behavioral permission signature Backdoor malware,
memberikan interpretabilitas terhadap keputusan model.
""", unsafe_allow_html=True)
    st.divider()

    # -- Model performance
    st.subheader("Performa Model Terbaik: Random Forest + SMOTE (RF-C2)")
    st.markdown("""
Rata-rata dan standar deviasi dari 10 random seeds pada test set.
Evaluasi menggunakan metrik khusus untuk kelas minority (Backdoor).
""", unsafe_allow_html=True)
    # Each entry is (metric label, mean, standard deviation), all as display strings.
    perf_rows = [
        ("F1-Score (Backdoor class)", "0.9046", "0.0020"),
        ("AUC-ROC", "0.9917", "0.0010"),
        ("G-Mean", "0.9426", "0.0030"),
        ("Precision", "0.9133", "0.0035"),
        ("Recall", "0.8961", "0.0061"),
        ("MCC", "0.8952", "0.0021"),
        ("Training Time", "2.18 s", "0.04 s"),
        ("Inference Time per Sample", "0.0174 ms", "0.0004 ms"),
    ]
    # One formatted fragment per metric, joined in a single pass.
    rows_html = "".join(
        """
| {metric} |
{mean} +/- {std} |
""".format(metric=metric, mean=mean_val, std=std_val)
        for metric, mean_val, std_val in perf_rows
    )
    # NOTE(review): the wrapper template has no "{}" placeholder, so rows_html
    # is not rendered — confirm whether the table markup is missing.
    st.markdown("""
""".format(rows_html), unsafe_allow_html=True)
    st.divider()

    # -- Methodology overview
    st.subheader("Ringkasan Metodologi")
    st.markdown("""
| Dataset | CCCS-CIC-AndMal-2020 — 342.169 sampel, 18 kategori |
| Task | Binary classification: Backdoor (1) vs. semua kategori lain (0) |
| Fitur | Static features — 9.503 kolom (permissions, services, intents, categories) |
| Feature Selection | Variance Threshold + Random Forest Importance (top-500) |
| Imbalance Handling | C1: None | C2: SMOTE | C3: SMOTE + cost-sensitive |
| Classifiers | Decision Tree, Logistic Regression, Random Forest, XGBoost, LightGBM |
| Validasi | 10 random seeds, Wilcoxon Signed-Rank test, effect size r |
| Explainability | SHAP TreeExplainer — behavioral permission signature |
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("""
Universitas Dian Nuswantoro |
Penelitian Dasar Perguruan Tinggi 2025/2026 |
Rama Aria Megantara & Dewi Pergiwati
""", unsafe_allow_html=True)


# =============================================================================
# PAGE DISPATCH
# `page` is the sidebar radio selection; exactly one renderer runs per rerun.
# =============================================================================
if page == "Prediksi":
    _render_prediction_page()
elif page == "SHAP Signature":
    _render_shap_page()
elif page == "Tentang Penelitian":
    _render_about_page()