# --- Hugging Face Spaces page residue (not code), kept as a comment so the
# --- file parses as Python:
#   Darendra's picture
#   Update app.py
#   dafa625 verified
#   raw | history | blame | 11.2 kB
import os
import torch
import pandas as pd
import numpy as np
import gradio as gr
import zipfile
import shutil
from pathlib import Path
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
# =========================================================
# 1. CONFIGURATION & SETUP
# =========================================================
# The eight emotion labels predicted by the classifier (multi-label: each
# label is an independent 0/1 target, scored by its own sigmoid).
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
# Directory for models trained locally through the UI's manual-training tab.
DIR_TRAINED = Path("saved_models/trained_local")
# Directory for models uploaded as a ZIP (e.g. trained on Colab).
DIR_UPLOADED = Path("saved_models/uploaded_colab")
DIR_TRAINED.mkdir(parents=True, exist_ok=True)
DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
# Plain-text pointer file holding the path of the currently active model dir.
ACTIVE_MODEL_POINTER = "active_model_path.txt"
# =========================================================
# 2. MODEL ARCHITECTURE
# =========================================================
class ModelEmosi(nn.Module):
    """Multi-label emotion classifier: transformer encoder + linear head."""

    def __init__(self, base_model_name, num_labels=8):
        super().__init__()
        # Load the config explicitly so the same class works whether
        # base_model_name is a hub model id or a local directory.
        self.config = AutoConfig.from_pretrained(base_model_name)
        self.base = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # Prefer the pooled representation; fall back to the first ([CLS])
        # token for encoders without a pooler (e.g. DistilBERT).
        if getattr(encoded, "pooler_output", None) is not None:
            pooled = encoded.pooler_output
        else:
            pooled = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(pooled))
# =========================================================
# 3. HELPER FUNCTIONS
# =========================================================
def clean_data(df):
    """Normalize a raw dataframe: float label columns plus tidied text.

    Mutates and returns *df*: every expected emotion label column exists
    and is numeric float (missing/unparseable values become 0.0), and the
    "text" column, when present, has newlines flattened and edges trimmed.
    """
    for label in LIST_LABEL:
        if label not in df.columns:
            df[label] = 0
        df[label] = pd.to_numeric(df[label], errors='coerce').fillna(0).astype(float)
    if "text" in df.columns:
        df["text"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
    return df
def get_active_model_path():
    """Return the stored active-model directory, or None if unset/stale."""
    if not os.path.exists(ACTIVE_MODEL_POINTER):
        return None
    with open(ACTIVE_MODEL_POINTER, "r") as f:
        candidate = f.read().strip()
    # A pointer to a directory that was deleted since counts as "no model".
    return candidate if os.path.exists(candidate) else None
def set_active_model_path(path):
    """Persist *path* into the pointer file, making it the active model."""
    Path(ACTIVE_MODEL_POINTER).write_text(str(path))
# =========================================================
# 4. TRAINING LOGIC (CPU - SMALL DATASETS ONLY)
# =========================================================
def run_training_generator(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune a multilingual BERT on an uploaded CSV, streaming progress.

    Generator used as a Gradio event handler: yields (log_text, status)
    tuples so the UI textboxes update incrementally during training.

    Args:
        file_obj: uploaded CSV (Gradio File; path taken from .name).
        sep: CSV column separator (e.g. ";").
        epochs, batch_size, lr: hyper-parameters; cast to int/float inside
            because gr.Number may deliver floats/strings.
        progress: Gradio progress bar, updated every 5 steps.
    """
    yield "⏳ Membaca dataset...", None
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
    except Exception as e:
        yield f"❌ Error: {str(e)}", None
        return
    # Training deliberately runs on CPU (Space has no GPU) — small data only.
    device = "cpu"
    # Default base model for manual CPU training.
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_fn(texts):
        # Fixed-length padding keeps every batch the same tensor shape.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    encodings = tokenize_fn(df["text"].tolist())
    labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    train_loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)
    model = ModelEmosi(model_name)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=float(lr))
    # Multi-label objective: one independent sigmoid per label.
    loss_fn = nn.BCEWithLogitsLoss()
    log_text = f"πŸš€ Mulai Training CPU...\nData: {len(df)} baris\n"
    yield log_text, None
    model.train()
    for ep in range(int(epochs)):
        total_loss = 0
        for step, batch in enumerate(train_loader):
            b_ids, b_mask, b_lbl = batch
            optimizer.zero_grad()
            out = model(b_ids, b_mask)
            loss = loss_fn(out, b_lbl)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # Update the progress bar every 5 steps (fraction of all steps).
            if step % 5 == 0:
                progress((ep * len(train_loader) + step) / (int(epochs) * len(train_loader)))
        avg_loss = total_loss / len(train_loader)
        log_text += f"βœ… Epoch {ep+1} | Loss: {avg_loss:.4f}\n"
        yield log_text, None
    # Persist: encoder + tokenizer in HF format; classifier head separately,
    # since ModelEmosi's head is not part of the base model's state dict.
    model.base.save_pretrained(DIR_TRAINED)
    tokenizer.save_pretrained(DIR_TRAINED)
    torch.save(model.classifier.state_dict(), DIR_TRAINED / "classifier_head.pt")
    set_active_model_path(DIR_TRAINED)
    yield log_text + "\nπŸŽ‰ Selesai & Disimpan!", "Model Lokal (Baru Dilatih)"
# =========================================================
# 5. UPLOAD LOGIC (FROM COLAB)
# =========================================================
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP into DIR_UPLOADED and activate it.

    Returns a (log_message, active_model_status) tuple for the UI;
    the status is None when the upload fails.
    """
    if file_obj is None:
        return "❌ Tidak ada file.", None
    try:
        # Start from a clean directory so stale files never mix in.
        if DIR_UPLOADED.exists():
            shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir()
        # NOTE(review): extractall on a user-supplied ZIP is vulnerable to
        # path traversal ("zip slip") — consider validating member names.
        with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
            zip_ref.extractall(DIR_UPLOADED)
        # If the archive wrapped everything in a single folder, flatten it.
        contents = list(DIR_UPLOADED.iterdir())
        if len(contents) == 1 and contents[0].is_dir():
            wrapper = contents[0]
            for entry in wrapper.iterdir():
                shutil.move(str(entry), str(DIR_UPLOADED))
            wrapper.rmdir()
        set_active_model_path(DIR_UPLOADED)
        return f"βœ… Model berhasil dimuat dari ZIP!\nLokasi: {DIR_UPLOADED}", "Model Upload (Dari Colab)"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
# =========================================================
# 6. PREDICTION LOGIC
# =========================================================
def load_model_inference():
    """Build (model, tokenizer) from the active model directory, in eval mode.

    Raises:
        ValueError: when no active model has been trained or uploaded yet.
    """
    active = get_active_model_path()
    if not active:
        raise ValueError("Belum ada model aktif.")
    active = Path(active)
    tokenizer = AutoTokenizer.from_pretrained(active)
    model = ModelEmosi(active)
    # Restore the classifier head when one was saved alongside the encoder;
    # otherwise the freshly initialised head is used as-is.
    head_file = active / "classifier_head.pt"
    if head_file.exists():
        model.classifier.load_state_dict(torch.load(head_file, map_location="cpu"))
    model.eval()
    return model, tokenizer
def predict_text(text):
    """Predict per-emotion probabilities for one sentence.

    Returns a {label: probability} dict for gr.Label, None for empty input,
    or {"Error": message} when inference fails.
    """
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        with torch.no_grad():
            logits = model(inputs["input_ids"], inputs["attention_mask"])
        # Independent sigmoids: multi-label probabilities, not a softmax.
        scores = torch.sigmoid(logits)[0].tolist()
        return {label: float(score) for label, score in zip(LIST_LABEL, scores)}
    except Exception as e:
        return {"Error": str(e)}
def predict_csv(file_obj, sep):
    """Run batch prediction over a CSV and summarise the emotion scores.

    Args:
        file_obj: uploaded CSV (Gradio File; path taken from .name).
        sep: CSV column separator.

    Returns:
        Dict with the row count, top-3 dominant emotions (rounded) and the
        per-label mean scores, or {"Error": message} when anything fails.
    """
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        # FIX: fail early with clear messages. Previously a missing "text"
        # column surfaced as a cryptic KeyError and an empty file crashed
        # with ZeroDivisionError in the averaging step below.
        if "text" not in df.columns:
            return {"Error": "Kolom 'text' tidak ditemukan."}
        if df.empty:
            return {"Error": "Dataset kosong."}
        model, tokenizer = load_model_inference()
        results = []
        for txt in df["text"]:
            inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
            with torch.no_grad():
                out = model(inputs["input_ids"], inputs["attention_mask"])
            probs = torch.sigmoid(out).numpy()[0]
            results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})
        # Aggregate: mean score per label, then the three strongest emotions.
        avg = {l: sum(r[l] for r in results) / len(results) for l in LIST_LABEL}
        top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
        return {
            "Total Data": len(results),
            "Top 3 Emosi Dominan": {k: round(v, 4) for k, v in top3},
            "Rata-rata Skor": avg
        }
    except Exception as e:
        return {"Error": str(e)}
# =========================================================
# 7. USER INTERFACE (GRADIO UI)
# =========================================================
with gr.Blocks(title="Emotion AI Manager") as app:
    # FIX: "#AI..." lacked the space after '#', so Markdown rendered it as
    # literal text instead of a level-1 heading.
    gr.Markdown("# AI Emotion Classifier System")
    # Global status bar: shows which model (trained/uploaded) is active.
    lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model yang dipilih.", interactive=False)
    # MAIN TAB 1: SETUP & TRAINING
    with gr.Tab("βš™οΈ Pelatihan & Model"):
        with gr.Tabs():
            # Sub-tab 1.1: activate a pretrained model from an uploaded ZIP.
            with gr.Tab("πŸ“‚ Upload Pretrained Model"):
                gr.Markdown("Sudah punya model terlatih? gunakan model hasil training model")
                in_zip = gr.File(label="Upload File .zip Model", file_types=[".zip"])
                btn_upload = gr.Button("Ekstrak & Aktifkan Model", variant="primary")
                out_log_upload = gr.Textbox(label="Log Sistem")
                btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
            # Sub-tab 1.2: manual CPU training on a small CSV.
            with gr.Tab("πŸ‹οΈβ€β™€οΈ Latihan Manual"):
                gr.Markdown("Belum punya model? latih file csv [text;label emosi (1/0)]")
                with gr.Row():
                    in_csv = gr.File(label="Dataset CSV")
                    in_sep = gr.Textbox(label="Separator", value=";")
                with gr.Row():
                    in_ep = gr.Number(label="Epoch", value=1)
                    in_bs = gr.Number(label="Batch", value=4)
                    in_lr = gr.Number(label="LR", value=2e-5)
                btn_train = gr.Button("Mulai Latihan")
                out_log_train = gr.Textbox(label="Log Training", lines=6)
                # run_training_generator is a generator: the log streams in.
                btn_train.click(run_training_generator, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_active_model])
    # MAIN TAB 2: TESTING
    with gr.Tab("πŸ§ͺ Testing"):
        with gr.Tabs():
            # Sub-tab 2.1: single-sentence prediction.
            with gr.Tab("πŸ“ Uji Tunggal (Teks)"):
                in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...")
                btn_pred_txt = gr.Button("Prediksi Emosi")
                out_lbl = gr.Label(label="Confidence Score")
                btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
            # Sub-tab 2.2: batch prediction over a CSV.
            with gr.Tab("πŸ“Š Uji Batch (CSV)"):
                in_csv_test = gr.File(label="Upload CSV Test")
                in_sep_test = gr.Textbox(label="Separator", value=";")
                btn_pred_csv = gr.Button("Analisis Batch")
                out_json = gr.JSON(label="Hasil Analisis")
                btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)
# Queue is required so the training generator can stream updates; then launch.
app.queue().launch()