# app.py — Emotion AI Manager (Hugging Face Space)
# File-viewer metadata: "Update app.py", commit 74dd21d (verified), 10.8 kB.
import os
import torch
import pandas as pd
import numpy as np
import gradio as gr
import zipfile
import shutil
from pathlib import Path
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
# =========================================================
# 1. CONFIGURATION & SETUP
# =========================================================
# Emotion labels predicted by the classifier; also the expected
# label-column names in training/evaluation CSVs.
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
# Directory for models fine-tuned locally on CPU by this app.
DIR_TRAINED = Path("saved_models/trained_local")
# Directory where uploaded model ZIPs (e.g. trained on Colab GPU) are extracted.
DIR_UPLOADED = Path("saved_models/uploaded_colab")
DIR_TRAINED.mkdir(parents=True, exist_ok=True)
DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
# Small pointer file holding the path of the currently active model directory.
ACTIVE_MODEL_POINTER = "active_model_path.txt"
# =========================================================
# 2. ARSITEKTUR MODEL
# =========================================================
class ModelEmosi(nn.Module):
    """Multi-label emotion classifier: pretrained encoder + linear head.

    A transformer backbone produces a pooled sentence representation, which
    is passed through dropout and a single linear layer to yield one logit
    per emotion label.
    """

    def __init__(self, base_model_name, num_labels=8):
        super().__init__()
        self.config = AutoConfig.from_pretrained(base_model_name)
        self.base = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = getattr(encoded, "pooler_output", None)
        if pooled is None:
            # Backbone exposes no pooler — fall back to the first ([CLS])
            # token of the last hidden state.
            pooled = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(pooled))
# =========================================================
# 3. HELPER FUNCTIONS
# =========================================================
def clean_data(df):
    """Normalise an emotion dataset in place and return it.

    Guarantees every column named in LIST_LABEL exists and is float-typed
    (unparseable/missing values become 0.0).  When a ``text`` column is
    present, newlines are collapsed to spaces and surrounding whitespace is
    stripped.  The input frame is mutated as well as returned.
    """
    for label in LIST_LABEL:
        if label not in df.columns:
            df[label] = 0
        df[label] = pd.to_numeric(df[label], errors='coerce').fillna(0).astype(float)
    if "text" in df.columns:
        df["text"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
    return df
def get_active_model_path():
    """Return the path of the currently active model directory, or None.

    The active model is tracked via a pointer file containing a single path.
    A missing pointer file, or a pointer to a path that no longer exists,
    yields None.
    """
    if not os.path.exists(ACTIVE_MODEL_POINTER):
        return None
    with open(ACTIVE_MODEL_POINTER, "r") as handle:
        stored = handle.read().strip()
    return stored if os.path.exists(stored) else None
def set_active_model_path(path):
    """Persist *path* (str or Path) as the active model, overwriting the pointer file."""
    with open(ACTIVE_MODEL_POINTER, "w") as handle:
        handle.write(str(path))
# =========================================================
# 4. LOGIKA TRAINING (CPU)
# =========================================================
def run_training_generator(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune multilingual BERT on an uploaded CSV, on CPU.

    Generator yielding ``(log_text, active_model_label)`` pairs so the Gradio
    UI can stream progress.  On success the encoder and tokenizer are saved
    with HF conventions into DIR_TRAINED, the classifier head as a plain
    state_dict next to them, and DIR_TRAINED becomes the active model.

    Errors are reported through the yielded log text instead of raising, so
    the UI always gets a readable message.
    """
    yield "⏳ Membaca dataset...", None
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
    except Exception as e:
        yield f"❌ Error: {str(e)}", None
        return

    # Fail early with a clear message instead of a KeyError / ZeroDivisionError
    # further down when the CSV is empty or lacks a "text" column.
    if "text" not in df.columns or df.empty:
        yield "❌ Error: dataset kosong atau tidak memiliki kolom 'text'.", None
        return

    device = "cpu"
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    encodings = tokenizer(
        df["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    train_loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)

    model = ModelEmosi(model_name)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=float(lr))
    loss_fn = nn.BCEWithLogitsLoss()

    n_epochs = max(int(epochs), 1)       # guard against 0/negative epoch input
    steps_per_epoch = len(train_loader)  # >= 1 because df is non-empty

    log_text = f"🚀 Mulai Training CPU...\nData: {len(df)} baris\n"
    yield log_text, None

    model.train()
    for ep in range(n_epochs):
        total_loss = 0.0
        for step, batch in enumerate(train_loader):
            b_ids, b_mask, b_lbl = batch
            optimizer.zero_grad()
            out = model(b_ids, b_mask)
            loss = loss_fn(out, b_lbl)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if step % 5 == 0:
                progress((ep * steps_per_epoch + step) / (n_epochs * steps_per_epoch))
        avg_loss = total_loss / steps_per_epoch
        log_text += f"✅ Epoch {ep+1} | Loss: {avg_loss:.4f}\n"
        yield log_text, None

    model.base.save_pretrained(DIR_TRAINED)
    tokenizer.save_pretrained(DIR_TRAINED)
    torch.save(model.classifier.state_dict(), DIR_TRAINED / "classifier_head.pt")
    set_active_model_path(DIR_TRAINED)
    yield log_text + "\n🎉 Selesai & Disimpan!", "Model Lokal (Baru Dilatih)"
# =========================================================
# 5. LOGIKA UPLOAD (DARI COLAB)
# =========================================================
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP into DIR_UPLOADED and activate it.

    Returns a ``(log_message, active_model_label)`` pair for the UI; the
    label is None on failure so the status textbox is left unchanged.
    """
    if file_obj is None:
        return "❌ Tidak ada file.", None
    try:
        # Start from a clean slate so stale files never mix with the new model.
        if DIR_UPLOADED.exists():
            shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
        # NOTE(security): extractall trusts archive member paths, so a
        # malicious ZIP could write outside DIR_UPLOADED ("zip slip").
        # Tolerable only because uploads come from the operator's own
        # Colab export — revisit if uploads become untrusted.
        with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
            zip_ref.extractall(DIR_UPLOADED)
        # If the archive wrapped everything in one folder, flatten it so the
        # config/tokenizer files sit directly inside DIR_UPLOADED.
        entries = list(DIR_UPLOADED.iterdir())
        if len(entries) == 1 and entries[0].is_dir():
            subfolder = entries[0]
            for item in subfolder.iterdir():
                shutil.move(str(item), str(DIR_UPLOADED))
            subfolder.rmdir()
        set_active_model_path(DIR_UPLOADED)
        return f"✅ Model berhasil dimuat dari ZIP!\nLokasi: {DIR_UPLOADED}", "Model Upload (Dari Colab)"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
# =========================================================
# 6. LOGIKA PREDIKSI
# =========================================================
def load_model_inference():
    """Load the active model and tokenizer for CPU inference.

    Returns:
        (model, tokenizer): the ModelEmosi instance in eval mode and its
        tokenizer, both loaded from the active model directory.

    Raises:
        ValueError: when no model has been trained or uploaded yet.
    """
    path = get_active_model_path()
    if not path:
        raise ValueError("Belum ada model aktif.")
    path = Path(path)
    # Fix: the original also called AutoConfig.from_pretrained(path) into an
    # unused local — ModelEmosi already loads the config itself.
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = ModelEmosi(path)
    head_path = path / "classifier_head.pt"
    # Restore the fine-tuned classifier head; without this file the linear
    # layer keeps its random initialisation.
    if head_path.exists():
        model.classifier.load_state_dict(torch.load(head_path, map_location="cpu"))
    model.eval()
    return model, tokenizer
def predict_text(text):
    """Score one sentence; returns {label: probability} or {"Error": msg}."""
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        with torch.no_grad():
            logits = model(inputs["input_ids"], inputs["attention_mask"])
        scores = torch.sigmoid(logits).numpy()[0]
        return dict(zip(LIST_LABEL, map(float, scores)))
    except Exception as e:
        return {"Error": str(e)}
def predict_csv(file_obj, sep):
    """Run batch emotion prediction over a CSV and summarise the results.

    Returns a JSON-serialisable dict with the row count, the three highest
    scoring emotions on average, and the full per-label averages.  Failures
    (bad file, no active model, empty data) are reported as
    ``{"Error": message}`` instead of raising.
    """
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        # Fix: an empty dataset previously surfaced as a cryptic
        # ZeroDivisionError from the averaging step.
        if "text" not in df.columns or df.empty:
            return {"Error": "Dataset kosong atau tidak memiliki kolom 'text'."}
        model, tokenizer = load_model_inference()

        # Tokenize + score in chunks instead of one tokenizer/forward call
        # per row: far fewer Python round-trips, bounded memory.
        texts = df["text"].astype(str).tolist()
        chunk_size = 32
        prob_chunks = []
        for start in range(0, len(texts), chunk_size):
            inputs = tokenizer(
                texts[start:start + chunk_size],
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=128,
            )
            with torch.no_grad():
                logits = model(inputs["input_ids"], inputs["attention_mask"])
            prob_chunks.append(torch.sigmoid(logits).numpy())
        probs = np.concatenate(prob_chunks, axis=0)

        avg = {label: float(probs[:, i].mean()) for i, label in enumerate(LIST_LABEL)}
        top3 = sorted(avg.items(), key=lambda kv: kv[1], reverse=True)[:3]
        return {
            "Total Data": len(texts),
            "Top 3 Emosi": {k: round(v, 4) for k, v in top3},
            "Rata-rata": avg,
        }
    except Exception as e:
        return {"Error": str(e)}
# =========================================================
# 7. TAMPILAN ANTARMUKA (UI)
# =========================================================
# Build the Gradio interface: a global status bar plus two main tabs
# (training / model management, and testing).  Emoji literals below repair
# mojibake introduced by an encoding round-trip in the original file.
with gr.Blocks(title="Emotion AI Manager") as app:
    gr.Markdown("# 🎭 AI Emotion Classifier System")

    # Global status bar showing which model is currently active.
    lbl_active_model = gr.Textbox(
        label="Status Model Aktif",
        value="Belum ada model yang dipilih.",
        interactive=False,
    )

    # MAIN TAB 1: setup & training
    with gr.Tab("⚙️ Pelatihan & Model"):
        with gr.Tabs():
            # Sub-tab 1.1: upload a pretrained model (recommended path)
            with gr.Tab("📂 Upload Pretrained (Recommended)"):
                gr.Markdown("Gunakan model hasil training GPU (Colab) agar cepat.")
                in_zip = gr.File(label="Upload File .zip Model", file_types=[".zip"])
                btn_upload = gr.Button("Ekstrak & Aktifkan Model", variant="primary")
                out_log_upload = gr.Textbox(label="Log Sistem")
                btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
            # Sub-tab 1.2: manual CPU training
            with gr.Tab("🏋️‍♀️ Latihan Manual (CPU)"):
                gr.Markdown("⚠️ Lambat di Hugging Face Space. Gunakan data kecil saja.")
                with gr.Row():
                    in_csv = gr.File(label="Dataset CSV")
                    in_sep = gr.Textbox(label="Separator", value=";")
                with gr.Row():
                    in_ep = gr.Number(label="Epoch", value=1)
                    in_bs = gr.Number(label="Batch", value=4)
                    in_lr = gr.Number(label="LR", value=2e-5)
                btn_train = gr.Button("Mulai Latihan")
                out_log_train = gr.Textbox(label="Log Training", lines=6)
                btn_train.click(run_training_generator, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_active_model])

    # MAIN TAB 2: testing
    with gr.Tab("🧪 Pengujian (Testing)"):
        with gr.Tabs():
            # Sub-tab 2.1: single-sentence prediction
            with gr.Tab("📝 Uji Tunggal (Teks)"):
                in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...")
                btn_pred_txt = gr.Button("Prediksi Emosi")
                out_lbl = gr.Label(label="Confidence Score")
                btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
            # Sub-tab 2.2: batch CSV prediction
            with gr.Tab("📊 Uji Batch (CSV)"):
                in_csv_test = gr.File(label="Upload CSV Test")
                in_sep_test = gr.Textbox(label="Separator", value=";")
                btn_pred_csv = gr.Button("Analisis Batch")
                out_json = gr.JSON(label="Hasil Analisis")
                btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)

# queue() enables streaming of the training generator's yields.
app.queue().launch()