Darendra committed on
Commit
2faddd5
·
verified ·
1 Parent(s): d235f02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -45
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================================
2
- # EMOTION CLASSIFIER
3
  # ==============================================================
4
  import os
5
  import math
@@ -7,10 +7,9 @@ import torch
7
  import pandas as pd
8
  import numpy as np
9
  import gradio as gr
10
- import matplotlib.pyplot as plt
11
  from pathlib import Path
12
  from torch import nn
13
- from torch.utils.data import Dataset, DataLoader, TensorDataset
14
  from sklearn.model_selection import train_test_split
15
  from transformers import (
16
  AutoTokenizer,
@@ -23,29 +22,24 @@ from transformers import (
23
  # CONFIG
24
  # =========================================================
25
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
26
- LABEL2ID = {l:i for i,l in enumerate(LIST_LABEL)}
27
- ID2LABEL = {i:l for i,l in enumerate(LIST_LABEL)}
28
 
29
  FOLDER_MODEL = Path("saved_models")
30
  FOLDER_MODEL.mkdir(exist_ok=True)
31
 
32
  # ==============================================================
33
- # File & Utils
34
  # ==============================================================
35
  def read_file_upload(file_obj):
36
  """Handle file upload dari Gradio."""
37
  if file_obj is None:
38
  raise ValueError("File belum diupload.")
39
 
40
- # Kalau inputnya string path
41
  if isinstance(file_obj, str):
42
  return file_obj
43
 
44
- # Kalau inputnya object file (Gradio baru)
45
  if hasattr(file_obj, "name"):
46
  return file_obj.name
47
 
48
- # Kalau binary stream
49
  if hasattr(file_obj, "read"):
50
  temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
51
  with open(temp_path, "wb") as f:
@@ -54,7 +48,6 @@ def read_file_upload(file_obj):
54
 
55
  raise ValueError("Tipe file tidak didukung.")
56
 
57
- # --- FUNGSI YANG DIUBAH (LEBIH SINGKAT) ---
58
def save_last_model(name):
    """Record *name* as the most recently saved model folder."""
    marker = FOLDER_MODEL / "last_model_name.txt"
    marker.write_text(name)
60
 
@@ -63,7 +56,6 @@ def load_last_model():
63
  if path_file.exists():
64
  return path_file.read_text().strip()
65
  return None
66
- # ------------------------------------------
67
 
68
def get_model_path(model_name):
    """Return a filesystem-safe folder path for *model_name* under FOLDER_MODEL."""
    safe_name = model_name.replace("/", "_")
    return FOLDER_MODEL / safe_name
@@ -72,10 +64,18 @@ def get_model_path(model_name):
72
  # Data Cleaning
73
  # ==============================================================
74
def clean_labels(df):
    """Ensure every emotion label column exists and holds numeric data.

    Missing label columns are added as 0. Existing columns are force-coerced
    to float: non-numeric or empty cells become NaN and are then filled with 0.
    Without the coercion, CSVs with string/empty label cells leave the columns
    as object dtype and torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
    fails downstream.

    Args:
        df: pandas DataFrame that may or may not contain the LIST_LABEL columns.

    Returns:
        The same DataFrame, mutated in place, with all label columns as float.
    """
    for l in LIST_LABEL:
        if l not in df.columns:
            df[l] = 0
        # Coerce to numeric: bad values -> NaN -> 0.0, guaranteeing float dtype.
        df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
    return df
80
 
81
  def clean_text(df, col="text"):
@@ -89,7 +89,6 @@ def clean_text(df, col="text"):
89
  # Model Architecture
90
  # =========================================================
91
  class ModelEmosi(nn.Module):
92
- """Backbone BERT + Classifier Head."""
93
  def __init__(self, base_model_name, num_labels=8):
94
  super().__init__()
95
  self.config = AutoConfig.from_pretrained(base_model_name)
@@ -111,7 +110,7 @@ class ModelEmosi(nn.Module):
111
  return self.classifier(x)
112
 
113
  # ==============================================================
114
- # Tokenizer & Dataset
115
  # ==============================================================
116
  def tokenize_batch(texts, tokenizer, max_len=128):
117
  return tokenizer(
@@ -124,6 +123,8 @@ def tokenize_batch(texts, tokenizer, max_len=128):
124
 
125
  def create_dataset(df, tokenizer, max_len=128):
126
  encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
 
 
127
  labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
128
 
129
  return TensorDataset(
@@ -133,10 +134,9 @@ def create_dataset(df, tokenizer, max_len=128):
133
  )
134
 
135
  # ==============================================================
136
- # Weights
137
  # ==============================================================
138
  def hitung_pos_weight(df):
139
- """Biar adil kalau datanya imbalanced."""
140
  counts = df[LIST_LABEL].sum(axis=0)
141
  N = len(df)
142
  pw = []
@@ -145,15 +145,13 @@ def hitung_pos_weight(df):
145
  return torch.tensor(pw, dtype=torch.float)
146
 
147
  # ==============================================================
148
- # Save & Load Logic
149
  # ==============================================================
150
def save_model(model, tokenizer, folder):
    """Persist backbone, tokenizer and classifier head, then mark *folder* as last used."""
    os.makedirs(folder, exist_ok=True)
    # HuggingFace-native serialization for the backbone and tokenizer.
    model.base.save_pretrained(folder)
    tokenizer.save_pretrained(folder)
    # The custom classifier head is saved separately as a plain state dict.
    head_path = Path(folder) / "classifier_head.pt"
    torch.save(model.classifier.state_dict(), str(head_path))
    save_last_model(str(folder))
158
 
159
  def load_model(folder):
@@ -172,6 +170,7 @@ def load_model(folder):
172
  # ==============================================================
173
  def jalankan_training(
174
  df,
 
175
  model_name="bert-base-multilingual-cased",
176
  epochs=3,
177
  batch_size=8,
@@ -183,6 +182,12 @@ def jalankan_training(
183
  freeze_layers=6,
184
  device=None
185
  ):
 
 
 
 
 
 
186
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
187
  tokenizer = AutoTokenizer.from_pretrained(model_name)
188
 
@@ -208,6 +213,7 @@ def jalankan_training(
208
  model = ModelEmosi(model_name)
209
  model.to(device)
210
 
 
211
  for name, param in model.base.named_parameters():
212
  if name.startswith("embeddings."):
213
  param.requires_grad = False
@@ -242,10 +248,17 @@ def jalankan_training(
242
  history = {"train_loss": [], "val_loss": []}
243
  save_path = str(get_model_path(model_name))
244
 
 
 
245
  for ep in range(1, epochs+1):
 
 
 
 
246
  model.train()
247
  total_train_loss = 0
248
 
 
249
  for input_ids, mask, labels in train_loader:
250
  input_ids = input_ids.to(device)
251
  mask = mask.to(device)
@@ -264,6 +277,7 @@ def jalankan_training(
264
  avg_train_loss = total_train_loss / len(train_loader.dataset)
265
  history["train_loss"].append(avg_train_loss)
266
 
 
267
  model.eval()
268
  total_val_loss = 0
269
  with torch.no_grad():
@@ -278,28 +292,32 @@ def jalankan_training(
278
  avg_val_loss = total_val_loss / len(val_loader.dataset)
279
  history["val_loss"].append(avg_val_loss)
280
 
281
- print(f"Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}")
 
282
 
283
  if avg_val_loss < best_val_loss:
284
  best_val_loss = avg_val_loss
285
  no_improve = 0
286
  save_model(model, tokenizer, save_path)
287
- print(f"Best model saved to {save_path}")
288
  else:
289
  no_improve += 1
290
- if no_improve >= patience:
291
- print("Early stopping triggered.")
292
- break
 
 
 
 
 
293
 
294
- return model, tokenizer, history
295
 
296
  # ==============================================================
297
  # PREDICTION
298
  # ==============================================================
299
  def predict_satu(text, folder=None):
300
- # Update panggilan fungsi di sini
301
  folder = folder or load_last_model()
302
-
303
  if folder is None:
304
  return {"Error": "Belum ada model yang dilatih."}
305
 
@@ -320,9 +338,7 @@ def predict_satu(text, folder=None):
320
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
321
 
322
  def predict_batch(text_list, folder=None, batch_size=32):
323
- # Update panggilan fungsi di sini
324
  folder = folder or load_last_model()
325
-
326
  if folder is None:
327
  return []
328
 
@@ -338,7 +354,6 @@ def predict_batch(text_list, folder=None, batch_size=32):
338
  max_length=128,
339
  return_tensors="pt"
340
  )
341
-
342
  with torch.no_grad():
343
  out = model(encoded["input_ids"], encoded["attention_mask"])
344
  probs = torch.sigmoid(out).numpy()
@@ -372,10 +387,11 @@ def summarize_result(preds):
372
  }
373
 
374
  # ==============================================================
375
- # GRADIO UI
376
  # ==============================================================
377
  def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
378
- max_len, wd, warmup, pat, freeze):
 
379
 
380
  csv_path = read_file_upload(file_obj)
381
  df = pd.read_csv(csv_path, sep=sep)
@@ -383,8 +399,12 @@ def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
383
  df = clean_labels(df)
384
  df = clean_text(df)
385
 
386
- _, _, history = jalankan_training(
 
 
 
387
  df=df,
 
388
  model_name=model_name,
389
  epochs=int(epoch),
390
  batch_size=int(batch),
@@ -394,13 +414,17 @@ def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
394
  warmup_ratio=float(warmup),
395
  patience=int(pat),
396
  freeze_layers=int(freeze)
397
- )
398
-
399
- return {
400
- "status": "Training Selesai!",
401
- "history": history,
402
- "model_used": model_name
403
- }
 
 
 
 
404
 
405
def wrapper_predict_satu(text):
    """Gradio handler: delegate single-text emotion prediction to predict_satu."""
    return predict_satu(text)
@@ -408,15 +432,13 @@ def wrapper_predict_satu(text):
408
def wrapper_predict_dataset(file_obj, sep, batch_size):
    """Gradio handler: run batch emotion prediction over an uploaded CSV.

    Reads the uploaded file, normalizes label and text columns, predicts in
    batches, and returns the aggregated summary.
    """
    path = read_file_upload(file_obj)
    frame = pd.read_csv(path, sep=sep)
    frame = clean_text(clean_labels(frame))
    predictions = predict_batch(frame["text"].tolist(), batch_size=int(batch_size))
    return summarize_result(predictions)
417
 
418
  # ==============================================================
419
- # INTERFACE
420
  # ==============================================================
421
  with gr.Blocks() as app:
422
  gr.Markdown("## Emotion Classifier — IndoBERT / Multilingual")
@@ -447,13 +469,17 @@ with gr.Blocks() as app:
447
  in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
448
 
449
  btn_train = gr.Button("Mulai Training", variant="primary")
450
- out_train = gr.JSON(label="Training Log")
 
 
 
 
451
 
452
  btn_train.click(
453
  wrapper_training,
454
  inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
455
  in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
456
- outputs=out_train
457
  )
458
 
459
  with gr.Tab("Tes Satu Kalimat"):
 
1
  # ==============================================================
2
+ # KLASIFIKASI EMOSI
3
  # ==============================================================
4
  import os
5
  import math
 
7
  import pandas as pd
8
  import numpy as np
9
  import gradio as gr
 
10
  from pathlib import Path
11
  from torch import nn
12
+ from torch.utils.data import DataLoader, TensorDataset
13
  from sklearn.model_selection import train_test_split
14
  from transformers import (
15
  AutoTokenizer,
 
22
  # CONFIG
23
  # =========================================================
24
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
 
 
25
 
26
  FOLDER_MODEL = Path("saved_models")
27
  FOLDER_MODEL.mkdir(exist_ok=True)
28
 
29
  # ==============================================================
30
+ # File & Utils
31
  # ==============================================================
32
  def read_file_upload(file_obj):
33
  """Handle file upload dari Gradio."""
34
  if file_obj is None:
35
  raise ValueError("File belum diupload.")
36
 
 
37
  if isinstance(file_obj, str):
38
  return file_obj
39
 
 
40
  if hasattr(file_obj, "name"):
41
  return file_obj.name
42
 
 
43
  if hasattr(file_obj, "read"):
44
  temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
45
  with open(temp_path, "wb") as f:
 
48
 
49
  raise ValueError("Tipe file tidak didukung.")
50
 
 
51
  def save_last_model(name):
52
  (FOLDER_MODEL / "last_model_name.txt").write_text(name)
53
 
 
56
  if path_file.exists():
57
  return path_file.read_text().strip()
58
  return None
 
59
 
60
  def get_model_path(model_name):
61
  return FOLDER_MODEL / model_name.replace("/", "_")
 
64
  # Data Cleaning
65
  # ==============================================================
66
def clean_labels(df):
    """
    1. Fill missing label columns with 0.
    2. Force label dtype to numeric (float), never object/string.
    """
    for l in LIST_LABEL:
        if l not in df.columns:
            df[l] = 0

        # --- MAIN FIX HERE ---
        # Force numeric conversion: errors (text/empty cells) become NaN, then 0.
        df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)

    return df
80
 
81
  def clean_text(df, col="text"):
 
89
  # Model Architecture
90
  # =========================================================
91
  class ModelEmosi(nn.Module):
 
92
  def __init__(self, base_model_name, num_labels=8):
93
  super().__init__()
94
  self.config = AutoConfig.from_pretrained(base_model_name)
 
110
  return self.classifier(x)
111
 
112
  # ==============================================================
113
+ # Tokenizer & Dataset
114
  # ==============================================================
115
  def tokenize_batch(texts, tokenizer, max_len=128):
116
  return tokenizer(
 
123
 
124
  def create_dataset(df, tokenizer, max_len=128):
125
  encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
126
+
127
+ # Karena sudah dibersihkan di clean_labels, ini aman
128
  labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
129
 
130
  return TensorDataset(
 
134
  )
135
 
136
  # ==============================================================
137
+ # Weights
138
  # ==============================================================
139
  def hitung_pos_weight(df):
 
140
  counts = df[LIST_LABEL].sum(axis=0)
141
  N = len(df)
142
  pw = []
 
145
  return torch.tensor(pw, dtype=torch.float)
146
 
147
  # ==============================================================
148
+ # Save & Load Logic
149
  # ==============================================================
150
  def save_model(model, tokenizer, folder):
151
  os.makedirs(folder, exist_ok=True)
152
  model.base.save_pretrained(folder)
153
  tokenizer.save_pretrained(folder)
154
  torch.save(model.classifier.state_dict(), str(Path(folder) / "classifier_head.pt"))
 
 
155
  save_last_model(str(folder))
156
 
157
  def load_model(folder):
 
170
  # ==============================================================
171
  def jalankan_training(
172
  df,
173
+ progress_bar=None, # Tambahan untuk Gradio Progress
174
  model_name="bert-base-multilingual-cased",
175
  epochs=3,
176
  batch_size=8,
 
182
  freeze_layers=6,
183
  device=None
184
  ):
185
+ """
186
+ Fungsi ini diubah menjadi Generator (yield) agar bisa streaming log ke UI.
187
+ """
188
+ # 1. Yield pesan awal
189
+ yield "Mempersiapkan dataset dan tokenizer...", None
190
+
191
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
192
  tokenizer = AutoTokenizer.from_pretrained(model_name)
193
 
 
213
  model = ModelEmosi(model_name)
214
  model.to(device)
215
 
216
+ # Freeze layers logic
217
  for name, param in model.base.named_parameters():
218
  if name.startswith("embeddings."):
219
  param.requires_grad = False
 
248
  history = {"train_loss": [], "val_loss": []}
249
  save_path = str(get_model_path(model_name))
250
 
251
+ yield f"Mulai Training di device: {device}\nTotal Steps: {total_steps}", None
252
+
253
  for ep in range(1, epochs+1):
254
+ # Update progress bar Gradio (jika ada)
255
+ if progress_bar:
256
+ progress_bar(float(ep)/epochs, desc=f"Epoch {ep}/{epochs}")
257
+
258
  model.train()
259
  total_train_loss = 0
260
 
261
+ # Loop batch
262
  for input_ids, mask, labels in train_loader:
263
  input_ids = input_ids.to(device)
264
  mask = mask.to(device)
 
277
  avg_train_loss = total_train_loss / len(train_loader.dataset)
278
  history["train_loss"].append(avg_train_loss)
279
 
280
+ # Validation
281
  model.eval()
282
  total_val_loss = 0
283
  with torch.no_grad():
 
292
  avg_val_loss = total_val_loss / len(val_loader.dataset)
293
  history["val_loss"].append(avg_val_loss)
294
 
295
+ # LOGGING MESSAGE
296
+ log_msg = f"✅ Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}"
297
 
298
  if avg_val_loss < best_val_loss:
299
  best_val_loss = avg_val_loss
300
  no_improve = 0
301
  save_model(model, tokenizer, save_path)
302
+ log_msg += " --> (Model Saved 💾)"
303
  else:
304
  no_improve += 1
305
+ log_msg += f" --> (No Improve: {no_improve}/{patience})"
306
+
307
+ # Yield log per epoch
308
+ yield log_msg, None
309
+
310
+ if no_improve >= patience:
311
+ yield "⛔ Early stopping triggered.", None
312
+ break
313
 
314
+ yield "Training Selesai! 🎉", history
315
 
316
  # ==============================================================
317
  # PREDICTION
318
  # ==============================================================
319
  def predict_satu(text, folder=None):
 
320
  folder = folder or load_last_model()
 
321
  if folder is None:
322
  return {"Error": "Belum ada model yang dilatih."}
323
 
 
338
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
339
 
340
  def predict_batch(text_list, folder=None, batch_size=32):
 
341
  folder = folder or load_last_model()
 
342
  if folder is None:
343
  return []
344
 
 
354
  max_length=128,
355
  return_tensors="pt"
356
  )
 
357
  with torch.no_grad():
358
  out = model(encoded["input_ids"], encoded["attention_mask"])
359
  probs = torch.sigmoid(out).numpy()
 
387
  }
388
 
389
  # ==============================================================
390
+ # GRADIO UI
391
  # ==============================================================
392
  def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
393
+ max_len, wd, warmup, pat, freeze,
394
+ progress=gr.Progress()): # Tambahkan progress bar object
395
 
396
  csv_path = read_file_upload(file_obj)
397
  df = pd.read_csv(csv_path, sep=sep)
 
399
  df = clean_labels(df)
400
  df = clean_text(df)
401
 
402
+ accumulated_log = ""
403
+
404
+ # Memanggil generator jalankan_training
405
+ for log_msg, history_result in jalankan_training(
406
  df=df,
407
+ progress_bar=progress, # Kirim progress bar ke backend
408
  model_name=model_name,
409
  epochs=int(epoch),
410
  batch_size=int(batch),
 
414
  warmup_ratio=float(warmup),
415
  patience=int(pat),
416
  freeze_layers=int(freeze)
417
+ ):
418
+ # Update log text real-time
419
+ accumulated_log += log_msg + "\n"
420
+
421
+ # Jika training selesai, history_result tidak None
422
+ if history_result is not None:
423
+ # Yield terakhir: log penuh + JSON history
424
+ yield accumulated_log, history_result
425
+ else:
426
+ # Yield proses: log berjalan + JSON kosong/null
427
+ yield accumulated_log, None
428
 
429
  def wrapper_predict_satu(text):
430
  return predict_satu(text)
 
432
  def wrapper_predict_dataset(file_obj, sep, batch_size):
433
  csv_path = read_file_upload(file_obj)
434
  df = pd.read_csv(csv_path, sep=sep)
 
435
  df = clean_labels(df)
436
  df = clean_text(df)
 
437
  preds = predict_batch(df["text"].tolist(), batch_size=int(batch_size))
438
  return summarize_result(preds)
439
 
440
  # ==============================================================
441
+ # INTERFACE
442
  # ==============================================================
443
  with gr.Blocks() as app:
444
  gr.Markdown("## Emotion Classifier — IndoBERT / Multilingual")
 
469
  in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
470
 
471
  btn_train = gr.Button("Mulai Training", variant="primary")
472
+
473
+ # OUTPUT: DUA KOLOM (Log Teks & Hasil JSON)
474
+ with gr.Row():
475
+ out_log = gr.Textbox(label="Log Latihan (Real-time)", lines=10, interactive=False)
476
+ out_result = gr.JSON(label="Hasil Akhir (History)")
477
 
478
  btn_train.click(
479
  wrapper_training,
480
  inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
481
  in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
482
+ outputs=[out_log, out_result] # Output ke dua komponen
483
  )
484
 
485
  with gr.Tab("Tes Satu Kalimat"):