Darendra committed on
Commit
54584f7
Β·
verified Β·
1 Parent(s): 02adcda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -103
app.py CHANGED
@@ -1,203 +1,288 @@
1
  import os
2
  import torch
3
  import pandas as pd
4
- import numpy as np
5
  import gradio as gr
6
- import zipfile
7
  import shutil
8
- import sys
9
  from pathlib import Path
10
- from torch import nn
11
- from torch.utils.data import DataLoader, TensorDataset
12
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
13
 
14
  # =========================================================
15
- # 1. KONFIGURASI & SETUP
16
  # =========================================================
17
-
18
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
19
 
20
- # Setup Path
21
- def get_root_path():
22
- if getattr(sys, 'frozen', False):
23
- return Path(sys.executable).parent
24
- else:
25
- return Path(__file__).parent
26
-
27
- BASE_DIR = get_root_path()
28
- DIR_TRAINED = BASE_DIR / "saved_models" / "trained_local"
29
- DIR_UPLOADED = BASE_DIR / "saved_models" / "uploaded_colab"
30
- ACTIVE_MODEL_POINTER = BASE_DIR / "active_model_path.txt"
31
 
32
- DIR_TRAINED.mkdir(parents=True, exist_ok=True)
33
  DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
 
 
 
 
34
 
35
  # =========================================================
36
- # 2. HELPER FUNCTIONS
37
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def clean_data(df):
39
- # Cek kolom label dan tipenya
40
  for l in LIST_LABEL:
41
  if l not in df.columns: df[l] = 0
42
- # Fix format koma (1,00 -> 1.00)
43
  df[l] = df[l].astype(str).str.replace(',', '.', regex=False)
44
  df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
45
 
46
- # Bersihkan teks
47
  col_text = next((c for c in df.columns if c.lower() in ['text', 'kalimat', 'content', 'tweet']), None)
48
  if col_text:
49
  df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
50
  elif "text" in df.columns:
51
  df["text_clean"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
52
-
53
  return df
54
 
55
- def get_active_model_path():
56
- if os.path.exists(ACTIVE_MODEL_POINTER):
57
- with open(ACTIVE_MODEL_POINTER, "r") as f:
58
- path = f.read().strip()
59
- if os.path.exists(path): return path
60
- return None
61
-
62
- def set_active_model_path(path):
63
- with open(ACTIVE_MODEL_POINTER, "w") as f:
64
- f.write(str(path))
65
-
66
  # =========================================================
67
- # 3. LOGIKA UPLOAD
68
  # =========================================================
69
  def handle_zip_upload(file_obj):
 
 
70
  if file_obj is None: return "❌ Tidak ada file.", None
71
  try:
72
- # Bersihkan folder lama
73
  if DIR_UPLOADED.exists(): shutil.rmtree(DIR_UPLOADED)
74
  DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
75
 
76
  with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
77
  zip_ref.extractall(DIR_UPLOADED)
78
-
79
- # Handle jika zip membungkus folder (nested folder)
80
- # Cari file config.json untuk menentukan root folder model
81
- config_path = list(DIR_UPLOADED.rglob("config.json"))
82
 
83
- if not config_path:
84
- return "❌ Error: Tidak ditemukan config.json di dalam zip.", None
85
-
 
 
86
  final_model_path = config_path[0].parent
 
87
 
88
- # Simpan path yang valid
89
- set_active_model_path(final_model_path)
90
- return f"βœ… Model berhasil dimuat!\nLokasi: {final_model_path}", "Model Upload (Siap)"
91
  except Exception as e:
92
  return f"❌ Error unzip: {str(e)}", None
93
 
94
  # =========================================================
95
- # 4. LOGIKA PREDIKSI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # =========================================================
97
  def load_model_inference():
98
- path = get_active_model_path()
99
- if not path: raise ValueError("Belum ada model aktif. Upload dulu!")
 
 
 
100
 
101
- path = Path(path)
 
 
 
 
 
 
 
 
102
  try:
103
- tokenizer = AutoTokenizer.from_pretrained(str(path))
104
- model = AutoModelForSequenceClassification.from_pretrained(str(path))
105
  model.eval()
106
  return model, tokenizer
107
- except Exception as e:
108
- raise ValueError(f"Gagal load model: {e}")
 
109
 
110
  def predict_text(text):
111
  if not text: return None
112
  try:
113
  model, tokenizer = load_model_inference()
114
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
115
-
116
  with torch.no_grad():
117
  out = model(**inputs)
118
  probs = torch.sigmoid(out.logits).numpy()[0]
119
-
120
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
121
  except Exception as e:
122
  return {"Error": str(e)}
123
 
124
  def predict_csv(file_obj, sep):
125
  try:
126
- # Cek separator
127
- try:
128
- df = pd.read_csv(file_obj.name, sep=sep)
129
- except:
130
- df = pd.read_csv(file_obj.name, sep=",")
131
-
132
  df = clean_data(df)
 
133
  model, tokenizer = load_model_inference()
 
134
 
135
  results = []
136
- # Cek kolom text
137
- if "text_clean" not in df.columns: return {"Error": "Kolom teks tidak ditemukan"}
138
-
139
  for txt in df["text_clean"]:
140
  inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
141
  with torch.no_grad():
142
  out = model(**inputs)
143
  probs = torch.sigmoid(out.logits).numpy()[0]
144
-
145
  results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})
146
 
147
- # Hitung statistik
148
  avg = {l: 0.0 for l in LIST_LABEL}
149
  for r in results:
150
  for l,v in r.items(): avg[l] += v
151
  for l in avg: avg[l] /= len(results)
152
-
153
  top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
154
-
155
- return {
156
- "Total Data": len(results),
157
- "Top 3 Emosi Dominan": {k: round(v,4) for k,v in top3},
158
- "Rata-rata Skor": avg
159
- }
160
  except Exception as e:
161
  return {"Error": str(e)}
162
 
163
  # =========================================================
164
- # 5. TAMPILAN ANTARMUKA (UI GRADIO)
165
  # =========================================================
166
- with gr.Blocks(title="Emotion AI Manager") as app:
167
- gr.Markdown("# 🧠 AI Emotion Classifier System")
168
 
169
- # Status Bar Global
170
- lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model.", interactive=False)
171
 
172
  with gr.Tabs():
173
- # TAB 1: UPLOAD
174
- with gr.Tab("πŸ“‚ Upload Model"):
175
- gr.Markdown("Upload file `.zip` model hasil training.")
176
- in_zip = gr.File(label="Upload File .zip", file_types=[".zip"])
177
- btn_upload = gr.Button("Ekstrak & Aktifkan", variant="primary")
178
- out_log_upload = gr.Textbox(label="Log Sistem")
179
-
180
- btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
181
-
182
- # TAB 2: PENGUJIAN
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  with gr.Tab("πŸ§ͺ Testing"):
 
 
184
  with gr.Tabs():
185
- # Sub-Tab 2.1: Uji Tunggal
186
- with gr.Tab("πŸ“ Uji Tunggal"):
187
- in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...", lines=2)
188
- btn_pred_txt = gr.Button("Prediksi", variant="primary")
189
  out_lbl = gr.Label(label="Hasil Prediksi")
190
-
191
- btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
192
 
193
- # Sub-Tab 2.2: Uji Batch
194
  with gr.Tab("πŸ“Š Uji Batch (CSV)"):
195
  in_csv_test = gr.File(label="Upload CSV Test")
196
- in_sep_test = gr.Textbox(label="Separator", value=";")
197
- btn_pred_csv = gr.Button("Analisis Batch")
198
  out_json = gr.JSON(label="Hasil Analisis")
199
-
200
- btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)
201
 
202
  if __name__ == "__main__":
203
- app.queue().launch()
 
import os
import torch
import pandas as pd
import gradio as gr
import shutil
import zipfile
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# =========================================================
# 1. CONFIGURATION & VARIABLES
# =========================================================
# Emotion labels; column order here defines the output order of the classifier head.
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']

# Temporary storage folders (relative to the working directory):
# uploaded_zip holds extracted user-uploaded model ZIPs, trained_cloud holds
# checkpoints produced by in-app training.
DIR_UPLOADED = Path("temp_models/uploaded_zip")
DIR_TRAINED = Path("temp_models/trained_cloud")

DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
DIR_TRAINED.mkdir(parents=True, exist_ok=True)

# Global variable holding the filesystem path of the currently active model
# (set by handle_zip_upload / train_model_cloud; None means "no custom model yet").
active_model_path = None
26
 
27
  # =========================================================
28
+ # 2. HELPER & DATASET
29
  # =========================================================
30
class EmosiDataset(Dataset):
    """Torch dataset pairing cleaned text with multi-label emotion targets.

    Expects a dataframe that already went through clean_data(): it must have a
    "text_clean" column plus one float column per entry of LIST_LABEL.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Pre-extract targets and texts once so __getitem__ stays cheap.
        self.labels = df[LIST_LABEL].values
        self.texts = df["text_clean"].astype(str).tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        encoded = self.tokenizer(
            self.texts[item],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # Tokenizer returns (1, max_len) tensors; flatten to (max_len,) so the
        # DataLoader batches them into (batch, max_len).
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.float),
        }
        return sample
55
+
56
def clean_data(df):
    """Normalize a raw dataframe in place for training/inference.

    Ensures every emotion column in LIST_LABEL exists and is numeric
    (decimal commas such as "1,00" become "1.00"; unparsable values become
    0.0), and derives a whitespace-normalized "text_clean" column from the
    first recognized text column.

    Args:
        df: pandas DataFrame; mutated in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    for label in LIST_LABEL:
        if label not in df.columns:
            df[label] = 0
        # Accept decimal commas (e.g. "1,00") before numeric coercion.
        df[label] = df[label].astype(str).str.replace(',', '.', regex=False)
        df[label] = pd.to_numeric(df[label], errors='coerce').fillna(0).astype(float)

    # First column whose name looks like free text (case-insensitive match).
    # This already matches a literal "text" column, so the previous separate
    # `elif "text" in df.columns` fallback was unreachable and has been removed.
    col_text = next((c for c in df.columns if c.lower() in ['text', 'kalimat', 'content', 'tweet']), None)
    if col_text is not None:
        df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
    return df
68
 
 
 
 
 
 
 
 
 
 
 
 
69
  # =========================================================
70
+ # 3. UPLOAD ZIP
71
  # =========================================================
72
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP and make it the active model.

    Returns a (log message, status label) pair for the two Gradio outputs;
    the status output is left as None on failure so it is not overwritten.
    """
    global active_model_path

    if file_obj is None:
        return "❌ Tidak ada file.", None

    try:
        # Start from an empty extraction folder every time.
        if DIR_UPLOADED.exists():
            shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(file_obj.name, 'r') as archive:
            archive.extractall(DIR_UPLOADED)

        # An HF-style model folder is identified by its config.json; search
        # recursively in case the ZIP wraps the model in a nested directory.
        found = list(DIR_UPLOADED.rglob("config.json"))
        if not found:
            return "❌ Error: Tidak ditemukan config.json dalam ZIP.", None

        active_model_path = str(found[0].parent)
        return f"✅ Model ZIP Berhasil Dimuat!\nLokasi: {active_model_path}", "Status: Memakai Model Upload ZIP"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
94
 
95
  # =========================================================
96
+ # 4. TRAINING CLOUD
97
+ # =========================================================
98
def train_model_cloud(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune IndoBERT on an uploaded CSV and activate the result.

    Generator wired to a Gradio button: each `yield` is a
    (training log text, status label) pair streamed to the UI; the status
    element is left untouched (None) until training succeeds.
    NOTE(review): `progress=gr.Progress()` as a default argument is the
    documented Gradio progress-tracking idiom, not an accidental mutable
    default — confirm against the installed Gradio version.

    Args:
        file_obj:   Gradio file wrapper for the training CSV (has `.name`).
        sep:        CSV column separator.
        epochs:     number of training epochs (coerced via int()).
        batch_size: DataLoader batch size (coerced via int()).
        lr:         AdamW learning rate (coerced via float()).
        progress:   Gradio progress tracker for the per-step progress bar.
    """
    global active_model_path

    yield "⏳ Membaca dataset...", None
    if file_obj is None:
        yield "❌ File CSV belum diupload!", None
        return

    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        if "text_clean" not in df.columns:
            # clean_data() found no recognizable text column.
            yield "❌ Kolom teks tidak ditemukan.", None
            return

        # Fresh base checkpoint with a multi-label head sized to LIST_LABEL.
        MODEL_NAME = "indobenchmark/indobert-base-p1"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=len(LIST_LABEL), problem_type="multi_label_classification"
        )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        dataset = EmosiDataset(df, tokenizer)
        loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)
        optimizer = AdamW(model.parameters(), lr=float(lr))

        log_text = f"🚀 Mulai Training di {device}...\nData: {len(df)} baris.\n"
        yield log_text, None

        # Standard supervised fine-tuning loop; the model computes its own
        # multi-label (BCE) loss because `labels` is passed to forward().
        model.train()
        for ep in range(int(epochs)):
            total_loss = 0
            steps = len(loader)
            for i, batch in enumerate(loader):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                # Update the progress bar every 5 steps with the running mean loss.
                if i % 5 == 0:
                    progress((ep * steps + i) / (int(epochs) * steps), desc=f"Ep {ep+1} Loss: {total_loss/(i+1):.4f}")

            avg_loss = total_loss / steps
            log_text += f"✅ Epoch {ep+1}/{epochs} | Loss: {avg_loss:.4f}\n"
            yield log_text, None

        # Save the fine-tuned model, replacing any previous training output.
        yield log_text + "\n💾 Menyimpan model...", None
        if DIR_TRAINED.exists(): shutil.rmtree(DIR_TRAINED)
        DIR_TRAINED.mkdir(parents=True, exist_ok=True)

        model.save_pretrained(DIR_TRAINED)
        tokenizer.save_pretrained(DIR_TRAINED)

        # Point inference at the freshly trained checkpoint.
        active_model_path = str(DIR_TRAINED)
        yield log_text + f"\n🎉 Selesai! Model training aktif.", "Status: Memakai Model Hasil Training"

    except Exception as e:
        # Surface any failure (bad CSV, OOM, download error) in the UI log.
        yield f"❌ Error: {str(e)}", None
165
+
166
+ # =========================================================
167
+ # 5. LOAD & PREDIKSI
168
  # =========================================================
169
def _load_base_model():
    """Download and return the base IndoBERT checkpoint as (model, tokenizer)."""
    # num_labels=8 matches len(LIST_LABEL); the head is randomly initialized,
    # so this fallback gives usable (but untrained) scores.
    model = AutoModelForSequenceClassification.from_pretrained(
        "indobenchmark/indobert-base-p1", num_labels=8
    )
    model.eval()  # disable dropout so fallback inference is deterministic
    tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
    return model, tokenizer

def load_model_inference():
    """Resolve and load the model/tokenizer to use for inference.

    Priority:
      1. `active_model_path` (set by a ZIP upload or a cloud training run)
      2. a manually uploaded `model_default/` folder next to the app
      3. the base IndoBERT checkpoint downloaded from the Hub

    Returns:
        (model, tokenizer) with the model in eval mode.
    """
    global active_model_path

    if active_model_path and os.path.exists(active_model_path):
        target_path = active_model_path
    elif os.path.exists("model_default") and os.path.exists("model_default/config.json"):
        target_path = "model_default"
    else:
        return _load_base_model()

    try:
        tokenizer = AutoTokenizer.from_pretrained(target_path)
        model = AutoModelForSequenceClassification.from_pretrained(target_path)
        model.eval()
        return model, tokenizer
    except Exception:
        # Corrupt or incomplete local model folder — fall back to the base
        # checkpoint instead of crashing the UI. (Was a bare `except:`, which
        # also swallowed KeyboardInterrupt/SystemExit.)
        return _load_base_model()
193
 
194
def predict_text(text):
    """Score a single sentence against all emotion labels.

    Returns a label->probability dict, None for empty input, or an
    {"Error": message} dict when inference fails.
    """
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        with torch.no_grad():
            logits = model(**encoded).logits
            # Independent sigmoids: multi-label, so scores need not sum to 1.
            scores = torch.sigmoid(logits).numpy()[0]
        return {label: float(scores[i]) for i, label in enumerate(LIST_LABEL)}
    except Exception as e:
        return {"Error": str(e)}
205
 
206
def predict_csv(file_obj, sep):
    """Run batch inference over a CSV and summarize the label scores.

    Args:
        file_obj: Gradio file wrapper for the CSV (has `.name`).
        sep:      preferred column separator; falls back to "," on parse failure.

    Returns:
        {"Info": row count, "Dominan": top-3 average labels, "Detail": all
        averages} on success, or {"Error": message} on any failure.
    """
    try:
        # Try the user-supplied separator first, then fall back to a comma.
        try:
            df = pd.read_csv(file_obj.name, sep=sep)
        except Exception:  # was a bare `except:`, which hid Ctrl-C too
            df = pd.read_csv(file_obj.name, sep=",")
        df = clean_data(df)

        # Validate the dataframe BEFORE paying for the model load.
        if "text_clean" not in df.columns:
            return {"Error": "Kolom teks tidak ditemukan"}
        if df.empty:
            # Previously this fell through to a cryptic "division by zero".
            return {"Error": "CSV kosong"}

        model, tokenizer = load_model_inference()

        results = []
        for txt in df["text_clean"]:
            inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
            with torch.no_grad():
                out = model(**inputs)
                probs = torch.sigmoid(out.logits).numpy()[0]
            results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})

        # Average each label across rows, then report the three dominant ones.
        avg = {label: 0.0 for label in LIST_LABEL}
        for row_scores in results:
            for label, score in row_scores.items():
                avg[label] += score
        for label in avg:
            avg[label] /= len(results)
        top3 = sorted(avg.items(), key=lambda kv: kv[1], reverse=True)[:3]

        return {"Info": f"Total {len(results)} data", "Dominan": {k: round(v, 4) for k, v in top3}, "Detail": avg}
    except Exception as e:
        return {"Error": str(e)}
231
 
232
  # =========================================================
233
+ # 6. UI GRADIO
234
  # =========================================================
235
# Gradio UI: one status bar plus tabs for model configuration (upload/train)
# and testing (single sentence / batch CSV).
with gr.Blocks(title="IndoBERT Emotion Cloud") as app:
    gr.Markdown("# ☁️ IndoBERT Emotion Classifier")

    # Global status label updated by upload/training handlers.
    lbl_status = gr.Textbox(label="Status Model Aktif", value="Default (IndoBERT Base / Uploaded Manual)", interactive=False)

    with gr.Tabs():
        # === TAB 1: MODEL CONFIGURATION ===
        with gr.Tab("⚙️ Konfigurasi Model"):
            with gr.Tabs():

                # --- Sub tab 1: upload a pre-trained model ZIP ---
                with gr.Tab("📂 Unggah Model"):
                    gr.Markdown("Upload file `.zip` berisi model yang sudah dilatih (dari Komputer).")
                    in_zip = gr.File(label="File ZIP Model")
                    btn_upload = gr.Button("Ekstrak & Pakai Model", variant="primary")
                    out_log_upload = gr.Textbox(label="Log Sistem")

                    btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_status])

                # --- Sub tab 2: train in the cloud from a CSV ---
                with gr.Tab("🏋️‍♀️ Latih Model"):
                    gr.Markdown("Latih model baru menggunakan Dataset CSV sendiri di Cloud.")
                    with gr.Row():
                        in_csv = gr.File(label="Dataset CSV")
                        in_sep = gr.Textbox(label="Separator", value=";")
                    with gr.Row():
                        in_ep = gr.Number(label="Epoch", value=1, precision=0)
                        in_bs = gr.Number(label="Batch Size", value=4, precision=0)
                        in_lr = gr.Number(label="Learning Rate", value=2e-5)
                    btn_train = gr.Button("Mulai Training", variant="stop")
                    out_log_train = gr.Textbox(label="Log Training", lines=5)

                    btn_train.click(train_model_cloud, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_status])

        # === TAB 2: TESTING ===
        with gr.Tab("🧪 Testing"):
            gr.Markdown("Uji model yang sedang aktif.")

            with gr.Tabs():
                with gr.Tab("📝 Uji Satu Kalimat"):
                    in_txt = gr.Textbox(label="Masukkan Kalimat", lines=2, placeholder="Contoh: Saya sangat bahagia hari ini...")
                    btn_pred = gr.Button("Prediksi Emosi")
                    out_lbl = gr.Label(label="Hasil Prediksi")
                    btn_pred.click(predict_text, inputs=in_txt, outputs=out_lbl)

                with gr.Tab("📊 Uji Batch (CSV)"):
                    in_csv_test = gr.File(label="Upload CSV Test")
                    # Fix: previously this tab reused `in_sep` from the
                    # training tab, silently coupling the batch separator to a
                    # field on a different tab. Give it its own input.
                    in_sep_test = gr.Textbox(label="Separator", value=";")
                    btn_batch = gr.Button("Analisis Batch")
                    out_json = gr.JSON(label="Hasil Analisis")
                    btn_batch.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)

if __name__ == "__main__":
    # NOTE(review): train_model_cloud is a generator; streaming its yields
    # requires the Gradio queue, which recent Gradio enables by default on
    # launch() — confirm against the deployed Gradio version.
    app.launch()