Darendra committed on
Commit
02adcda
Β·
verified Β·
1 Parent(s): dafa625

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -172
app.py CHANGED
@@ -5,56 +5,51 @@ import numpy as np
5
  import gradio as gr
6
  import zipfile
7
  import shutil
 
8
  from pathlib import Path
9
  from torch import nn
10
  from torch.utils.data import DataLoader, TensorDataset
11
- from transformers import AutoTokenizer, AutoModel, AutoConfig
12
 
13
  # =========================================================
14
  # 1. KONFIGURASI & SETUP
15
  # =========================================================
16
- LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
17
- DIR_TRAINED = Path("saved_models/trained_local")
18
- DIR_UPLOADED = Path("saved_models/uploaded_colab")
19
 
20
- DIR_TRAINED.mkdir(parents=True, exist_ok=True)
21
- DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
22
 
23
- ACTIVE_MODEL_POINTER = "active_model_path.txt"
 
 
 
 
 
24
 
25
- # =========================================================
26
- # 2. ARSITEKTUR MODEL
27
- # =========================================================
28
- class ModelEmosi(nn.Module):
29
- def __init__(self, base_model_name, num_labels=8):
30
- super().__init__()
31
- # Load config agar fleksibel (bisa baca dari folder atau nama model)
32
- self.config = AutoConfig.from_pretrained(base_model_name)
33
- self.base = AutoModel.from_pretrained(base_model_name)
34
- self.dropout = nn.Dropout(0.3)
35
- self.classifier = nn.Linear(self.config.hidden_size, num_labels)
36
 
37
- def forward(self, input_ids, attention_mask):
38
- out = self.base(input_ids=input_ids, attention_mask=attention_mask)
39
- if hasattr(out, "pooler_output") and out.pooler_output is not None:
40
- x = out.pooler_output
41
- else:
42
- # Fallback jika model tidak punya pooler (misal DistilBERT)
43
- x = out.last_hidden_state[:, 0, :]
44
- return self.classifier(self.dropout(x))
45
 
46
  # =========================================================
47
- # 3. HELPER FUNCTIONS
48
  # =========================================================
49
  def clean_data(df):
50
- # Pastikan kolom label ada dan bertipe float
51
  for l in LIST_LABEL:
52
  if l not in df.columns: df[l] = 0
 
 
53
  df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
54
 
55
  # Bersihkan teks
56
- if "text" in df.columns:
57
- df["text"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
 
 
 
 
58
  return df
59
 
60
  def get_active_model_path():
@@ -69,108 +64,48 @@ def set_active_model_path(path):
69
  f.write(str(path))
70
 
71
  # =========================================================
72
- # 4. LOGIKA TRAINING (CPU - HANYA UNTUK DATA KECIL)
73
- # =========================================================
74
- def run_training_generator(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
75
- yield "⏳ Membaca dataset...", None
76
- try:
77
- df = pd.read_csv(file_obj.name, sep=sep)
78
- df = clean_data(df)
79
- except Exception as e:
80
- yield f"❌ Error: {str(e)}", None
81
- return
82
-
83
- device = "cpu"
84
- # Default model dasar untuk training manual di CPU
85
- model_name = "bert-base-multilingual-cased"
86
- tokenizer = AutoTokenizer.from_pretrained(model_name)
87
-
88
- def tokenize_fn(texts):
89
- return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
90
-
91
- encodings = tokenize_fn(df["text"].tolist())
92
- labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
93
- dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
94
- train_loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)
95
-
96
- model = ModelEmosi(model_name)
97
- model.to(device)
98
- optimizer = torch.optim.AdamW(model.parameters(), lr=float(lr))
99
- loss_fn = nn.BCEWithLogitsLoss()
100
-
101
- log_text = f"πŸš€ Mulai Training CPU...\nData: {len(df)} baris\n"
102
- yield log_text, None
103
-
104
- model.train()
105
- for ep in range(int(epochs)):
106
- total_loss = 0
107
- for step, batch in enumerate(train_loader):
108
- b_ids, b_mask, b_lbl = batch
109
- optimizer.zero_grad()
110
- out = model(b_ids, b_mask)
111
- loss = loss_fn(out, b_lbl)
112
- loss.backward()
113
- optimizer.step()
114
- total_loss += loss.item()
115
-
116
- # Update progress bar setiap 5 step
117
- if step % 5 == 0:
118
- progress((ep * len(train_loader) + step) / (int(epochs) * len(train_loader)))
119
-
120
- avg_loss = total_loss / len(train_loader)
121
- log_text += f"βœ… Epoch {ep+1} | Loss: {avg_loss:.4f}\n"
122
- yield log_text, None
123
-
124
- # Simpan Model
125
- model.base.save_pretrained(DIR_TRAINED)
126
- tokenizer.save_pretrained(DIR_TRAINED)
127
- torch.save(model.classifier.state_dict(), DIR_TRAINED / "classifier_head.pt")
128
-
129
- set_active_model_path(DIR_TRAINED)
130
- yield log_text + "\nπŸŽ‰ Selesai & Disimpan!", "Model Lokal (Baru Dilatih)"
131
-
132
- # =========================================================
133
- # 5. LOGIKA UPLOAD (DARI COLAB)
134
  # =========================================================
135
  def handle_zip_upload(file_obj):
136
  if file_obj is None: return "❌ Tidak ada file.", None
137
  try:
 
138
  if DIR_UPLOADED.exists(): shutil.rmtree(DIR_UPLOADED)
139
- DIR_UPLOADED.mkdir()
140
 
141
  with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
142
  zip_ref.extractall(DIR_UPLOADED)
143
 
144
- # Handle jika zip membungkus folder (bukan isi file langsung)
145
- files_in_dir = list(DIR_UPLOADED.iterdir())
146
- if len(files_in_dir) == 1 and files_in_dir[0].is_dir():
147
- subfolder = files_in_dir[0]
148
- for item in subfolder.iterdir():
149
- shutil.move(str(item), str(DIR_UPLOADED))
150
- subfolder.rmdir()
151
-
152
- set_active_model_path(DIR_UPLOADED)
153
- return f"βœ… Model berhasil dimuat dari ZIP!\nLokasi: {DIR_UPLOADED}", "Model Upload (Dari Colab)"
 
 
154
  except Exception as e:
155
  return f"❌ Error unzip: {str(e)}", None
156
 
157
  # =========================================================
158
- # 6. LOGIKA PREDIKSI
159
  # =========================================================
160
  def load_model_inference():
161
  path = get_active_model_path()
162
- if not path: raise ValueError("Belum ada model aktif.")
163
 
164
  path = Path(path)
165
- tokenizer = AutoTokenizer.from_pretrained(path)
166
- model = ModelEmosi(path)
167
-
168
- head_path = path / "classifier_head.pt"
169
- if head_path.exists():
170
- model.classifier.load_state_dict(torch.load(head_path, map_location="cpu"))
171
  model.eval()
172
-
173
- return model, tokenizer
 
174
 
175
  def predict_text(text):
176
  if not text: return None
@@ -179,8 +114,8 @@ def predict_text(text):
179
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
180
 
181
  with torch.no_grad():
182
- out = model(inputs["input_ids"], inputs["attention_mask"])
183
- probs = torch.sigmoid(out).numpy()[0]
184
 
185
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
186
  except Exception as e:
@@ -188,16 +123,25 @@ def predict_text(text):
188
 
189
  def predict_csv(file_obj, sep):
190
  try:
191
- df = pd.read_csv(file_obj.name, sep=sep)
 
 
 
 
 
192
  df = clean_data(df)
193
  model, tokenizer = load_model_inference()
194
 
195
  results = []
196
- for txt in df["text"]:
 
 
 
197
  inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
198
  with torch.no_grad():
199
- out = model(inputs["input_ids"], inputs["attention_mask"])
200
- probs = torch.sigmoid(out).numpy()[0]
 
201
  results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})
202
 
203
  # Hitung statistik
@@ -217,62 +161,43 @@ def predict_csv(file_obj, sep):
217
  return {"Error": str(e)}
218
 
219
  # =========================================================
220
- # 7. TAMPILAN ANTARMUKA (UI GRADIO)
221
  # =========================================================
222
  with gr.Blocks(title="Emotion AI Manager") as app:
223
- gr.Markdown("#AI Emotion Classifier System")
224
 
225
  # Status Bar Global
226
- lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model yang dipilih.", interactive=False)
227
-
228
- # TAB UTAMA 1: SETUP & PELATIHAN
229
- with gr.Tab("βš™οΈ Pelatihan & Model"):
230
- with gr.Tabs():
231
-
232
- # Sub-Tab 1.1: Upload Pretrained Model
233
- with gr.Tab("πŸ“‚ Upload Pretrained Model"):
234
- gr.Markdown("Sudah punya model terlatih? gunakan model hasil training model")
235
- in_zip = gr.File(label="Upload File .zip Model", file_types=[".zip"])
236
- btn_upload = gr.Button("Ekstrak & Aktifkan Model", variant="primary")
237
- out_log_upload = gr.Textbox(label="Log Sistem")
238
-
239
- btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
240
 
241
- # Sub-Tab 1.2: Latihan Manual
242
- with gr.Tab("πŸ‹οΈβ€β™€οΈ Latihan Manual"):
243
- gr.Markdown("Belum punya model? latih file csv [text;label emosi (1/0)]")
244
- with gr.Row():
245
- in_csv = gr.File(label="Dataset CSV")
246
- in_sep = gr.Textbox(label="Separator", value=";")
247
- with gr.Row():
248
- in_ep = gr.Number(label="Epoch", value=1)
249
- in_bs = gr.Number(label="Batch", value=4)
250
- in_lr = gr.Number(label="LR", value=2e-5)
251
-
252
- btn_train = gr.Button("Mulai Latihan")
253
- out_log_train = gr.Textbox(label="Log Training", lines=6)
254
-
255
- btn_train.click(run_training_generator, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_active_model])
256
-
257
- # TAB UTAMA 2: PENGUJIAN
258
- with gr.Tab("πŸ§ͺ Testing"):
259
- with gr.Tabs():
260
-
261
- # Sub-Tab 2.1: Uji Tunggal
262
- with gr.Tab("πŸ“ Uji Tunggal (Teks)"):
263
- in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...")
264
- btn_pred_txt = gr.Button("Prediksi Emosi")
265
- out_lbl = gr.Label(label="Confidence Score")
266
-
267
- btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
268
-
269
- # Sub-Tab 2.2: Uji Batch
270
- with gr.Tab("πŸ“Š Uji Batch (CSV)"):
271
- in_csv_test = gr.File(label="Upload CSV Test")
272
- in_sep_test = gr.Textbox(label="Separator", value=";")
273
- btn_pred_csv = gr.Button("Analisis Batch")
274
- out_json = gr.JSON(label="Hasil Analisis")
275
 
276
- btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)
277
-
278
- app.queue().launch()
 
 
 
 
 
 
 
 
 
5
  import gradio as gr
6
  import zipfile
7
  import shutil
8
+ import sys
9
  from pathlib import Path
10
  from torch import nn
11
  from torch.utils.data import DataLoader, TensorDataset
12
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
13
 
14
  # =========================================================
15
  # 1. KONFIGURASI & SETUP
16
  # =========================================================
 
 
 
17
 
18
+ LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
 
19
 
20
+ # Setup Path
21
+ def get_root_path():
22
+ if getattr(sys, 'frozen', False):
23
+ return Path(sys.executable).parent
24
+ else:
25
+ return Path(__file__).parent
26
 
27
+ BASE_DIR = get_root_path()
28
+ DIR_TRAINED = BASE_DIR / "saved_models" / "trained_local"
29
+ DIR_UPLOADED = BASE_DIR / "saved_models" / "uploaded_colab"
30
+ ACTIVE_MODEL_POINTER = BASE_DIR / "active_model_path.txt"
 
 
 
 
 
 
 
31
 
32
+ DIR_TRAINED.mkdir(parents=True, exist_ok=True)
33
+ DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
34
 
35
  # =========================================================
36
+ # 2. HELPER FUNCTIONS
37
  # =========================================================
38
  def clean_data(df):
39
+ # Cek kolom label dan tipenya
40
  for l in LIST_LABEL:
41
  if l not in df.columns: df[l] = 0
42
+ # Fix format koma (1,00 -> 1.00)
43
+ df[l] = df[l].astype(str).str.replace(',', '.', regex=False)
44
  df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
45
 
46
  # Bersihkan teks
47
+ col_text = next((c for c in df.columns if c.lower() in ['text', 'kalimat', 'content', 'tweet']), None)
48
+ if col_text:
49
+ df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
50
+ elif "text" in df.columns:
51
+ df["text_clean"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
52
+
53
  return df
54
 
55
  def get_active_model_path():
 
64
  f.write(str(path))
65
 
66
  # =========================================================
67
+ # 3. LOGIKA UPLOAD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # =========================================================
69
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP and mark it as the active model.

    Args:
        file_obj: Gradio file object (``.name`` is the temp file path),
            or None when nothing was uploaded.

    Returns:
        (log_message, status_label) tuple; status_label is None on failure.
    """
    if file_obj is None: return "❌ Tidak ada file.", None
    try:
        # Clear out any previously uploaded model first.
        if DIR_UPLOADED.exists(): shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
            zip_ref.extractall(DIR_UPLOADED)

        # Handle ZIPs that wrap the model in a nested folder:
        # locate config.json anywhere in the tree to find the model root.
        config_path = list(DIR_UPLOADED.rglob("config.json"))

        if not config_path:
            return "❌ Error: Tidak ditemukan config.json di dalam zip.", None

        final_model_path = config_path[0].parent

        # Persist the validated path as the active model pointer.
        set_active_model_path(final_model_path)
        return f"βœ… Model berhasil dimuat!\nLokasi: {final_model_path}", "Model Upload (Siap)"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
93
 
94
  # =========================================================
95
+ # 4. LOGIKA PREDIKSI
96
  # =========================================================
97
def load_model_inference():
    """Load the active model and tokenizer for inference.

    Reads the active model directory from the pointer file and loads a
    sequence-classification model + tokenizer from it.

    Returns:
        (model, tokenizer) tuple, with the model already in eval mode.

    Raises:
        ValueError: if no active model is set, or loading fails
            (original exception chained as the cause).
    """
    path = get_active_model_path()
    if not path: raise ValueError("Belum ada model aktif. Upload dulu!")

    path = Path(path)
    try:
        tokenizer = AutoTokenizer.from_pretrained(str(path))
        model = AutoModelForSequenceClassification.from_pretrained(str(path))
        model.eval()  # inference mode: disables dropout etc.
        return model, tokenizer
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"Gagal load model: {e}") from e
109
 
110
  def predict_text(text):
111
  if not text: return None
 
114
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
115
 
116
  with torch.no_grad():
117
+ out = model(**inputs)
118
+ probs = torch.sigmoid(out.logits).numpy()[0]
119
 
120
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
121
  except Exception as e:
 
123
 
124
  def predict_csv(file_obj, sep):
125
  try:
126
+ # Cek separator
127
+ try:
128
+ df = pd.read_csv(file_obj.name, sep=sep)
129
+ except:
130
+ df = pd.read_csv(file_obj.name, sep=",")
131
+
132
  df = clean_data(df)
133
  model, tokenizer = load_model_inference()
134
 
135
  results = []
136
+ # Cek kolom text
137
+ if "text_clean" not in df.columns: return {"Error": "Kolom teks tidak ditemukan"}
138
+
139
+ for txt in df["text_clean"]:
140
  inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
141
  with torch.no_grad():
142
+ out = model(**inputs)
143
+ probs = torch.sigmoid(out.logits).numpy()[0]
144
+
145
  results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})
146
 
147
  # Hitung statistik
 
161
  return {"Error": str(e)}
162
 
163
  # =========================================================
164
+ # 5. TAMPILAN ANTARMUKA (UI GRADIO)
165
  # =========================================================
166
  with gr.Blocks(title="Emotion AI Manager") as app:
167
+ gr.Markdown("# 🧠 AI Emotion Classifier System")
168
 
169
  # Status Bar Global
170
+ lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model.", interactive=False)
171
+
172
+ with gr.Tabs():
173
+ # TAB 1: UPLOAD
174
+ with gr.Tab("πŸ“‚ Upload Model"):
175
+ gr.Markdown("Upload file `.zip` model hasil training.")
176
+ in_zip = gr.File(label="Upload File .zip", file_types=[".zip"])
177
+ btn_upload = gr.Button("Ekstrak & Aktifkan", variant="primary")
178
+ out_log_upload = gr.Textbox(label="Log Sistem")
 
 
 
 
 
179
 
180
+ btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
181
+
182
+ # TAB 2: PENGUJIAN
183
+ with gr.Tab("πŸ§ͺ Testing"):
184
+ with gr.Tabs():
185
+ # Sub-Tab 2.1: Uji Tunggal
186
+ with gr.Tab("πŸ“ Uji Tunggal"):
187
+ in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...", lines=2)
188
+ btn_pred_txt = gr.Button("Prediksi", variant="primary")
189
+ out_lbl = gr.Label(label="Hasil Prediksi")
190
+
191
+ btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ # Sub-Tab 2.2: Uji Batch
194
+ with gr.Tab("πŸ“Š Uji Batch (CSV)"):
195
+ in_csv_test = gr.File(label="Upload CSV Test")
196
+ in_sep_test = gr.Textbox(label="Separator", value=";")
197
+ btn_pred_csv = gr.Button("Analisis Batch")
198
+ out_json = gr.JSON(label="Hasil Analisis")
199
+
200
+ btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)
201
+
202
+ if __name__ == "__main__":
203
+ app.queue().launch()