Darendra committed on
Commit
2faddd5
·
verified ·
1 Parent(s): d235f02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -45
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================================
2
- # EMOTION CLASSIFIER
3
  # ==============================================================
4
  import os
5
  import math
@@ -7,10 +7,9 @@ import torch
7
  import pandas as pd
8
  import numpy as np
9
  import gradio as gr
10
- import matplotlib.pyplot as plt
11
  from pathlib import Path
12
  from torch import nn
13
- from torch.utils.data import Dataset, DataLoader, TensorDataset
14
  from sklearn.model_selection import train_test_split
15
  from transformers import (
16
  AutoTokenizer,
@@ -23,29 +22,24 @@ from transformers import (
23
  # CONFIG
24
  # =========================================================
25
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
26
- LABEL2ID = {l:i for i,l in enumerate(LIST_LABEL)}
27
- ID2LABEL = {i:l for i,l in enumerate(LIST_LABEL)}
28
 
29
  FOLDER_MODEL = Path("saved_models")
30
  FOLDER_MODEL.mkdir(exist_ok=True)
31
 
32
  # ==============================================================
33
- # File & Utils
34
  # ==============================================================
35
  def read_file_upload(file_obj):
36
  """Handle file upload dari Gradio."""
37
  if file_obj is None:
38
  raise ValueError("File belum diupload.")
39
 
40
- # Kalau inputnya string path
41
  if isinstance(file_obj, str):
42
  return file_obj
43
 
44
- # Kalau inputnya object file (Gradio baru)
45
  if hasattr(file_obj, "name"):
46
  return file_obj.name
47
 
48
- # Kalau binary stream
49
  if hasattr(file_obj, "read"):
50
  temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
51
  with open(temp_path, "wb") as f:
@@ -54,7 +48,6 @@ def read_file_upload(file_obj):
54
 
55
  raise ValueError("Tipe file tidak didukung.")
56
 
57
- # --- FUNGSI YANG DIUBAH (LEBIH SINGKAT) ---
58
def save_last_model(name):
    """Record *name* as the most recently saved model folder."""
    marker = FOLDER_MODEL / "last_model_name.txt"
    marker.write_text(name)
60
 
@@ -63,7 +56,6 @@ def load_last_model():
63
  if path_file.exists():
64
  return path_file.read_text().strip()
65
  return None
66
- # ------------------------------------------
67
 
68
def get_model_path(model_name):
    """Return a filesystem-safe folder path for *model_name* under FOLDER_MODEL."""
    safe_name = model_name.replace("/", "_")
    return FOLDER_MODEL / safe_name
@@ -72,10 +64,18 @@ def get_model_path(model_name):
72
  # Data Cleaning
73
  # ==============================================================
74
def clean_labels(df):
    """Ensure every emotion label column exists and holds numeric data.

    Missing label columns are added as 0. Existing columns are force-coerced
    to float: non-numeric or empty cells become NaN and are then filled with 0.
    Without the coercion, CSVs with string/empty label cells leave the columns
    as object dtype and torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
    fails downstream.

    Args:
        df: pandas DataFrame that may or may not contain the LIST_LABEL columns.

    Returns:
        The same DataFrame, mutated in place, with all label columns as float.
    """
    for l in LIST_LABEL:
        if l not in df.columns:
            df[l] = 0
        # Coerce to numeric: bad values -> NaN -> 0.0, guaranteeing float dtype.
        df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
    return df
80
 
81
  def clean_text(df, col="text"):
@@ -89,7 +89,6 @@ def clean_text(df, col="text"):
89
  # Model Architecture
90
  # =========================================================
91
  class ModelEmosi(nn.Module):
92
- """Backbone BERT + Classifier Head."""
93
  def __init__(self, base_model_name, num_labels=8):
94
  super().__init__()
95
  self.config = AutoConfig.from_pretrained(base_model_name)
@@ -111,7 +110,7 @@ class ModelEmosi(nn.Module):
111
  return self.classifier(x)
112
 
113
  # ==============================================================
114
- # Tokenizer & Dataset
115
  # ==============================================================
116
  def tokenize_batch(texts, tokenizer, max_len=128):
117
  return tokenizer(
@@ -124,6 +123,8 @@ def tokenize_batch(texts, tokenizer, max_len=128):
124
 
125
  def create_dataset(df, tokenizer, max_len=128):
126
  encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
 
 
127
  labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
128
 
129
  return TensorDataset(
@@ -133,10 +134,9 @@ def create_dataset(df, tokenizer, max_len=128):
133
  )
134
 
135
  # ==============================================================
136
- # Weights
137
  # ==============================================================
138
  def hitung_pos_weight(df):
139
- """Biar adil kalau datanya imbalanced."""
140
  counts = df[LIST_LABEL].sum(axis=0)
141
  N = len(df)
142
  pw = []
@@ -145,15 +145,13 @@ def hitung_pos_weight(df):
145
  return torch.tensor(pw, dtype=torch.float)
146
 
147
  # ==============================================================
148
- # Save & Load Logic
149
  # ==============================================================
150
def save_model(model, tokenizer, folder):
    """Persist backbone, tokenizer and classifier head, then mark *folder* as last used."""
    os.makedirs(folder, exist_ok=True)
    # HuggingFace-native serialization for the backbone and tokenizer.
    model.base.save_pretrained(folder)
    tokenizer.save_pretrained(folder)
    # The custom classifier head is saved separately as a plain state dict.
    head_path = Path(folder) / "classifier_head.pt"
    torch.save(model.classifier.state_dict(), str(head_path))
    save_last_model(str(folder))
158
 
159
  def load_model(folder):
@@ -172,6 +170,7 @@ def load_model(folder):
172
  # ==============================================================
173
  def jalankan_training(
174
  df,
 
175
  model_name="bert-base-multilingual-cased",
176
  epochs=3,
177
  batch_size=8,
@@ -183,6 +182,12 @@ def jalankan_training(
183
  freeze_layers=6,
184
  device=None
185
  ):
 
 
 
 
 
 
186
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
187
  tokenizer = AutoTokenizer.from_pretrained(model_name)
188
 
@@ -208,6 +213,7 @@ def jalankan_training(
208
  model = ModelEmosi(model_name)
209
  model.to(device)
210
 
 
211
  for name, param in model.base.named_parameters():
212
  if name.startswith("embeddings."):
213
  param.requires_grad = False
@@ -242,10 +248,17 @@ def jalankan_training(
242
  history = {"train_loss": [], "val_loss": []}
243
  save_path = str(get_model_path(model_name))
244
 
 
 
245
  for ep in range(1, epochs+1):
 
 
 
 
246
  model.train()
247
  total_train_loss = 0
248
 
 
249
  for input_ids, mask, labels in train_loader:
250
  input_ids = input_ids.to(device)
251
  mask = mask.to(device)
@@ -264,6 +277,7 @@ def jalankan_training(
264
  avg_train_loss = total_train_loss / len(train_loader.dataset)
265
  history["train_loss"].append(avg_train_loss)
266
 
 
267
  model.eval()
268
  total_val_loss = 0
269
  with torch.no_grad():
@@ -278,28 +292,32 @@ def jalankan_training(
278
  avg_val_loss = total_val_loss / len(val_loader.dataset)
279
  history["val_loss"].append(avg_val_loss)
280
 
281
- print(f"Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}")
 
282
 
283
  if avg_val_loss < best_val_loss:
284
  best_val_loss = avg_val_loss
285
  no_improve = 0
286
  save_model(model, tokenizer, save_path)
287
- print(f"Best model saved to {save_path}")
288
  else:
289
  no_improve += 1
290
- if no_improve >= patience:
291
- print("Early stopping triggered.")
292
- break
 
 
 
 
 
293
 
294
- return model, tokenizer, history
295
 
296
  # ==============================================================
297
  # PREDICTION
298
  # ==============================================================
299
  def predict_satu(text, folder=None):
300
- # Update panggilan fungsi di sini
301
  folder = folder or load_last_model()
302
-
303
  if folder is None:
304
  return {"Error": "Belum ada model yang dilatih."}
305
 
@@ -320,9 +338,7 @@ def predict_satu(text, folder=None):
320
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
321
 
322
  def predict_batch(text_list, folder=None, batch_size=32):
323
- # Update panggilan fungsi di sini
324
  folder = folder or load_last_model()
325
-
326
  if folder is None:
327
  return []
328
 
@@ -338,7 +354,6 @@ def predict_batch(text_list, folder=None, batch_size=32):
338
  max_length=128,
339
  return_tensors="pt"
340
  )
341
-
342
  with torch.no_grad():
343
  out = model(encoded["input_ids"], encoded["attention_mask"])
344
  probs = torch.sigmoid(out).numpy()
@@ -372,10 +387,11 @@ def summarize_result(preds):
372
  }
373
 
374
  # ==============================================================
375
- # GRADIO UI
376
  # ==============================================================
377
  def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
378
- max_len, wd, warmup, pat, freeze):
 
379
 
380
  csv_path = read_file_upload(file_obj)
381
  df = pd.read_csv(csv_path, sep=sep)
@@ -383,8 +399,12 @@ def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
383
  df = clean_labels(df)
384
  df = clean_text(df)
385
 
386
- _, _, history = jalankan_training(
 
 
 
387
  df=df,
 
388
  model_name=model_name,
389
  epochs=int(epoch),
390
  batch_size=int(batch),
@@ -394,13 +414,17 @@ def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
394
  warmup_ratio=float(warmup),
395
  patience=int(pat),
396
  freeze_layers=int(freeze)
397
- )
398
-
399
- return {
400
- "status": "Training Selesai!",
401
- "history": history,
402
- "model_used": model_name
403
- }
 
 
 
 
404
 
405
def wrapper_predict_satu(text):
    """Gradio handler: delegate single-text emotion prediction to predict_satu."""
    return predict_satu(text)
@@ -408,15 +432,13 @@ def wrapper_predict_satu(text):
408
def wrapper_predict_dataset(file_obj, sep, batch_size):
    """Gradio handler: run batch emotion prediction over an uploaded CSV.

    Reads the uploaded file, normalizes label and text columns, predicts in
    batches, and returns the aggregated summary.
    """
    path = read_file_upload(file_obj)
    frame = pd.read_csv(path, sep=sep)
    frame = clean_text(clean_labels(frame))
    predictions = predict_batch(frame["text"].tolist(), batch_size=int(batch_size))
    return summarize_result(predictions)
417
 
418
  # ==============================================================
419
- # INTERFACE
420
  # ==============================================================
421
  with gr.Blocks() as app:
422
  gr.Markdown("## Emotion Classifier — IndoBERT / Multilingual")
@@ -447,13 +469,17 @@ with gr.Blocks() as app:
447
  in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
448
 
449
  btn_train = gr.Button("Mulai Training", variant="primary")
450
- out_train = gr.JSON(label="Training Log")
 
 
 
 
451
 
452
  btn_train.click(
453
  wrapper_training,
454
  inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
455
  in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
456
- outputs=out_train
457
  )
458
 
459
  with gr.Tab("Tes Satu Kalimat"):
 
1
  # ==============================================================
2
+ # KLASIFIKASI EMOSI
3
  # ==============================================================
4
  import os
5
  import math
 
7
  import pandas as pd
8
  import numpy as np
9
  import gradio as gr
 
10
  from pathlib import Path
11
  from torch import nn
12
+ from torch.utils.data import DataLoader, TensorDataset
13
  from sklearn.model_selection import train_test_split
14
  from transformers import (
15
  AutoTokenizer,
 
22
  # CONFIG
23
  # =========================================================
24
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
 
 
25
 
26
  FOLDER_MODEL = Path("saved_models")
27
  FOLDER_MODEL.mkdir(exist_ok=True)
28
 
29
  # ==============================================================
30
+ # File & Utils
31
  # ==============================================================
32
  def read_file_upload(file_obj):
33
  """Handle file upload dari Gradio."""
34
  if file_obj is None:
35
  raise ValueError("File belum diupload.")
36
 
 
37
  if isinstance(file_obj, str):
38
  return file_obj
39
 
 
40
  if hasattr(file_obj, "name"):
41
  return file_obj.name
42
 
 
43
  if hasattr(file_obj, "read"):
44
  temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
45
  with open(temp_path, "wb") as f:
 
48
 
49
  raise ValueError("Tipe file tidak didukung.")
50
 
 
51
  def save_last_model(name):
52
  (FOLDER_MODEL / "last_model_name.txt").write_text(name)
53
 
 
56
  if path_file.exists():
57
  return path_file.read_text().strip()
58
  return None
 
59
 
60
  def get_model_path(model_name):
61
  return FOLDER_MODEL / model_name.replace("/", "_")
 
64
  # Data Cleaning
65
  # ==============================================================
66
def clean_labels(df):
    """
    1. Fill missing label columns with 0.
    2. Force label dtype to numeric (float), never object/string.
    """
    for l in LIST_LABEL:
        if l not in df.columns:
            df[l] = 0

        # --- MAIN FIX HERE ---
        # Force numeric conversion: errors (text/empty cells) become NaN, then 0.
        df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)

    return df
80
 
81
  def clean_text(df, col="text"):
 
89
  # Model Architecture
90
  # =========================================================
91
  class ModelEmosi(nn.Module):
 
92
  def __init__(self, base_model_name, num_labels=8):
93
  super().__init__()
94
  self.config = AutoConfig.from_pretrained(base_model_name)
 
110
  return self.classifier(x)
111
 
112
  # ==============================================================
113
+ # Tokenizer & Dataset
114
  # ==============================================================
115
  def tokenize_batch(texts, tokenizer, max_len=128):
116
  return tokenizer(
 
123
 
124
  def create_dataset(df, tokenizer, max_len=128):
125
  encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
126
+
127
+ # Karena sudah dibersihkan di clean_labels, ini aman
128
  labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
129
 
130
  return TensorDataset(
 
134
  )
135
 
136
  # ==============================================================
137
+ # Weights
138
  # ==============================================================
139
  def hitung_pos_weight(df):
 
140
  counts = df[LIST_LABEL].sum(axis=0)
141
  N = len(df)
142
  pw = []
 
145
  return torch.tensor(pw, dtype=torch.float)
146
 
147
  # ==============================================================
148
+ # Save & Load Logic
149
  # ==============================================================
150
  def save_model(model, tokenizer, folder):
151
  os.makedirs(folder, exist_ok=True)
152
  model.base.save_pretrained(folder)
153
  tokenizer.save_pretrained(folder)
154
  torch.save(model.classifier.state_dict(), str(Path(folder) / "classifier_head.pt"))
 
 
155
  save_last_model(str(folder))
156
 
157
  def load_model(folder):
 
170
  # ==============================================================
171
  def jalankan_training(
172
  df,
173
+ progress_bar=None, # Tambahan untuk Gradio Progress
174
  model_name="bert-base-multilingual-cased",
175
  epochs=3,
176
  batch_size=8,
 
182
  freeze_layers=6,
183
  device=None
184
  ):
185
+ """
186
+ Fungsi ini diubah menjadi Generator (yield) agar bisa streaming log ke UI.
187
+ """
188
+ # 1. Yield pesan awal
189
+ yield "Mempersiapkan dataset dan tokenizer...", None
190
+
191
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
192
  tokenizer = AutoTokenizer.from_pretrained(model_name)
193
 
 
213
  model = ModelEmosi(model_name)
214
  model.to(device)
215
 
216
+ # Freeze layers logic
217
  for name, param in model.base.named_parameters():
218
  if name.startswith("embeddings."):
219
  param.requires_grad = False
 
248
  history = {"train_loss": [], "val_loss": []}
249
  save_path = str(get_model_path(model_name))
250
 
251
+ yield f"Mulai Training di device: {device}\nTotal Steps: {total_steps}", None
252
+
253
  for ep in range(1, epochs+1):
254
+ # Update progress bar Gradio (jika ada)
255
+ if progress_bar:
256
+ progress_bar(float(ep)/epochs, desc=f"Epoch {ep}/{epochs}")
257
+
258
  model.train()
259
  total_train_loss = 0
260
 
261
+ # Loop batch
262
  for input_ids, mask, labels in train_loader:
263
  input_ids = input_ids.to(device)
264
  mask = mask.to(device)
 
277
  avg_train_loss = total_train_loss / len(train_loader.dataset)
278
  history["train_loss"].append(avg_train_loss)
279
 
280
+ # Validation
281
  model.eval()
282
  total_val_loss = 0
283
  with torch.no_grad():
 
292
  avg_val_loss = total_val_loss / len(val_loader.dataset)
293
  history["val_loss"].append(avg_val_loss)
294
 
295
+ # LOGGING MESSAGE
296
+ log_msg = f"✅ Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}"
297
 
298
  if avg_val_loss < best_val_loss:
299
  best_val_loss = avg_val_loss
300
  no_improve = 0
301
  save_model(model, tokenizer, save_path)
302
+ log_msg += " --> (Model Saved 💾)"
303
  else:
304
  no_improve += 1
305
+ log_msg += f" --> (No Improve: {no_improve}/{patience})"
306
+
307
+ # Yield log per epoch
308
+ yield log_msg, None
309
+
310
+ if no_improve >= patience:
311
+ yield "⛔ Early stopping triggered.", None
312
+ break
313
 
314
+ yield "Training Selesai! 🎉", history
315
 
316
  # ==============================================================
317
  # PREDICTION
318
  # ==============================================================
319
  def predict_satu(text, folder=None):
 
320
  folder = folder or load_last_model()
 
321
  if folder is None:
322
  return {"Error": "Belum ada model yang dilatih."}
323
 
 
338
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
339
 
340
  def predict_batch(text_list, folder=None, batch_size=32):
 
341
  folder = folder or load_last_model()
 
342
  if folder is None:
343
  return []
344
 
 
354
  max_length=128,
355
  return_tensors="pt"
356
  )
 
357
  with torch.no_grad():
358
  out = model(encoded["input_ids"], encoded["attention_mask"])
359
  probs = torch.sigmoid(out).numpy()
 
387
  }
388
 
389
  # ==============================================================
390
+ # GRADIO UI
391
  # ==============================================================
392
  def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
393
+ max_len, wd, warmup, pat, freeze,
394
+ progress=gr.Progress()): # Tambahkan progress bar object
395
 
396
  csv_path = read_file_upload(file_obj)
397
  df = pd.read_csv(csv_path, sep=sep)
 
399
  df = clean_labels(df)
400
  df = clean_text(df)
401
 
402
+ accumulated_log = ""
403
+
404
+ # Memanggil generator jalankan_training
405
+ for log_msg, history_result in jalankan_training(
406
  df=df,
407
+ progress_bar=progress, # Kirim progress bar ke backend
408
  model_name=model_name,
409
  epochs=int(epoch),
410
  batch_size=int(batch),
 
414
  warmup_ratio=float(warmup),
415
  patience=int(pat),
416
  freeze_layers=int(freeze)
417
+ ):
418
+ # Update log text real-time
419
+ accumulated_log += log_msg + "\n"
420
+
421
+ # Jika training selesai, history_result tidak None
422
+ if history_result is not None:
423
+ # Yield terakhir: log penuh + JSON history
424
+ yield accumulated_log, history_result
425
+ else:
426
+ # Yield proses: log berjalan + JSON kosong/null
427
+ yield accumulated_log, None
428
 
429
  def wrapper_predict_satu(text):
430
  return predict_satu(text)
 
432
  def wrapper_predict_dataset(file_obj, sep, batch_size):
433
  csv_path = read_file_upload(file_obj)
434
  df = pd.read_csv(csv_path, sep=sep)
 
435
  df = clean_labels(df)
436
  df = clean_text(df)
 
437
  preds = predict_batch(df["text"].tolist(), batch_size=int(batch_size))
438
  return summarize_result(preds)
439
 
440
  # ==============================================================
441
+ # INTERFACE
442
  # ==============================================================
443
  with gr.Blocks() as app:
444
  gr.Markdown("## Emotion Classifier — IndoBERT / Multilingual")
 
469
  in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
470
 
471
  btn_train = gr.Button("Mulai Training", variant="primary")
472
+
473
+ # OUTPUT: DUA KOLOM (Log Teks & Hasil JSON)
474
+ with gr.Row():
475
+ out_log = gr.Textbox(label="Log Latihan (Real-time)", lines=10, interactive=False)
476
+ out_result = gr.JSON(label="Hasil Akhir (History)")
477
 
478
  btn_train.click(
479
  wrapper_training,
480
  inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
481
  in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
482
+ outputs=[out_log, out_result] # Output ke dua komponen
483
  )
484
 
485
  with gr.Tab("Tes Satu Kalimat"):