Darendra committed on
Commit
1a47d90
·
verified ·
1 Parent(s): 7db873d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -240
app.py CHANGED
@@ -1,7 +1,6 @@
1
  # ==============================================================
2
  # EMOTION CLASSIFIER
3
  # ==============================================================
4
-
5
  import os
6
  import math
7
  import torch
@@ -9,7 +8,6 @@ import pandas as pd
9
  import numpy as np
10
  import gradio as gr
11
  import matplotlib.pyplot as plt
12
-
13
  from pathlib import Path
14
  from torch import nn
15
  from torch.utils.data import Dataset, DataLoader, TensorDataset
@@ -24,97 +22,96 @@ from transformers import (
24
  # =========================================================
25
  # CONFIG
26
  # =========================================================
27
- LABELS = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
28
- LABEL2ID = {l:i for i,l in enumerate(LABELS)}
29
- ID2LABEL = {i:l for i,l in enumerate(LABELS)}
30
 
31
- SAVED_ROOT = Path("saved_models")
32
- SAVED_ROOT.mkdir(exist_ok=True)
33
 
34
  # ==============================================================
35
- # Simpan dan Muat Data
36
  # ==============================================================
37
- def read_uploaded_file(uploaded):
38
- if uploaded is None:
39
- raise ValueError("No file provided")
40
-
41
- if isinstance(uploaded, str):
42
- return uploaded
43
-
44
- if hasattr(uploaded, "name"):
45
- return uploaded.name
46
-
47
- if hasattr(uploaded, "read"):
48
- tmp = Path("/tmp") / f"uploaded_{np.random.randint(1e9)}.csv"
49
- with open(tmp, "wb") as f:
50
- f.write(uploaded.read())
51
- return str(tmp)
52
-
53
- raise ValueError("Unsupported uploaded file type")
54
-
55
-
56
- def save_last_model_name(name):
57
- (SAVED_ROOT / "last_model.txt").write_text(name)
58
-
59
-
60
- def load_last_model_name():
61
- p = SAVED_ROOT / "last_model.txt"
62
- if p.exists():
63
- return p.read_text().strip()
 
 
 
64
  return None
 
65
 
66
-
67
- def model_folder(model_name):
68
- return SAVED_ROOT / model_name.replace("/", "_")
69
-
70
 
71
  # ==============================================================
72
- # Pembersihan Data
73
  # ==============================================================
74
  def clean_labels(df):
75
- for l in LABELS:
 
76
  if l not in df.columns:
77
  df[l] = 0
78
  return df
79
 
80
-
81
  def clean_text(df, col="text"):
 
82
  if col not in df.columns:
83
- raise KeyError(f"CSV must contain a column '{col}'")
84
  df[col] = df[col].astype(str).str.replace("\n", " ").str.strip()
85
  return df
86
 
87
-
88
  # =========================================================
89
- # Model AI
90
  # =========================================================
91
- class EmotionModel(nn.Module):
92
- """Consistent backbone + dropout + classifier."""
93
  def __init__(self, base_model_name, num_labels=8):
94
  super().__init__()
95
  self.config = AutoConfig.from_pretrained(base_model_name)
96
  self.base = AutoModel.from_pretrained(base_model_name)
97
- self.drop = nn.Dropout(0.3)
98
- self.clf = nn.Linear(self.config.hidden_size, num_labels)
99
 
100
- def forward(self, ids, mask):
101
  out = self.base(
102
- input_ids=ids,
103
- attention_mask=mask
104
  )
105
-
106
- # Prefer pooler_output if exists
107
  if hasattr(out, "pooler_output") and out.pooler_output is not None:
108
  x = out.pooler_output
109
  else:
110
  x = out.last_hidden_state[:, 0, :]
111
-
112
- x = self.drop(x)
113
- return self.clf(x)
114
-
115
 
116
  # ==============================================================
117
- # Tokenisasi Dataset
118
  # ==============================================================
119
  def tokenize_batch(texts, tokenizer, max_len=128):
120
  return tokenizer(
@@ -125,63 +122,55 @@ def tokenize_batch(texts, tokenizer, max_len=128):
125
  return_tensors="pt"
126
  )
127
 
128
-
129
- def build_tensor_dataset(df, tokenizer, max_len=128):
130
- enc = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
131
- labels = torch.tensor(df[LABELS].values, dtype=torch.float)
132
  return TensorDataset(
133
- enc["input_ids"],
134
- enc["attention_mask"],
135
  labels
136
  )
137
 
138
-
139
  # ==============================================================
140
- # Bobot
141
  # ==============================================================
142
- def compute_pos_weight(df):
143
- counts = df[LABELS].sum(axis=0)
 
144
  N = len(df)
145
  pw = []
146
  for c in counts:
147
  pw.append((N - c) / c if c > 0 else 1.0)
148
  return torch.tensor(pw, dtype=torch.float)
149
 
150
-
151
  # ==============================================================
152
- # Simpan dan Muat Model
153
  # ==============================================================
154
  def save_model(model, tokenizer, folder):
155
  os.makedirs(folder, exist_ok=True)
156
-
157
- # Save backbone HF style
158
  model.base.save_pretrained(folder)
159
  tokenizer.save_pretrained(folder)
160
-
161
- # Save classifier head
162
- torch.save(model.clf.state_dict(), str(Path(folder) / "classifier.pt"))
163
-
164
- # Save last-used name
165
- save_last_model_name(str(folder))
166
-
167
 
168
  def load_model(folder):
169
  folder = str(folder)
170
  config = AutoConfig.from_pretrained(folder)
171
  tokenizer = AutoTokenizer.from_pretrained(folder)
172
-
173
- model = EmotionModel(folder)
174
- state = torch.load(f"{folder}/classifier.pt", map_location="cpu")
175
- model.clf.load_state_dict(state)
176
  model.eval()
177
-
178
  return model, tokenizer, config
179
 
180
-
181
  # ==============================================================
182
- # Pelatihan
183
  # ==============================================================
184
- def train_model(
185
  df,
186
  model_name="bert-base-multilingual-cased",
187
  epochs=3,
@@ -195,32 +184,30 @@ def train_model(
195
  device=None
196
  ):
197
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
198
-
199
  tokenizer = AutoTokenizer.from_pretrained(model_name)
200
-
201
  df = df.reset_index(drop=True)
202
- dataset = build_tensor_dataset(df, tokenizer, max_len)
203
-
204
- idx = list(range(len(dataset)))
205
  train_idx, val_idx = train_test_split(idx, test_size=0.15, random_state=42)
206
-
207
- def subset(ds, idxs):
208
  return TensorDataset(
209
- torch.stack([ds[i][0] for i in idxs]),
210
- torch.stack([ds[i][1] for i in idxs]),
211
- torch.stack([ds[i][2] for i in idxs]),
212
  )
213
-
214
- train_ds = subset(dataset, train_idx)
215
- val_ds = subset(dataset, val_idx)
216
-
217
  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
218
  val_loader = DataLoader(val_ds, batch_size=batch_size)
219
-
220
- model = EmotionModel(model_name)
221
  model.to(device)
222
-
223
- # Freeze lower layers
224
  for name, param in model.base.named_parameters():
225
  if name.startswith("embeddings."):
226
  param.requires_grad = False
@@ -231,91 +218,93 @@ def train_model(
231
  param.requires_grad = False
232
  except:
233
  pass
234
-
235
- pos_weight = compute_pos_weight(df).to(device)
236
  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
237
-
238
  optimizer = torch.optim.AdamW(
239
  filter(lambda p: p.requires_grad, model.parameters()),
240
  lr=lr,
241
  weight_decay=weight_decay
242
  )
243
-
244
  total_steps = len(train_loader) * epochs
245
  warmup_steps = int(warmup_ratio * total_steps)
246
-
247
  scheduler = get_linear_schedule_with_warmup(
248
  optimizer,
249
  num_warmup_steps=warmup_steps,
250
  num_training_steps=total_steps
251
  )
252
-
253
- best_val = float("inf")
254
  no_improve = 0
255
-
256
  history = {"train_loss": [], "val_loss": []}
257
-
258
- save_path = str(model_folder(model_name))
259
-
260
  for ep in range(1, epochs+1):
261
  model.train()
262
- t_loss = 0
263
-
264
- for input_ids, attn, labels in train_loader:
265
  input_ids = input_ids.to(device)
266
- attn = attn.to(device)
267
  labels = labels.to(device)
268
-
269
  optimizer.zero_grad()
270
- logits = model(input_ids, attn)
271
  loss = loss_fn(logits, labels)
 
272
  loss.backward()
273
  optimizer.step()
274
  scheduler.step()
275
-
276
- t_loss += loss.item() * input_ids.size(0)
277
-
278
- train_loss = t_loss / len(train_loader.dataset)
279
- history["train_loss"].append(train_loss)
280
-
281
- # Validation
282
  model.eval()
283
- v_loss = 0
284
  with torch.no_grad():
285
- for input_ids, attn, labels in val_loader:
286
  input_ids = input_ids.to(device)
287
- attn = attn.to(device)
288
  labels = labels.to(device)
289
- logits = model(input_ids, attn)
290
  loss = loss_fn(logits, labels)
291
- v_loss += loss.item() * input_ids.size(0)
292
-
293
- val_loss = v_loss / len(val_loader.dataset)
294
- history["val_loss"].append(val_loss)
295
-
296
- print(f"Epoch {ep} | Train={train_loss:.4f} | Val={val_loss:.4f}")
297
-
298
- if val_loss < best_val:
299
- best_val = val_loss
300
  no_improve = 0
301
  save_model(model, tokenizer, save_path)
302
- print(f"Saved best model to {save_path}")
303
  else:
304
  no_improve += 1
305
  if no_improve >= patience:
306
- print("Early stopping.")
307
  break
308
-
309
  return model, tokenizer, history
310
 
311
-
312
  # ==============================================================
313
- # Uji
314
  # ==============================================================
315
- def predict_single(text, folder=None):
316
- folder = folder or load_last_model_name()
317
- model, tokenizer, cfg = load_model(folder)
318
-
 
 
 
 
 
319
  encoded = tokenizer(
320
  text,
321
  padding="max_length",
@@ -323,150 +312,169 @@ def predict_single(text, folder=None):
323
  max_length=128,
324
  return_tensors="pt"
325
  )
326
-
327
  with torch.no_grad():
328
  out = model(encoded["input_ids"], encoded["attention_mask"])
329
  probs = torch.sigmoid(out).numpy()[0]
 
 
330
 
331
- return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
332
-
333
-
334
- def predict_batch(texts, folder=None, batch_size=32):
335
- folder = folder or load_last_model_name()
336
- model, tokenizer, cfg = load_model(folder)
337
 
 
338
  preds = []
339
- for i in range(0, len(texts), batch_size):
340
- batch = texts[i:i+batch_size]
341
- enc = tokenizer(
 
342
  batch,
343
  padding="max_length",
344
  truncation=True,
345
  max_length=128,
346
  return_tensors="pt"
347
  )
348
-
349
  with torch.no_grad():
350
- out = model(enc["input_ids"], enc["attention_mask"])
351
  probs = torch.sigmoid(out).numpy()
352
-
353
  for p in probs:
354
- preds.append({LABELS[j]: float(p[j]) for j in range(len(LABELS))})
355
-
356
  return preds
357
 
 
 
 
358
 
359
- def summarize_preds(preds):
360
- avg = {l: 0.0 for l in LABELS}
361
  n = len(preds)
362
-
363
  for p in preds:
364
  for l,v in p.items():
365
  avg[l] += v
 
366
  for l in avg:
367
  avg[l] /= n
368
-
369
  top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
370
- top3 = [{"label":l, "score":float(s)} for l,s in top3]
371
-
372
- return {"n":n, "avg_distribution":avg, "top3":top3}
373
-
 
 
 
374
 
375
  # ==============================================================
376
- # GRADIO GUI
377
  # ==============================================================
378
- def wrapper_train(file_obj, sep, model_name, epochs, batch_size, lr,
379
- max_len, weight_decay, warmup_ratio, patience, freeze_layers):
380
- csv = read_uploaded_file(file_obj)
381
- df = pd.read_csv(csv, sep=sep)
 
 
382
  df = clean_labels(df)
383
  df = clean_text(df)
384
-
385
- _, _, history = train_model(
386
  df=df,
387
  model_name=model_name,
388
- epochs=int(epochs),
389
- batch_size=int(batch_size),
390
  lr=float(lr),
391
  max_len=int(max_len),
392
- weight_decay=float(weight_decay),
393
- warmup_ratio=float(warmup_ratio),
394
- patience=int(patience),
395
- freeze_layers=int(freeze_layers)
396
  )
397
-
398
  return {
399
- "message": "Training finished.",
400
  "history": history,
401
- "model_name": model_name
402
  }
403
 
 
 
404
 
405
- def wrapper_single(text):
406
- return predict_single(text)
407
-
408
-
409
- def wrapper_dataset(file_obj, sep, max_len, batch_size):
410
- csv = read_uploaded_file(file_obj)
411
- df = pd.read_csv(csv, sep=sep)
412
  df = clean_labels(df)
413
  df = clean_text(df)
414
-
415
  preds = predict_batch(df["text"].tolist(), batch_size=int(batch_size))
416
- return summarize_preds(preds)
417
-
418
 
419
  # ==============================================================
420
- # Menjalankan GRADIO
421
  # ==============================================================
422
  with gr.Blocks() as app:
423
- gr.Markdown("## Emotion Classifier — Dava (Final Version)")
424
-
425
- with gr.Tab("Training"):
426
- file_in = gr.File(label="Upload Training CSV")
427
- sep_in = gr.Textbox(label="Delimiter", value=",")
428
- model_name_in = gr.Dropdown(
429
- label="Backbone Model",
 
 
430
  choices=["bert-base-multilingual-cased", "indobert-base-p1"],
431
  value="bert-base-multilingual-cased"
432
  )
433
- epochs_in = gr.Number(label="Epochs", value=3)
434
- bs_in = gr.Number(label="Batch Size", value=8)
435
- lr_in = gr.Number(label="Learning Rate", value=2e-5)
436
- maxlen_in = gr.Number(label="Max Length", value=128)
437
- wd_in = gr.Number(label="Weight Decay", value=0.01)
438
- warmup_in = gr.Number(label="Warmup Ratio", value=0.1)
439
- patience_in = gr.Number(label="Patience", value=2)
440
- freeze_in = gr.Number(label="Freeze Layers", value=6)
441
-
442
- btn_train = gr.Button("Start Training")
443
- out_train = gr.JSON(label="Train Result")
444
-
 
 
 
 
 
 
445
  btn_train.click(
446
- wrapper_train,
447
- inputs=[file_in, sep_in, model_name_in, epochs_in, bs_in,
448
- lr_in, maxlen_in, wd_in, warmup_in, patience_in, freeze_in],
449
  outputs=out_train
450
  )
451
 
452
- with gr.Tab("Single Prediction"):
453
- text_in = gr.Textbox(label="Text")
454
- btn_single = gr.Button("Predict")
455
- out_single = gr.JSON(label="Emotion Scores")
456
- btn_single.click(wrapper_single, inputs=[text_in], outputs=out_single)
457
-
458
- with gr.Tab("Dataset Prediction"):
459
- file_test = gr.File(label="Upload CSV")
460
- sep_test = gr.Textbox(label="Delimiter", value=",")
461
- maxlen_test = gr.Number(label="Max Length", value=128)
462
- bs_test = gr.Number(label="Batch Size", value=32)
463
-
 
464
  btn_test = gr.Button("Run Prediction")
465
- out_test = gr.JSON(label="Summary Result")
466
-
467
  btn_test.click(
468
- wrapper_dataset,
469
- inputs=[file_test, sep_test, maxlen_test, bs_test],
470
  outputs=out_test
471
  )
472
 
 
1
  # ==============================================================
2
  # EMOTION CLASSIFIER
3
  # ==============================================================
 
4
  import os
5
  import math
6
  import torch
 
8
  import numpy as np
9
  import gradio as gr
10
  import matplotlib.pyplot as plt
 
11
  from pathlib import Path
12
  from torch import nn
13
  from torch.utils.data import Dataset, DataLoader, TensorDataset
 
22
  # =========================================================
23
  # CONFIG
24
  # =========================================================
25
+ LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
26
+ LABEL2ID = {l:i for i,l in enumerate(LIST_LABEL)}
27
+ ID2LABEL = {i:l for i,l in enumerate(LIST_LABEL)}
28
 
29
+ FOLDER_MODEL = Path("saved_models")
30
+ FOLDER_MODEL.mkdir(exist_ok=True)
31
 
32
  # ==============================================================
33
+ # File & Utils
34
  # ==============================================================
35
+ def read_file_upload(file_obj):
36
+ """Handle file upload dari Gradio."""
37
+ if file_obj is None:
38
+ raise ValueError("File belum diupload.")
39
+
40
+ # Kalau inputnya string path
41
+ if isinstance(file_obj, str):
42
+ return file_obj
43
+
44
+ # Kalau inputnya object file (Gradio baru)
45
+ if hasattr(file_obj, "name"):
46
+ return file_obj.name
47
+
48
+ # Kalau binary stream
49
+ if hasattr(file_obj, "read"):
50
+ temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
51
+ with open(temp_path, "wb") as f:
52
+ f.write(file_obj.read())
53
+ return str(temp_path)
54
+
55
+ raise ValueError("Tipe file tidak didukung.")
56
+
57
+ # --- FUNGSI YANG DIUBAH (LEBIH SINGKAT) ---
58
+ def save_last_model(name):
59
+ (FOLDER_MODEL / "last_model_name.txt").write_text(name)
60
+
61
+ def load_last_model():
62
+ path_file = FOLDER_MODEL / "last_model_name.txt"
63
+ if path_file.exists():
64
+ return path_file.read_text().strip()
65
  return None
66
+ # ------------------------------------------
67
 
68
+ def get_model_path(model_name):
69
+ return FOLDER_MODEL / model_name.replace("/", "_")
 
 
70
 
71
  # ==============================================================
72
+ # Data Cleaning
73
  # ==============================================================
74
  def clean_labels(df):
75
+ """Isi label kosong dengan 0."""
76
+ for l in LIST_LABEL:
77
  if l not in df.columns:
78
  df[l] = 0
79
  return df
80
 
 
81
  def clean_text(df, col="text"):
82
+ """Hapus enter dan spasi berlebih."""
83
  if col not in df.columns:
84
+ raise KeyError(f"CSV harus punya kolom '{col}'")
85
  df[col] = df[col].astype(str).str.replace("\n", " ").str.strip()
86
  return df
87
 
 
88
  # =========================================================
89
+ # Model Architecture
90
  # =========================================================
91
+ class ModelEmosi(nn.Module):
92
+ """Backbone BERT + Classifier Head."""
93
  def __init__(self, base_model_name, num_labels=8):
94
  super().__init__()
95
  self.config = AutoConfig.from_pretrained(base_model_name)
96
  self.base = AutoModel.from_pretrained(base_model_name)
97
+ self.dropout = nn.Dropout(0.3)
98
+ self.classifier = nn.Linear(self.config.hidden_size, num_labels)
99
 
100
+ def forward(self, input_ids, attention_mask):
101
  out = self.base(
102
+ input_ids=input_ids,
103
+ attention_mask=attention_mask
104
  )
 
 
105
  if hasattr(out, "pooler_output") and out.pooler_output is not None:
106
  x = out.pooler_output
107
  else:
108
  x = out.last_hidden_state[:, 0, :]
109
+
110
+ x = self.dropout(x)
111
+ return self.classifier(x)
 
112
 
113
  # ==============================================================
114
+ # Tokenizer & Dataset
115
  # ==============================================================
116
  def tokenize_batch(texts, tokenizer, max_len=128):
117
  return tokenizer(
 
122
  return_tensors="pt"
123
  )
124
 
125
+ def create_dataset(df, tokenizer, max_len=128):
126
+ encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
127
+ labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
128
+
129
  return TensorDataset(
130
+ encodings["input_ids"],
131
+ encodings["attention_mask"],
132
  labels
133
  )
134
 
 
135
  # ==============================================================
136
+ # Weights
137
  # ==============================================================
138
+ def hitung_pos_weight(df):
139
+ """Biar adil kalau datanya imbalanced."""
140
+ counts = df[LIST_LABEL].sum(axis=0)
141
  N = len(df)
142
  pw = []
143
  for c in counts:
144
  pw.append((N - c) / c if c > 0 else 1.0)
145
  return torch.tensor(pw, dtype=torch.float)
146
 
 
147
  # ==============================================================
148
+ # Save & Load Logic
149
  # ==============================================================
150
  def save_model(model, tokenizer, folder):
151
  os.makedirs(folder, exist_ok=True)
 
 
152
  model.base.save_pretrained(folder)
153
  tokenizer.save_pretrained(folder)
154
+ torch.save(model.classifier.state_dict(), str(Path(folder) / "classifier_head.pt"))
155
+
156
+ # Update panggilan fungsi di sini
157
+ save_last_model(str(folder))
 
 
 
158
 
159
  def load_model(folder):
160
  folder = str(folder)
161
  config = AutoConfig.from_pretrained(folder)
162
  tokenizer = AutoTokenizer.from_pretrained(folder)
163
+ model = ModelEmosi(folder)
164
+
165
+ state = torch.load(f"{folder}/classifier_head.pt", map_location="cpu")
166
+ model.classifier.load_state_dict(state)
167
  model.eval()
 
168
  return model, tokenizer, config
169
 
 
170
  # ==============================================================
171
+ # TRAINING
172
  # ==============================================================
173
+ def jalankan_training(
174
  df,
175
  model_name="bert-base-multilingual-cased",
176
  epochs=3,
 
184
  device=None
185
  ):
186
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
 
187
  tokenizer = AutoTokenizer.from_pretrained(model_name)
188
+
189
  df = df.reset_index(drop=True)
190
+ full_dataset = create_dataset(df, tokenizer, max_len)
191
+
192
+ idx = list(range(len(full_dataset)))
193
  train_idx, val_idx = train_test_split(idx, test_size=0.15, random_state=42)
194
+
195
+ def get_subset(ds, indices):
196
  return TensorDataset(
197
+ torch.stack([ds[i][0] for i in indices]),
198
+ torch.stack([ds[i][1] for i in indices]),
199
+ torch.stack([ds[i][2] for i in indices]),
200
  )
201
+
202
+ train_ds = get_subset(full_dataset, train_idx)
203
+ val_ds = get_subset(full_dataset, val_idx)
204
+
205
  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
206
  val_loader = DataLoader(val_ds, batch_size=batch_size)
207
+
208
+ model = ModelEmosi(model_name)
209
  model.to(device)
210
+
 
211
  for name, param in model.base.named_parameters():
212
  if name.startswith("embeddings."):
213
  param.requires_grad = False
 
218
  param.requires_grad = False
219
  except:
220
  pass
221
+
222
+ pos_weight = hitung_pos_weight(df).to(device)
223
  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
224
+
225
  optimizer = torch.optim.AdamW(
226
  filter(lambda p: p.requires_grad, model.parameters()),
227
  lr=lr,
228
  weight_decay=weight_decay
229
  )
230
+
231
  total_steps = len(train_loader) * epochs
232
  warmup_steps = int(warmup_ratio * total_steps)
233
+
234
  scheduler = get_linear_schedule_with_warmup(
235
  optimizer,
236
  num_warmup_steps=warmup_steps,
237
  num_training_steps=total_steps
238
  )
239
+
240
+ best_val_loss = float("inf")
241
  no_improve = 0
 
242
  history = {"train_loss": [], "val_loss": []}
243
+ save_path = str(get_model_path(model_name))
244
+
 
245
  for ep in range(1, epochs+1):
246
  model.train()
247
+ total_train_loss = 0
248
+
249
+ for input_ids, mask, labels in train_loader:
250
  input_ids = input_ids.to(device)
251
+ mask = mask.to(device)
252
  labels = labels.to(device)
253
+
254
  optimizer.zero_grad()
255
+ logits = model(input_ids, mask)
256
  loss = loss_fn(logits, labels)
257
+
258
  loss.backward()
259
  optimizer.step()
260
  scheduler.step()
261
+
262
+ total_train_loss += loss.item() * input_ids.size(0)
263
+
264
+ avg_train_loss = total_train_loss / len(train_loader.dataset)
265
+ history["train_loss"].append(avg_train_loss)
266
+
 
267
  model.eval()
268
+ total_val_loss = 0
269
  with torch.no_grad():
270
+ for input_ids, mask, labels in val_loader:
271
  input_ids = input_ids.to(device)
272
+ mask = mask.to(device)
273
  labels = labels.to(device)
274
+ logits = model(input_ids, mask)
275
  loss = loss_fn(logits, labels)
276
+ total_val_loss += loss.item() * input_ids.size(0)
277
+
278
+ avg_val_loss = total_val_loss / len(val_loader.dataset)
279
+ history["val_loss"].append(avg_val_loss)
280
+
281
+ print(f"Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}")
282
+
283
+ if avg_val_loss < best_val_loss:
284
+ best_val_loss = avg_val_loss
285
  no_improve = 0
286
  save_model(model, tokenizer, save_path)
287
+ print(f"Best model saved to {save_path}")
288
  else:
289
  no_improve += 1
290
  if no_improve >= patience:
291
+ print("Early stopping triggered.")
292
  break
293
+
294
  return model, tokenizer, history
295
 
 
296
  # ==============================================================
297
+ # PREDICTION
298
  # ==============================================================
299
+ def predict_satu(text, folder=None):
300
+ # Update panggilan fungsi di sini
301
+ folder = folder or load_last_model()
302
+
303
+ if folder is None:
304
+ return {"Error": "Belum ada model yang dilatih."}
305
+
306
+ model, tokenizer, _ = load_model(folder)
307
+
308
  encoded = tokenizer(
309
  text,
310
  padding="max_length",
 
312
  max_length=128,
313
  return_tensors="pt"
314
  )
315
+
316
  with torch.no_grad():
317
  out = model(encoded["input_ids"], encoded["attention_mask"])
318
  probs = torch.sigmoid(out).numpy()[0]
319
+
320
+ return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
321
 
322
+ def predict_batch(text_list, folder=None, batch_size=32):
323
+ # Update panggilan fungsi di sini
324
+ folder = folder or load_last_model()
325
+
326
+ if folder is None:
327
+ return []
328
 
329
+ model, tokenizer, _ = load_model(folder)
330
  preds = []
331
+
332
+ for i in range(0, len(text_list), batch_size):
333
+ batch = text_list[i:i+batch_size]
334
+ encoded = tokenizer(
335
  batch,
336
  padding="max_length",
337
  truncation=True,
338
  max_length=128,
339
  return_tensors="pt"
340
  )
341
+
342
  with torch.no_grad():
343
+ out = model(encoded["input_ids"], encoded["attention_mask"])
344
  probs = torch.sigmoid(out).numpy()
345
+
346
  for p in probs:
347
+ preds.append({LIST_LABEL[j]: float(p[j]) for j in range(len(LIST_LABEL))})
348
+
349
  return preds
350
 
351
+ def summarize_result(preds):
352
+ if not preds:
353
+ return {"Info": "Tidak ada hasil."}
354
 
355
+ avg = {l: 0.0 for l in LIST_LABEL}
 
356
  n = len(preds)
357
+
358
  for p in preds:
359
  for l,v in p.items():
360
  avg[l] += v
361
+
362
  for l in avg:
363
  avg[l] /= n
364
+
365
  top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
366
+ top3_fmt = [{"label":l, "score":float(s)} for l,s in top3]
367
+
368
+ return {
369
+ "jumlah_data": n,
370
+ "distribusi_rata2": avg,
371
+ "top_3": top3_fmt
372
+ }
373
 
374
  # ==============================================================
375
+ # GRADIO UI
376
  # ==============================================================
377
+ def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
378
+ max_len, wd, warmup, pat, freeze):
379
+
380
+ csv_path = read_file_upload(file_obj)
381
+ df = pd.read_csv(csv_path, sep=sep)
382
+
383
  df = clean_labels(df)
384
  df = clean_text(df)
385
+
386
+ _, _, history = jalankan_training(
387
  df=df,
388
  model_name=model_name,
389
+ epochs=int(epoch),
390
+ batch_size=int(batch),
391
  lr=float(lr),
392
  max_len=int(max_len),
393
+ weight_decay=float(wd),
394
+ warmup_ratio=float(warmup),
395
+ patience=int(pat),
396
+ freeze_layers=int(freeze)
397
  )
398
+
399
  return {
400
+ "status": "Training Selesai!",
401
  "history": history,
402
+ "model_used": model_name
403
  }
404
 
405
+ def wrapper_predict_satu(text):
406
+ return predict_satu(text)
407
 
408
+ def wrapper_predict_dataset(file_obj, sep, batch_size):
409
+ csv_path = read_file_upload(file_obj)
410
+ df = pd.read_csv(csv_path, sep=sep)
411
+
 
 
 
412
  df = clean_labels(df)
413
  df = clean_text(df)
414
+
415
  preds = predict_batch(df["text"].tolist(), batch_size=int(batch_size))
416
+ return summarize_result(preds)
 
417
 
418
  # ==============================================================
419
+ # INTERFACE
420
  # ==============================================================
421
  with gr.Blocks() as app:
422
+ gr.Markdown("## Emotion Classifier — IndoBERT / Multilingual")
423
+
424
+ with gr.Tab("Menu Training"):
425
+ gr.Markdown("Upload dataset CSV untuk fine-tuning model.")
426
+ in_file = gr.File(label="Upload File CSV")
427
+ in_sep = gr.Textbox(label="Delimiter (Pemisah)", value=";")
428
+
429
+ in_model = gr.Dropdown(
430
+ label="Base Model",
431
  choices=["bert-base-multilingual-cased", "indobert-base-p1"],
432
  value="bert-base-multilingual-cased"
433
  )
434
+
435
+ with gr.Row():
436
+ in_epoch = gr.Number(label="Epochs", value=3)
437
+ in_batch = gr.Number(label="Batch Size", value=8)
438
+ in_lr = gr.Number(label="Learning Rate", value=2e-5)
439
+
440
+ with gr.Row():
441
+ in_len = gr.Number(label="Max Length", value=128)
442
+ in_pat = gr.Number(label="Patience (Early Stop)", value=2)
443
+ in_freeze = gr.Number(label="Freeze Layers", value=6)
444
+
445
+ # Hidden advanced params
446
+ in_wd = gr.Number(label="Weight Decay", value=0.01, visible=False)
447
+ in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
448
+
449
+ btn_train = gr.Button("Mulai Training", variant="primary")
450
+ out_train = gr.JSON(label="Training Log")
451
+
452
  btn_train.click(
453
+ wrapper_training,
454
+ inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
455
+ in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
456
  outputs=out_train
457
  )
458
 
459
+ with gr.Tab("Tes Satu Kalimat"):
460
+ in_text = gr.Textbox(label="Input Teks", placeholder="Contoh: Aku senang sekali hari ini...")
461
+ btn_satu = gr.Button("Prediksi")
462
+ out_satu = gr.Label(label="Confidence Score")
463
+
464
+ btn_satu.click(wrapper_predict_satu, inputs=[in_text], outputs=out_satu)
465
+
466
+ with gr.Tab("Tes Satu File"):
467
+ gr.Markdown("Upload file CSV baru untuk prediksi massal.")
468
+ in_file_test = gr.File(label="Upload CSV")
469
+ in_sep_test = gr.Textbox(label="Delimiter", value=";")
470
+ in_bs_test = gr.Number(label="Batch Size", value=32)
471
+
472
  btn_test = gr.Button("Run Prediction")
473
+ out_test = gr.JSON(label="Summary")
474
+
475
  btn_test.click(
476
+ wrapper_predict_dataset,
477
+ inputs=[in_file_test, in_sep_test, in_bs_test],
478
  outputs=out_test
479
  )
480