Darendra committed on
Commit
d0cc31c
·
verified ·
1 Parent(s): 062b593

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +312 -272
app.py CHANGED
@@ -1,168 +1,186 @@
 
 
 
 
1
  import os
2
- import io
3
  import math
4
  import torch
5
  import pandas as pd
6
  import numpy as np
7
- import matplotlib.pyplot as plt
8
  import gradio as gr
 
 
9
  from pathlib import Path
10
  from torch import nn
11
  from torch.utils.data import Dataset, DataLoader, TensorDataset
12
  from sklearn.model_selection import train_test_split
13
- from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
14
-
15
- # ---------------------------
16
- # Konfigurasi & Label
17
- # ---------------------------
 
 
 
 
 
18
# Emotion classes predicted by the model, in output-logit order.
LABELS = [
    'anger', 'anticipation', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
]
# Lookup tables between label names and their positions in LABELS.
LABEL2ID = {label: index for index, label in enumerate(LABELS)}
ID2LABEL = dict(enumerate(LABELS))

# Directory that receives trained model artifacts; created on import.
SAVED_ROOT = Path("saved_models")
SAVED_ROOT.mkdir(exist_ok=True)
23
 
24
- # ---------------------------
25
- # Utility I/O small helpers
26
- # ---------------------------
27
def read_uploaded_file(uploaded):
    """Resolve an upload (path, file wrapper or stream) to a path string.

    Args:
        uploaded: A path string, an ``os.PathLike``, an object exposing a
            ``name`` attribute, or a readable stream.

    Returns:
        A filesystem path as ``str``. Streams are first copied to a new
        temporary ``.csv`` file, which is not deleted automatically.

    Raises:
        ValueError: If ``uploaded`` is ``None`` or of an unsupported type.
    """
    import tempfile  # local import: only the stream fallback needs it

    if uploaded is None:
        raise ValueError("No file provided")

    if isinstance(uploaded, str):
        return uploaded

    # pathlib.Path also exposes ``.name``, but that is only the basename,
    # so path-like objects must be handled before the generic check below.
    if isinstance(uploaded, os.PathLike):
        return os.fspath(uploaded)

    if hasattr(uploaded, "name"):
        return uploaded.name

    if hasattr(uploaded, "read"):
        payload = uploaded.read()
        if isinstance(payload, str):
            # Text streams yield str; the temp file is opened in binary mode.
            payload = payload.encode("utf-8")
        with tempfile.NamedTemporaryFile(
            mode="wb", prefix="uploaded_", suffix=".csv", delete=False
        ) as handle:
            handle.write(payload)
        return handle.name

    raise ValueError("Unsupported uploaded file type")
44
 
45
def save_last_model_name(model_name: str):
    """Record ``model_name`` in ``last_model.txt`` under SAVED_ROOT."""
    marker = SAVED_ROOT / "last_model.txt"
    marker.write_text(model_name)
47
 
48
def load_last_model_name() -> "str | None":
    """Return the name stored in ``last_model.txt``, or ``None`` if absent.

    The stored text is returned with surrounding whitespace stripped.
    """
    marker = SAVED_ROOT / "last_model.txt"
    if marker.exists():
        return marker.read_text().strip()
    return None
53
 
54
def model_folder(model_name: str) -> Path:
    """Return the directory under SAVED_ROOT used for ``model_name``."""
    # "/" in the name is replaced so the result is a single path component.
    safe_name = model_name.replace("/", "_")
    return SAVED_ROOT / safe_name
56
 
57
- # ---------------------------
58
- # Data loading & cleaning
59
- # ---------------------------
60
def load_dataset(path_or_file, sep=","):
    """Read an uploaded CSV into a DataFrame.

    ``path_or_file`` is resolved through ``read_uploaded_file`` and parsed
    with ``sep`` as the delimiter.
    """
    return pd.read_csv(read_uploaded_file(path_or_file), sep=sep)
64
 
 
 
 
65
def clean_labels(df):
    """Ensure every column named in LABELS exists in ``df``.

    Missing label columns are added in place and filled with 0; existing
    columns are left untouched. The same DataFrame object is returned.
    """
    absent = [label for label in LABELS if label not in df.columns]
    for label in absent:
        df[label] = 0
    return df
71
 
72
def clean_text(df, text_col="text"):
    """Normalize the text column of ``df`` in place and return ``df``.

    Newlines become spaces and surrounding whitespace is stripped. Missing
    values become empty strings.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    if text_col not in df.columns:
        raise KeyError(f"CSV must contain column named '{text_col}' (found columns: {df.columns.tolist()})")
    raw = df[text_col]
    # Blank out missing values first: astype(str) alone would turn NaN
    # into the literal text "nan".
    cleaned = raw.astype(object).where(raw.notna(), "").astype(str)
    df[text_col] = cleaned.str.replace("\n", " ", regex=False).str.strip()
    return df
77
 
78
- # ---------------------------
79
- # Model class (BERT + head)
80
- # ---------------------------
81
class EmotionClassifier(nn.Module):
    """Pretrained encoder with dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits.
    """

    def __init__(self, model_name="bert-base-multilingual-cased", num_labels=8):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return raw logits computed from the first-token hidden state."""
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Forward token_type_ids only when supplied: some backbones'
        # forward() has no such parameter and would reject the keyword.
        if token_type_ids is not None:
            inputs["token_type_ids"] = token_type_ids
        outputs = self.bert(**inputs)
        cls = outputs.last_hidden_state[:, 0, :]
        return self.classifier(self.drop(cls))
94
-
95
- # ---------------------------
96
- # Tokenisasi dan dataset (optimized batch)
97
- # ---------------------------
98
def tokenize_dataset_batch(texts, tokenizer, max_len=128):
    """Tokenize ``texts`` in one call, padded/truncated to ``max_len``.

    Calls the tokenizer directly instead of the deprecated
    ``batch_encode_plus`` method.

    Returns:
        Whatever ``tokenizer(...)`` returns when asked for PyTorch tensors.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
107
 
108
def build_tensor_dataset(df, tokenizer, max_len=128, text_col="text"):
    """Build a TensorDataset of (input_ids, attention_mask, labels).

    Args:
        df: DataFrame holding ``text_col`` and every column in LABELS.
        tokenizer: Tokenizer handed to ``tokenize_dataset_batch``.
        max_len: Padding/truncation length.
        text_col: Name of the text column (defaults to ``"text"``).
    """
    texts = df[text_col].tolist()
    enc = tokenize_dataset_batch(texts, tokenizer, max_len=max_len)
    # Convert explicitly to float32 so object-dtype label columns work;
    # missing labels count as negatives instead of propagating NaN.
    label_values = df[LABELS].fillna(0).to_numpy(dtype="float32")
    labels = torch.tensor(label_values, dtype=torch.float)
    return TensorDataset(enc["input_ids"], enc["attention_mask"], labels)
 
 
 
 
114
 
115
- # ---------------------------
116
- # Pos-weight compute
117
- # ---------------------------
118
def compute_pos_weight(df, labels=None):
    """Compute per-label ``pos_weight`` values as negatives / positives.

    Args:
        df: DataFrame with one 0/1 column per label.
        labels: Column names to use; defaults to the module-level LABELS.

    Returns:
        A float tensor with one weight per label. A label with no
        positives or no negatives gets the neutral weight 1.0; without
        that guard an always-positive label would get weight 0.
    """
    columns = LABELS if labels is None else list(labels)
    total = len(df)
    weights = []
    for positives in df[columns].sum(axis=0):
        positives = float(positives)
        if 0 < positives < total:
            weights.append((total - positives) / positives)
        else:
            weights.append(1.0)
    return torch.tensor(weights, dtype=torch.float)
128
 
129
- # ---------------------------
130
- # Save / Load trained model files
131
- # ---------------------------
132
def save_trained(model, tokenizer, model_name: str):
    """Save model weights and tokenizer files for ``model_name``.

    Writes ``best_model.pt`` and the tokenizer files into the folder given
    by ``model_folder``, then records ``model_name`` as the last model.

    Returns:
        The target folder as a string.
    """
    target = model_folder(model_name)
    target.mkdir(parents=True, exist_ok=True)
    weights_path = target / "best_model.pt"
    torch.save(model.state_dict(), weights_path)
    tokenizer.save_pretrained(str(target))
    save_last_model_name(model_name)
    return str(target)
142
-
143
def load_trained(model_name: str = None, device=None):
    """Load a previously trained classifier and its tokenizer.

    Args:
        model_name: Backbone identifier used at training time; defaults to
            the name recorded by ``save_last_model_name``.
        device: Optional device to load the weights onto.

    Returns:
        ``(model, tokenizer, model_name)`` with the model in eval mode.

    Raises:
        ValueError: If no model name is given and none was recorded.
        FileNotFoundError: If the saved model folder does not exist.
    """
    if model_name is None:
        model_name = load_last_model_name()
    if model_name is None:
        raise ValueError("No trained model found. Train a model first.")
    folder = model_folder(model_name)
    if not folder.exists():
        raise FileNotFoundError(f"Saved model folder not found: {folder}")
    tokenizer = AutoTokenizer.from_pretrained(str(folder))
    # A tokenizer loaded from the folder reports the folder as its
    # name_or_path, and save_trained writes only tokenizer files and
    # best_model.pt there, so rebuild the architecture from model_name.
    model = EmotionClassifier(model_name)
    state = torch.load(folder / "best_model.pt", map_location=device or "cpu")
    model.load_state_dict(state)
    if device:
        model.to(device)
    model.eval()
    return model, tokenizer, model_name
162
-
163
- # ---------------------------
164
- # Training loop (uses trainable params only)
165
- # ---------------------------
166
  def train_model(
167
  df,
168
  model_name="bert-base-multilingual-cased",
@@ -176,33 +194,34 @@ def train_model(
176
  freeze_layers=6,
177
  device=None
178
  ):
179
- device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
 
180
  tokenizer = AutoTokenizer.from_pretrained(model_name)
181
 
182
- # prepare dataset
183
  df = df.reset_index(drop=True)
184
- enc_dataset = build_tensor_dataset(df, tokenizer, max_len=max_len)
185
- # split indices
186
- n = len(enc_dataset)
187
- idx = list(range(n))
188
  train_idx, val_idx = train_test_split(idx, test_size=0.15, random_state=42)
189
- def subset(ds, indices):
190
- input_ids = torch.stack([ds[i][0] for i in indices])
191
- attn = torch.stack([ds[i][1] for i in indices])
192
- labels = torch.stack([ds[i][2] for i in indices])
193
- return TensorDataset(input_ids, attn, labels)
194
 
195
- train_ds = subset(enc_dataset, train_idx)
196
- val_ds = subset(enc_dataset, val_idx)
 
 
 
 
 
 
 
197
 
198
  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
199
  val_loader = DataLoader(val_ds, batch_size=batch_size)
200
 
201
- model = EmotionClassifier(model_name)
202
  model.to(device)
203
 
204
- # freeze layers if requested (works for BERT-like named params)
205
- for name, param in model.bert.named_parameters():
206
  if name.startswith("embeddings."):
207
  param.requires_grad = False
208
  elif name.startswith("encoder.layer"):
@@ -216,127 +235,154 @@ def train_model(
216
  pos_weight = compute_pos_weight(df).to(device)
217
  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
218
 
219
- optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
 
 
 
 
 
220
  total_steps = len(train_loader) * epochs
221
- warmup_steps = int(warmup_ratio * total_steps) if total_steps>0 else 0
222
- scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
 
 
 
 
 
223
 
224
  best_val = float("inf")
225
- epochs_no_improve = 0
226
- history = {"train_loss":[], "val_loss":[]}
 
 
 
227
 
228
- for epoch in range(1, epochs+1):
229
  model.train()
230
- running_loss = 0.0
231
- for batch in train_loader:
232
- optimizer.zero_grad()
233
- input_ids = batch[0].to(device)
234
- attn = batch[1].to(device)
235
- labels = batch[2].to(device)
236
 
237
- logits = model(input_ids=input_ids, attention_mask=attn)
 
238
  loss = loss_fn(logits, labels)
239
  loss.backward()
240
  optimizer.step()
241
- if scheduler is not None:
242
- scheduler.step()
243
 
244
- running_loss += loss.item() * input_ids.size(0)
245
 
246
- avg_train = running_loss / len(train_loader.dataset)
247
- history["train_loss"].append(avg_train)
248
 
249
- # validation
250
  model.eval()
251
- vloss = 0.0
252
  with torch.no_grad():
253
- for batch in val_loader:
254
- input_ids = batch[0].to(device)
255
- attn = batch[1].to(device)
256
- labels = batch[2].to(device)
257
- logits = model(input_ids=input_ids, attention_mask=attn)
258
  loss = loss_fn(logits, labels)
259
- vloss += loss.item() * input_ids.size(0)
260
- avg_val = vloss / len(val_loader.dataset)
261
- history["val_loss"].append(avg_val)
262
-
263
- print(f"Epoch {epoch}/{epochs} | Train loss {avg_train:.4f} | Val loss {avg_val:.4f}")
264
-
265
- if avg_val < best_val:
266
- best_val = avg_val
267
- epochs_no_improve = 0
268
- # save model+tokenizer to folder
269
- save_trained(model, tokenizer, model_name)
270
- print(f"Saved best model for {model_name}")
271
  else:
272
- epochs_no_improve += 1
273
- if epochs_no_improve >= patience:
274
- print("Early stopping triggered")
275
  break
276
 
277
  return model, tokenizer, history
278
 
279
- # ---------------------------
280
- # Inference helpers (batch optimized)
281
- # ---------------------------
282
def predict_batch_from_texts(texts, model, tokenizer, max_len=128, batch_size=32, device=None):
    """Return one ``{label: probability}`` dict per input text.

    Args:
        texts: Sequence of strings; it is sliced into batches.
        model: Classifier called with ``input_ids`` and ``attention_mask``.
        tokenizer: Tokenizer used for each batch.
        max_len: Padding/truncation length.
        batch_size: Number of texts per forward pass (must be >= 1).
        device: Target device; defaults to CUDA when available, else CPU.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    model.to(device)
    model.eval()
    results = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # Call the tokenizer directly; batch_encode_plus is deprecated.
        enc = tokenizer(
            chunk,
            padding="max_length",
            truncation=True,
            max_length=int(max_len),
            return_tensors="pt",
        )
        input_ids = enc["input_ids"].to(device)
        attn = enc["attention_mask"].to(device)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attn)
        for row in torch.sigmoid(logits).cpu().tolist():
            results.append(dict(zip(LABELS, row)))
    return results
299
-
300
def predict_single_using_saved(text, max_len=128, batch_size=32):
    """Predict emotion probabilities for ``text`` with the last saved model.

    Raises:
        ValueError: If no trained model has been recorded.
    """
    model_name = load_last_model_name()
    if model_name is None:
        raise ValueError("No trained model found. Train first.")
    model, tokenizer, _ = load_trained(model_name)
    predictions = predict_batch_from_texts(
        [text], model, tokenizer, max_len=max_len, batch_size=batch_size
    )
    return predictions[0]
307
-
308
- # ---------------------------
309
- # Summary utility
310
- # ---------------------------
311
def summary_top3_from_preds(preds, labels=None):
    """Average per-text predictions and report the three strongest labels.

    Args:
        preds: List of ``{label: probability}`` dicts.
        labels: Label names to aggregate; defaults to module-level LABELS.

    Returns:
        Dict with ``n`` (number of predictions), ``avg_distribution`` and
        ``top3``. Empty input gives ``n == 0`` and all-zero averages; ``n``
        was previously reported as 1 in that case.
    """
    names = LABELS if labels is None else list(labels)
    n = len(preds)
    totals = {name: 0.0 for name in names}
    for pred in preds:
        for name, score in pred.items():
            totals[name] += float(score)
    divisor = max(1, n)  # avoid dividing by zero without misreporting n
    avg = {name: total / divisor for name, total in totals.items()}
    ranked = sorted(avg.items(), key=lambda item: item[1], reverse=True)[:3]
    top3 = [{"label": name, "score": float(score)} for name, score in ranked]
    return {"n": n, "avg_distribution": avg, "top3": top3}
323
-
324
- # ---------------------------
325
- # Wrappers for GUI
326
- # ---------------------------
327
- def wrapper_training(
328
- file_obj, sep=",",
329
- model_name="bert-base-multilingual-cased",
330
- epochs=3, batch_size=8, lr=2e-5, max_len=128,
331
- weight_decay=0.01, warmup_ratio=0.1, patience=2, freeze_layers=6
332
- ):
333
- # file_obj can be gr.File or path string
334
- csv_path = read_uploaded_file(file_obj)
335
- df = pd.read_csv(csv_path, sep=sep)
336
  df = clean_labels(df)
337
  df = clean_text(df)
338
 
339
- model, tokenizer, history = train_model(
340
  df=df,
341
  model_name=model_name,
342
  epochs=int(epochs),
@@ -348,86 +394,80 @@ def wrapper_training(
348
  patience=int(patience),
349
  freeze_layers=int(freeze_layers)
350
  )
351
- # return a short report and history summary
352
  return {
353
- "message": f"Training finished. Best model saved under saved_models/{model_name}",
354
- "history": {"train_loss": history["train_loss"], "val_loss": history["val_loss"]},
355
  "model_name": model_name
356
  }
357
 
358
def wrapper_predict_single(text, max_len=128):
    """GUI callback: predict emotion probabilities for a single text."""
    return predict_single_using_saved(text, max_len=max_len)
361
 
362
def wrapper_predict_dataset(file_obj, sep=",", max_len=128, batch_size=32):
    """GUI callback: run inference on an uploaded CSV and summarize it.

    Returns:
        ``{"n", "top3", "avg_distribution"}``, or ``{"error": ...}`` when
        no trained model has been recorded.
    """
    csv_path = read_uploaded_file(file_obj)
    df = pd.read_csv(csv_path, sep=sep)
    df = clean_labels(df)
    df = clean_text(df)
    texts = df["text"].tolist()
    last = load_last_model_name()
    if last is None:
        return {"error":"No trained model found. Train first."}
    model, tokenizer, _ = load_trained(last)
    # Both values come from the UI and are used as integer sizes, so cast
    # max_len as well as batch_size.
    preds = predict_batch_from_texts(
        texts, model, tokenizer, max_len=int(max_len), batch_size=int(batch_size)
    )
    summary = summary_top3_from_preds(preds)
    return {"n": summary["n"], "top3": summary["top3"], "avg_distribution": summary["avg_distribution"]}
375
-
376
- # ---------------------------
377
- # Plot helper (optional in notebook)
378
- # ---------------------------
379
def plot_emotion_pie_from_avg(avg_dict):
    """Show a pie chart of the averaged emotion distribution.

    ``avg_dict`` maps label names to values; keys become the slice labels.
    """
    names = list(avg_dict)
    shares = list(avg_dict.values())
    plt.figure(figsize=(6,6))
    plt.pie(shares, labels=names, autopct="%1.1f%%")
    plt.title("Emotion Distribution (average)")
    plt.show()
386
-
387
- # ---------------------------
388
- # Gradio GUI
389
- # ---------------------------
390
# Gradio UI: one tab each for training, single-text and dataset inference.
with gr.Blocks() as app:
    gr.Markdown("## Emotion Classifier — Dava (Revised)")

    with gr.Tab("Training"):
        file_in = gr.File(label="Upload training CSV")
        sep_in = gr.Textbox(label="Delimiter", value=",")
        # Choices are passed to from_pretrained; the IndoBERT entry carries
        # its Hub namespace so it can be resolved.
        model_name_in = gr.Dropdown(label="Model backbone", choices=[
            "bert-base-multilingual-cased", "indolem/indobert-base-uncased", "bert-base-uncased"
        ], value="bert-base-multilingual-cased")
        # precision=0 makes the integer-valued fields arrive as int.
        epochs_in = gr.Number(label="Epochs", value=3, precision=0)
        batch_in = gr.Number(label="Batch size", value=8, precision=0)
        lr_in = gr.Number(label="Learning rate", value=2e-5)
        maxlen_in = gr.Number(label="Max length", value=128, precision=0)
        weightdecay_in = gr.Number(label="Weight decay", value=0.01)
        warmup_in = gr.Number(label="Warmup ratio", value=0.1)
        patience_in = gr.Number(label="Early stop patience", value=2, precision=0)
        freeze_in = gr.Number(label="Freeze layers (first n)", value=6, precision=0)
        train_btn = gr.Button("Start Training")
        train_out = gr.JSON(label="Training result (history + message)")

        train_btn.click(
            fn=wrapper_training,
            inputs=[file_in, sep_in, model_name_in, epochs_in, batch_in, lr_in, maxlen_in, weightdecay_in, warmup_in, patience_in, freeze_in],
            outputs=train_out
        )

    with gr.Tab("Single Inference"):
        text_in = gr.Textbox(label="Text to analyze")
        single_btn = gr.Button("Predict")
        single_out = gr.JSON(label="Emotion probabilities")

        single_btn.click(fn=wrapper_predict_single, inputs=[text_in], outputs=single_out)

    with gr.Tab("Dataset Inference"):
        file_test = gr.File(label="Upload CSV for inference")
        sep_test = gr.Textbox(label="Delimiter", value=",")
        maxlen_test = gr.Number(label="Max length", value=128, precision=0)
        batchsize_test = gr.Number(label="Batch size (inference)", value=32, precision=0)
        test_btn = gr.Button("Run Inference")
        test_out = gr.JSON(label="Summary result")

        test_btn.click(fn=wrapper_predict_dataset, inputs=[file_test, sep_test, maxlen_test, batchsize_test], outputs=test_out)

app.launch()
 
1
+ # ==============================================================
2
+ # EMOTION CLASSIFIER
3
+ # ==============================================================
4
+
5
  import os
 
6
  import math
7
  import torch
8
  import pandas as pd
9
  import numpy as np
 
10
  import gradio as gr
11
+ import matplotlib.pyplot as plt
12
+
13
  from pathlib import Path
14
  from torch import nn
15
  from torch.utils.data import Dataset, DataLoader, TensorDataset
16
  from sklearn.model_selection import train_test_split
17
+ from transformers import (
18
+ AutoTokenizer,
19
+ AutoModel,
20
+ AutoConfig,
21
+ get_linear_schedule_with_warmup
22
+ )
23
+
24
+ # =========================================================
25
+ # CONFIG
26
+ # =========================================================
27
# Emotion classes predicted by the model, in output-logit order.
LABELS = [
    'anger', 'anticipation', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
]
# Lookup tables between label names and their positions in LABELS.
LABEL2ID = {label: index for index, label in enumerate(LABELS)}
ID2LABEL = dict(enumerate(LABELS))

# Directory that receives trained model artifacts; created on import.
SAVED_ROOT = Path("saved_models")
SAVED_ROOT.mkdir(exist_ok=True)
33
 
34
+ # ==============================================================
35
+ # Simpan dan Muat Data
36
+ # ==============================================================
37
def read_uploaded_file(uploaded):
    """Resolve an upload (path, file wrapper or stream) to a path string.

    Args:
        uploaded: A path string, an ``os.PathLike``, an object exposing a
            ``name`` attribute, or a readable stream.

    Returns:
        A filesystem path as ``str``. Streams are first copied to a new
        temporary ``.csv`` file, which is not deleted automatically.

    Raises:
        ValueError: If ``uploaded`` is ``None`` or of an unsupported type.
    """
    import tempfile  # local import: only the stream fallback needs it

    if uploaded is None:
        raise ValueError("No file provided")

    if isinstance(uploaded, str):
        return uploaded

    # pathlib.Path also exposes ``.name``, but that is only the basename,
    # so path-like objects must be handled before the generic check below.
    if isinstance(uploaded, os.PathLike):
        return os.fspath(uploaded)

    if hasattr(uploaded, "name"):
        return uploaded.name

    if hasattr(uploaded, "read"):
        payload = uploaded.read()
        if isinstance(payload, str):
            # Text streams yield str; the temp file is opened in binary mode.
            payload = payload.encode("utf-8")
        with tempfile.NamedTemporaryFile(
            mode="wb", prefix="uploaded_", suffix=".csv", delete=False
        ) as handle:
            handle.write(payload)
        return handle.name

    raise ValueError("Unsupported uploaded file type")
54
 
 
 
55
 
56
def save_last_model_name(name):
    """Record ``name`` in ``last_model.txt`` under SAVED_ROOT."""
    marker = SAVED_ROOT / "last_model.txt"
    marker.write_text(name)
58
+
59
+
60
def load_last_model_name():
    """Return the text stored in ``last_model.txt``, or ``None`` if absent.

    The stored text is returned with surrounding whitespace stripped.
    """
    marker = SAVED_ROOT / "last_model.txt"
    return marker.read_text().strip() if marker.exists() else None
65
 
66
+
67
def model_folder(model_name):
    """Return the directory under SAVED_ROOT used for ``model_name``."""
    # "/" in the name is replaced so the result is a single path component.
    safe_name = model_name.replace("/", "_")
    return SAVED_ROOT / safe_name
69
 
 
 
 
 
 
 
 
70
 
71
+ # ==============================================================
72
+ # Pembersihan Data
73
+ # ==============================================================
74
def clean_labels(df):
    """Ensure every column named in LABELS exists in ``df``.

    Missing label columns are added in place and filled with 0; existing
    columns are left untouched. The same DataFrame object is returned.
    """
    absent = [label for label in LABELS if label not in df.columns]
    for label in absent:
        df[label] = 0
    return df
79
 
80
+
81
def clean_text(df, col="text"):
    """Normalize the text column of ``df`` in place and return ``df``.

    Newlines become spaces and surrounding whitespace is stripped. Missing
    values become empty strings.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        raise KeyError(f"CSV must contain a column '{col}'")
    raw = df[col]
    # Blank out missing values first: astype(str) alone would turn NaN
    # into the literal text "nan".
    cleaned = raw.astype(object).where(raw.notna(), "").astype(str)
    df[col] = cleaned.str.replace("\n", " ", regex=False).str.strip()
    return df
86
 
87
+
88
+ # =========================================================
89
+ # Model AI
90
+ # =========================================================
91
class EmotionModel(nn.Module):
    """Pretrained backbone followed by dropout and a linear classifier.

    Args:
        base_model_name: Identifier or directory passed to
            ``AutoModel.from_pretrained``.
        num_labels: Number of output logits.
        dropout: Dropout probability applied before the classifier.
    """

    def __init__(self, base_model_name, num_labels=8, dropout=0.3):
        super().__init__()
        self.base = AutoModel.from_pretrained(base_model_name)
        # Reuse the config the backbone was built from instead of loading
        # it a second time.
        self.config = self.base.config
        self.drop = nn.Dropout(dropout)
        self.clf = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, ids, mask=None):
        """Return raw logits for token ids ``ids`` and attention ``mask``."""
        out = self.base(input_ids=ids, attention_mask=mask)

        # Prefer the pooled output when the backbone provides one;
        # otherwise fall back to the first token's hidden state.
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0, :]

        return self.clf(self.drop(pooled))
114
+
115
+
116
+ # ==============================================================
117
+ # Tokenisasi Dataset
118
+ # ==============================================================
119
def tokenize_batch(texts, tokenizer, max_len=128):
    """Tokenize ``texts`` in one call, padded/truncated to ``max_len``.

    Returns whatever ``tokenizer(...)`` returns when asked for PyTorch
    tensors.
    """
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **options)
127
+
128
 
129
def build_tensor_dataset(df, tokenizer, max_len=128, text_col="text"):
    """Build a TensorDataset of (input_ids, attention_mask, labels).

    Args:
        df: DataFrame holding ``text_col`` and every column in LABELS.
        tokenizer: Tokenizer handed to ``tokenize_batch``.
        max_len: Padding/truncation length.
        text_col: Name of the text column (defaults to ``"text"``).
    """
    enc = tokenize_batch(df[text_col].tolist(), tokenizer, max_len)
    # Convert explicitly to float32 so object-dtype label columns work;
    # missing labels count as negatives instead of propagating NaN.
    label_values = df[LABELS].fillna(0).to_numpy(dtype="float32")
    labels = torch.tensor(label_values, dtype=torch.float)
    return TensorDataset(
        enc["input_ids"],
        enc["attention_mask"],
        labels,
    )
137
+
138
 
139
+ # ==============================================================
140
+ # Bobot
141
+ # ==============================================================
142
def compute_pos_weight(df, labels=None):
    """Compute per-label ``pos_weight`` values as negatives / positives.

    Args:
        df: DataFrame with one 0/1 column per label.
        labels: Column names to use; defaults to the module-level LABELS.

    Returns:
        A float tensor with one weight per label. A label with no
        positives or no negatives gets the neutral weight 1.0; without
        that guard an always-positive label would get weight 0.
    """
    columns = LABELS if labels is None else list(labels)
    total = len(df)
    weights = []
    for positives in df[columns].sum(axis=0):
        positives = float(positives)
        if 0 < positives < total:
            weights.append((total - positives) / positives)
        else:
            weights.append(1.0)
    return torch.tensor(weights, dtype=torch.float)
149
 
150
+
151
+ # ==============================================================
152
+ # Simpan dan Muat Model
153
+ # ==============================================================
154
def save_model(model, tokenizer, folder):
    """Save backbone, tokenizer and classifier head into ``folder``.

    The backbone and tokenizer are written with ``save_pretrained``, the
    head's state dict goes to ``classifier.pt``, and ``folder`` is recorded
    as the last-used model.
    """
    target = Path(folder)
    target.mkdir(parents=True, exist_ok=True)

    # Backbone and tokenizer are written with their save_pretrained methods.
    model.base.save_pretrained(folder)
    tokenizer.save_pretrained(folder)

    # The classifier head is stored separately as a plain state dict.
    head_path = target / "classifier.pt"
    torch.save(model.clf.state_dict(), str(head_path))

    save_last_model_name(str(folder))
166
+
167
+
168
def load_model(folder):
    """Load a saved model, tokenizer and config from ``folder``.

    Args:
        folder: Directory written by ``save_model``.

    Returns:
        ``(model, tokenizer, config)`` with the model in eval mode, loaded
        on CPU.

    Raises:
        ValueError: If ``folder`` is ``None`` (no model trained yet).
        FileNotFoundError: If ``classifier.pt`` is missing from ``folder``.
    """
    if folder is None:
        # str(None) would otherwise be passed on as the model name "None".
        raise ValueError("No trained model found. Train a model first.")

    folder = str(folder)
    head_path = Path(folder) / "classifier.pt"
    if not head_path.is_file():
        raise FileNotFoundError(f"Classifier head not found: {head_path}")

    tokenizer = AutoTokenizer.from_pretrained(folder)

    model = EmotionModel(folder)
    state = torch.load(str(head_path), map_location="cpu")
    model.clf.load_state_dict(state)
    model.eval()

    # EmotionModel already holds the backbone config; no separate load.
    return model, tokenizer, model.config
179
+
180
+
181
+ # ==============================================================
182
+ # Pelatihan
183
+ # ==============================================================
 
 
 
184
  def train_model(
185
  df,
186
  model_name="bert-base-multilingual-cased",
 
194
  freeze_layers=6,
195
  device=None
196
  ):
197
+ device = device or ("cuda" if torch.cuda.is_available() else "cpu")
198
+
199
  tokenizer = AutoTokenizer.from_pretrained(model_name)
200
 
 
201
  df = df.reset_index(drop=True)
202
+ dataset = build_tensor_dataset(df, tokenizer, max_len)
203
+
204
+ idx = list(range(len(dataset)))
 
205
  train_idx, val_idx = train_test_split(idx, test_size=0.15, random_state=42)
 
 
 
 
 
206
 
207
+ def subset(ds, idxs):
208
+ return TensorDataset(
209
+ torch.stack([ds[i][0] for i in idxs]),
210
+ torch.stack([ds[i][1] for i in idxs]),
211
+ torch.stack([ds[i][2] for i in idxs]),
212
+ )
213
+
214
+ train_ds = subset(dataset, train_idx)
215
+ val_ds = subset(dataset, val_idx)
216
 
217
  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
218
  val_loader = DataLoader(val_ds, batch_size=batch_size)
219
 
220
+ model = EmotionModel(model_name)
221
  model.to(device)
222
 
223
+ # Freeze lower layers
224
+ for name, param in model.base.named_parameters():
225
  if name.startswith("embeddings."):
226
  param.requires_grad = False
227
  elif name.startswith("encoder.layer"):
 
235
  pos_weight = compute_pos_weight(df).to(device)
236
  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
237
 
238
+ optimizer = torch.optim.AdamW(
239
+ filter(lambda p: p.requires_grad, model.parameters()),
240
+ lr=lr,
241
+ weight_decay=weight_decay
242
+ )
243
+
244
  total_steps = len(train_loader) * epochs
245
+ warmup_steps = int(warmup_ratio * total_steps)
246
+
247
+ scheduler = get_linear_schedule_with_warmup(
248
+ optimizer,
249
+ num_warmup_steps=warmup_steps,
250
+ num_training_steps=total_steps
251
+ )
252
 
253
  best_val = float("inf")
254
+ no_improve = 0
255
+
256
+ history = {"train_loss": [], "val_loss": []}
257
+
258
+ save_path = str(model_folder(model_name))
259
 
260
+ for ep in range(1, epochs+1):
261
  model.train()
262
+ t_loss = 0
263
+
264
+ for input_ids, attn, labels in train_loader:
265
+ input_ids = input_ids.to(device)
266
+ attn = attn.to(device)
267
+ labels = labels.to(device)
268
 
269
+ optimizer.zero_grad()
270
+ logits = model(input_ids, attn)
271
  loss = loss_fn(logits, labels)
272
  loss.backward()
273
  optimizer.step()
274
+ scheduler.step()
 
275
 
276
+ t_loss += loss.item() * input_ids.size(0)
277
 
278
+ train_loss = t_loss / len(train_loader.dataset)
279
+ history["train_loss"].append(train_loss)
280
 
281
+ # Validation
282
  model.eval()
283
+ v_loss = 0
284
  with torch.no_grad():
285
+ for input_ids, attn, labels in val_loader:
286
+ input_ids = input_ids.to(device)
287
+ attn = attn.to(device)
288
+ labels = labels.to(device)
289
+ logits = model(input_ids, attn)
290
  loss = loss_fn(logits, labels)
291
+ v_loss += loss.item() * input_ids.size(0)
292
+
293
+ val_loss = v_loss / len(val_loader.dataset)
294
+ history["val_loss"].append(val_loss)
295
+
296
+ print(f"Epoch {ep} | Train={train_loss:.4f} | Val={val_loss:.4f}")
297
+
298
+ if val_loss < best_val:
299
+ best_val = val_loss
300
+ no_improve = 0
301
+ save_model(model, tokenizer, save_path)
302
+ print(f"Saved best model to {save_path}")
303
  else:
304
+ no_improve += 1
305
+ if no_improve >= patience:
306
+ print("Early stopping.")
307
  break
308
 
309
  return model, tokenizer, history
310
 
311
+
312
+ # ==============================================================
313
+ # Uji
314
+ # ==============================================================
315
def predict_single(text, folder=None, max_len=128):
    """Predict emotion probabilities for a single text.

    Args:
        text: Input string.
        folder: Saved-model directory; defaults to the last recorded one.
        max_len: Padding/truncation length (default 128).

    Returns:
        Dict mapping each label in LABELS to its sigmoid probability.

    Raises:
        ValueError: If no folder is given and none has been recorded.
    """
    folder = folder or load_last_model_name()
    if folder is None:
        raise ValueError("No trained model found. Train a model first.")
    model, tokenizer, _ = load_model(folder)

    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=int(max_len),
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(encoded["input_ids"], encoded["attention_mask"])
    probs = torch.sigmoid(logits)[0].tolist()

    return dict(zip(LABELS, probs))
332
+
333
+
334
def predict_batch(texts, folder=None, batch_size=32, max_len=128):
    """Predict emotion probabilities for many texts.

    Args:
        texts: Iterable of strings.
        folder: Saved-model directory; defaults to the last recorded one.
        batch_size: Number of texts per forward pass (must be >= 1).
        max_len: Padding/truncation length (default 128).

    Returns:
        List with one ``{label: probability}`` dict per input text.

    Raises:
        ValueError: If no model is available or ``batch_size`` < 1.
    """
    folder = folder or load_last_model_name()
    if folder is None:
        raise ValueError("No trained model found. Train a model first.")
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model, tokenizer, _ = load_model(folder)

    texts = list(texts)  # slicing below needs a real sequence
    preds = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        enc = tokenizer(
            chunk,
            padding="max_length",
            truncation=True,
            max_length=int(max_len),
            return_tensors="pt",
        )

        with torch.no_grad():
            logits = model(enc["input_ids"], enc["attention_mask"])

        for row in torch.sigmoid(logits).tolist():
            preds.append(dict(zip(LABELS, row)))

    return preds
357
+
358
+
359
def summarize_preds(preds, labels=None):
    """Average per-text predictions and report the three strongest labels.

    Args:
        preds: List of ``{label: probability}`` dicts.
        labels: Label names to aggregate; defaults to module-level LABELS.

    Returns:
        Dict with ``n`` (number of predictions), ``avg_distribution`` and
        ``top3``. Empty input gives ``n == 0`` and all-zero averages rather
        than a ZeroDivisionError.
    """
    names = LABELS if labels is None else list(labels)
    n = len(preds)

    totals = {name: 0.0 for name in names}
    for pred in preds:
        for name, score in pred.items():
            totals[name] += score

    divisor = max(1, n)  # guard against division by zero on empty input
    avg = {name: total / divisor for name, total in totals.items()}

    ranked = sorted(avg.items(), key=lambda item: item[1], reverse=True)[:3]
    top3 = [{"label": name, "score": float(score)} for name, score in ranked]

    return {"n": n, "avg_distribution": avg, "top3": top3}
373
+
374
+
375
+ # ==============================================================
376
+ # GRADIO GUI
377
+ # ==============================================================
378
+ def wrapper_train(file_obj, sep, model_name, epochs, batch_size, lr,
379
+ max_len, weight_decay, warmup_ratio, patience, freeze_layers):
380
+ csv = read_uploaded_file(file_obj)
381
+ df = pd.read_csv(csv, sep=sep)
 
 
382
  df = clean_labels(df)
383
  df = clean_text(df)
384
 
385
+ _, _, history = train_model(
386
  df=df,
387
  model_name=model_name,
388
  epochs=int(epochs),
 
394
  patience=int(patience),
395
  freeze_layers=int(freeze_layers)
396
  )
397
+
398
  return {
399
+ "message": "Training finished.",
400
+ "history": history,
401
  "model_name": model_name
402
  }
403
 
 
 
 
404
 
405
def wrapper_single(text):
    """GUI callback: predict emotion scores for a single text."""
    scores = predict_single(text)
    return scores
407
+
408
+
409
def wrapper_dataset(file_obj, sep, max_len, batch_size):
    """GUI callback: run batch prediction on an uploaded CSV and summarize.

    Args:
        file_obj: Uploaded file, resolved through ``read_uploaded_file``.
        sep: CSV delimiter.
        max_len: Received from the UI; currently not forwarded to
            ``predict_batch``.
        batch_size: Texts per forward pass; values below 1 are raised to 1.

    Raises:
        ValueError: If the CSV has no rows.
    """
    csv_path = read_uploaded_file(file_obj)
    df = pd.read_csv(csv_path, sep=sep)
    df = clean_labels(df)
    df = clean_text(df)

    texts = df["text"].tolist()
    if not texts:
        # Averaging zero predictions would otherwise divide by zero.
        raise ValueError("CSV contains no rows to predict on")

    preds = predict_batch(texts, batch_size=max(1, int(batch_size)))
    return summarize_preds(preds)
417
+
418
+
419
+ # ==============================================================
420
+ # Menjalankan GRADIO
421
+ # ==============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
# Gradio UI: one tab each for training, single-text and dataset prediction.
with gr.Blocks() as app:
    gr.Markdown("## Emotion Classifier — Dava (Final Version)")

    with gr.Tab("Training"):
        file_in = gr.File(label="Upload Training CSV")
        sep_in = gr.Textbox(label="Delimiter", value=",")
        # Choices are passed to from_pretrained; the IndoBERT entry carries
        # its Hub namespace so it can be resolved.
        model_name_in = gr.Dropdown(
            label="Backbone Model",
            choices=["bert-base-multilingual-cased", "indobenchmark/indobert-base-p1"],
            value="bert-base-multilingual-cased",
        )
        # precision=0 makes the integer-valued fields arrive as int.
        epochs_in = gr.Number(label="Epochs", value=3, precision=0)
        bs_in = gr.Number(label="Batch Size", value=8, precision=0)
        lr_in = gr.Number(label="Learning Rate", value=2e-5)
        maxlen_in = gr.Number(label="Max Length", value=128, precision=0)
        wd_in = gr.Number(label="Weight Decay", value=0.01)
        warmup_in = gr.Number(label="Warmup Ratio", value=0.1)
        patience_in = gr.Number(label="Patience", value=2, precision=0)
        freeze_in = gr.Number(label="Freeze Layers", value=6, precision=0)

        btn_train = gr.Button("Start Training")
        out_train = gr.JSON(label="Train Result")

        btn_train.click(
            wrapper_train,
            inputs=[file_in, sep_in, model_name_in, epochs_in, bs_in,
                    lr_in, maxlen_in, wd_in, warmup_in, patience_in, freeze_in],
            outputs=out_train,
        )

    with gr.Tab("Single Prediction"):
        text_in = gr.Textbox(label="Text")
        btn_single = gr.Button("Predict")
        out_single = gr.JSON(label="Emotion Scores")
        btn_single.click(wrapper_single, inputs=[text_in], outputs=out_single)

    with gr.Tab("Dataset Prediction"):
        file_test = gr.File(label="Upload CSV")
        sep_test = gr.Textbox(label="Delimiter", value=",")
        maxlen_test = gr.Number(label="Max Length", value=128, precision=0)
        bs_test = gr.Number(label="Batch Size", value=32, precision=0)

        btn_test = gr.Button("Run Prediction")
        out_test = gr.JSON(label="Summary Result")

        btn_test.click(
            wrapper_dataset,
            inputs=[file_test, sep_test, maxlen_test, bs_test],
            outputs=out_test,
        )

app.launch()