Darendra committed on
Commit
062b593
·
verified ·
1 Parent(s): 636be4d

Affective_Computing

Browse files
Files changed (1) hide show
  1. app.py +433 -0
app.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import math
4
+ import torch
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ import gradio as gr
9
+ from pathlib import Path
10
+ from torch import nn
11
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
12
+ from sklearn.model_selection import train_test_split
13
+ from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
14
+
15
+ # ---------------------------
16
+ # Konfigurasi & Label
17
+ # ---------------------------
18
# Emotion class names; their order fixes the column order of label vectors.
LABELS = [
    'anger', 'anticipation', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
]
# Index -> label and label -> index lookups, both derived from LABELS.
ID2LABEL = dict(enumerate(LABELS))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}
# Root directory for saved checkpoints; created at import time if missing.
SAVED_ROOT = Path("saved_models")
SAVED_ROOT.mkdir(exist_ok=True)
23
+
24
+ # ---------------------------
25
+ # Utility I/O small helpers
26
+ # ---------------------------
27
def read_uploaded_file(uploaded):
    """Resolve an uploaded file to a filesystem path.

    Accepted inputs, checked in this order:
      * a path string, returned unchanged;
      * an ``os.PathLike`` object (e.g. ``pathlib.Path``), converted with
        ``os.fspath``;
      * an object with a ``name`` attribute, whose ``name`` is returned
        as the path;
      * a readable file-like object, whose contents are written to a fresh
        temporary ``.csv`` file.

    Returns:
        The path to read from.

    Raises:
        ValueError: if ``uploaded`` is None or none of the cases above apply.
    """
    import tempfile  # local import: only the file-like fallback needs it

    if uploaded is None:
        raise ValueError("No file provided")
    if isinstance(uploaded, str):
        return uploaded
    # Must come before the ``name`` check: Path.name is only the final
    # component, so a Path would otherwise lose its directory.
    if isinstance(uploaded, os.PathLike):
        return os.fspath(uploaded)
    if hasattr(uploaded, "name"):
        return uploaded.name
    if hasattr(uploaded, "read"):
        payload = uploaded.read()
        if isinstance(payload, str):
            # Text-mode streams return str; the temp file is opened in binary.
            payload = payload.encode("utf-8")
        # NamedTemporaryFile picks a unique name in the platform temp
        # directory; delete=False keeps the file for the caller to read.
        with tempfile.NamedTemporaryFile(
            prefix="uploaded_", suffix=".csv", delete=False
        ) as tmp:
            tmp.write(payload)
        return tmp.name
    raise ValueError("Unsupported uploaded file type")
44
+
45
def save_last_model_name(model_name: str):
    """Write ``model_name`` to the ``last_model.txt`` marker under SAVED_ROOT."""
    marker = SAVED_ROOT / "last_model.txt"
    marker.write_text(model_name)
47
+
48
def load_last_model_name() -> str:
    """Return the stripped contents of ``last_model.txt`` under SAVED_ROOT.

    Returns None when the marker file does not exist.
    """
    marker = SAVED_ROOT / "last_model.txt"
    return marker.read_text().strip() if marker.exists() else None
53
+
54
def model_folder(model_name: str) -> Path:
    """Return the directory under SAVED_ROOT for ``model_name``.

    Every "/" in the name is replaced by "_" so the name maps to a single
    directory component.
    """
    flat_name = "_".join(model_name.split("/"))
    return SAVED_ROOT / flat_name
56
+
57
+ # ---------------------------
58
+ # Data loading & cleaning
59
+ # ---------------------------
60
def load_dataset(path_or_file, sep=","):
    """Read a delimited file into a DataFrame.

    ``path_or_file`` is first resolved to a path by ``read_uploaded_file``;
    ``sep`` is forwarded to ``pandas.read_csv``.
    """
    return pd.read_csv(read_uploaded_file(path_or_file), sep=sep)
64
+
65
def clean_labels(df):
    """Add a zero-filled column for every entry of LABELS missing from ``df``.

    The frame is modified in place and also returned.
    """
    absent = [label for label in LABELS if label not in df.columns]
    for label in absent:
        df[label] = 0
    return df
71
+
72
def clean_text(df, text_col="text"):
    """Normalise the text column of ``df`` in place and return the frame.

    Each value is converted to ``str``, newlines are replaced by single
    spaces and surrounding whitespace is stripped. Missing values become
    empty strings instead of the literal text "nan".

    Args:
        df: DataFrame holding the text column.
        text_col: name of the column to clean.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    if text_col not in df.columns:
        raise KeyError(f"CSV must contain column named '{text_col}' (found columns: {df.columns.tolist()})")
    raw = df[text_col]
    # Blank out missing cells before the str cast; otherwise astype(str)
    # would turn NaN/None into the words "nan"/"None".
    filled = raw.astype(object).where(raw.notna(), "")
    # regex=False: "\n" is a plain substring here, and the default for
    # ``regex`` differs between pandas versions.
    df[text_col] = filled.astype(str).str.replace("\n", " ", regex=False).str.strip()
    return df
77
+
78
+ # ---------------------------
79
+ # Model class (BERT + head)
80
+ # ---------------------------
81
class EmotionClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head."""

    def __init__(self, model_name="bert-base-multilingual-cased", num_labels=8):
        """Load the encoder ``model_name`` and size the head to its hidden width."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return raw logits computed from the first token's hidden state."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Position 0 along the sequence axis is used as the pooled vector.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.drop(first_token))
94
+
95
+ # ---------------------------
96
+ # Tokenisasi dan dataset (optimized batch)
97
+ # ---------------------------
98
def tokenize_dataset_batch(texts, tokenizer, max_len=128):
    """Encode ``texts`` in one call to ``tokenizer.batch_encode_plus``.

    Sequences are padded and truncated to ``max_len`` and the result is
    requested as PyTorch tensors. The tokenizer's return value is passed
    through unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return tokenizer.batch_encode_plus(texts, **encode_options)
107
+
108
def build_tensor_dataset(df, tokenizer, max_len=128):
    """Build a TensorDataset of (input_ids, attention_mask, labels) from ``df``.

    Texts come from the "text" column; labels are the LABELS columns cast
    to float.
    """
    encoded = tokenize_dataset_batch(df["text"].tolist(), tokenizer, max_len=max_len)
    targets = torch.tensor(df[LABELS].values, dtype=torch.float)
    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], targets)
114
+
115
+ # ---------------------------
116
+ # Pos-weight compute
117
+ # ---------------------------
118
def compute_pos_weight(df):
    """Return one positive-class weight per label as a float tensor.

    For a label with ``c`` positives among ``N`` rows the weight is
    ``(N - c) / c``; a label with no positives gets 1.0.
    """
    total_rows = len(df)
    positives = df[LABELS].sum(axis=0).astype(int).to_list()
    weights = [
        (total_rows - count) / count if count != 0 else 1.0
        for count in positives
    ]
    return torch.tensor(weights, dtype=torch.float)
128
+
129
+ # ---------------------------
130
+ # Save / Load trained model files
131
+ # ---------------------------
132
def save_trained(model, tokenizer, model_name: str):
    """Persist the model weights and tokenizer for ``model_name``.

    Writes ``best_model.pt`` (the model's state dict) and the tokenizer
    files into the model's folder, records ``model_name`` as the most
    recently saved model, and returns the folder path as a string.
    """
    target_dir = model_folder(model_name)
    target_dir.mkdir(parents=True, exist_ok=True)

    weights_file = target_dir / "best_model.pt"
    torch.save(model.state_dict(), weights_file)
    tokenizer.save_pretrained(str(target_dir))

    # Marker read back by load_last_model_name().
    save_last_model_name(model_name)
    return str(target_dir)
142
+
143
def load_trained(model_name: str = None, device=None):
    """Load a previously saved classifier and its tokenizer.

    Args:
        model_name: backbone identifier the model was trained with. When
            None, the name recorded by the last save is used.
        device: optional device; it is used as ``map_location`` for the
            weights and the model is moved to it. Defaults to CPU loading.

    Returns:
        A ``(model, tokenizer, model_name)`` tuple.

    Raises:
        ValueError: if no model name is given and none has been recorded.
        FileNotFoundError: if the folder for ``model_name`` does not exist.
    """
    if model_name is None:
        model_name = load_last_model_name()
    if model_name is None:
        raise ValueError("No trained model found. Train a model first.")
    folder = model_folder(model_name)
    if not folder.exists():
        raise FileNotFoundError(f"Saved model folder not found: {folder}")

    tokenizer = AutoTokenizer.from_pretrained(str(folder))
    # Build the architecture from the original backbone identifier, not from
    # tokenizer.name_or_path: after loading from ``folder`` that attribute
    # points at the folder itself, which contains only the tokenizer files
    # and best_model.pt -- no encoder config or weights for AutoModel.
    model = EmotionClassifier(model_name)
    state = torch.load(folder / "best_model.pt", map_location=device or "cpu")
    model.load_state_dict(state)
    if device:
        model.to(device)
    return model, tokenizer, model_name
162
+
163
+ # ---------------------------
164
+ # Training loop (uses trainable params only)
165
+ # ---------------------------
166
def train_model(
    df,
    model_name="bert-base-multilingual-cased",
    epochs=3,
    batch_size=8,
    lr=2e-5,
    max_len=128,
    weight_decay=0.01,
    warmup_ratio=0.1,
    patience=2,
    freeze_layers=6,
    device=None
):
    """Fine-tune an EmotionClassifier on ``df`` with early stopping.

    The rows are split 85/15 into train and validation sets with a fixed
    seed. Whenever the validation loss improves, the model and tokenizer
    are saved via ``save_trained``. Training stops after ``patience``
    consecutive epochs without improvement.

    Args:
        df: DataFrame with a "text" column and one column per LABELS entry.
        model_name: backbone identifier passed to the tokenizer and model.
        epochs: maximum number of epochs.
        batch_size: batch size for both loaders.
        lr: AdamW learning rate.
        max_len: tokenizer padding/truncation length.
        weight_decay: AdamW weight decay.
        warmup_ratio: fraction of total steps used for linear warmup.
        patience: epochs without validation improvement before stopping.
        freeze_layers: encoder layers with an index below this are frozen,
            along with the embeddings.
        device: training device; CUDA when available, else CPU.

    Returns:
        ``(model, tokenizer, history)``. ``model`` carries the weights of
        the best saved checkpoint; ``history`` holds per-epoch
        "train_loss" and "val_loss" lists.
    """
    device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Prepare the dataset once, then split by index.
    df = df.reset_index(drop=True)
    enc_dataset = build_tensor_dataset(df, tokenizer, max_len=max_len)
    all_idx = list(range(len(enc_dataset)))
    train_idx, val_idx = train_test_split(all_idx, test_size=0.15, random_state=42)

    def subset(ds, indices):
        # Index the underlying tensors in one shot instead of stacking
        # samples one by one.
        picked = torch.as_tensor(indices, dtype=torch.long)
        return TensorDataset(*(t[picked] for t in ds.tensors))

    train_ds = subset(enc_dataset, train_idx)
    val_ds = subset(enc_dataset, val_idx)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    model = EmotionClassifier(model_name)
    model.to(device)

    # Freeze the embeddings and the first ``freeze_layers`` encoder layers.
    # This relies on BERT-style parameter names ("encoder.layer.<idx>...").
    for name, param in model.bert.named_parameters():
        if name.startswith("embeddings."):
            param.requires_grad = False
        elif name.startswith("encoder.layer"):
            try:
                layer_num = int(name.split(".")[2])
            except (IndexError, ValueError):
                # Name does not carry a numeric layer index; leave trainable.
                continue
            if layer_num < freeze_layers:
                param.requires_grad = False

    pos_weight = compute_pos_weight(df).to(device)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    # Only parameters that are still trainable are handed to the optimizer.
    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr,
        weight_decay=weight_decay,
    )
    total_steps = len(train_loader) * epochs
    warmup_steps = int(warmup_ratio * total_steps) if total_steps > 0 else 0
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    best_val = float("inf")
    epochs_no_improve = 0
    best_saved = False
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch[0].to(device)
            attn = batch[1].to(device)
            labels = batch[2].to(device)

            logits = model(input_ids=input_ids, attention_mask=attn)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Weight by batch size so the epoch average is per sample.
            running_loss += loss.item() * input_ids.size(0)

        avg_train = running_loss / len(train_loader.dataset)
        history["train_loss"].append(avg_train)

        # Validation pass.
        model.eval()
        vloss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch[0].to(device)
                attn = batch[1].to(device)
                labels = batch[2].to(device)
                logits = model(input_ids=input_ids, attention_mask=attn)
                loss = loss_fn(logits, labels)
                vloss += loss.item() * input_ids.size(0)
        avg_val = vloss / len(val_loader.dataset)
        history["val_loss"].append(avg_val)

        print(f"Epoch {epoch}/{epochs} | Train loss {avg_train:.4f} | Val loss {avg_val:.4f}")

        if avg_val < best_val:
            best_val = avg_val
            epochs_no_improve = 0
            save_trained(model, tokenizer, model_name)
            best_saved = True
            print(f"Saved best model for {model_name}")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered")
                break

    # If later epochs did not improve, the in-memory weights are worse than
    # the checkpoint on disk; restore it so the returned model matches it.
    if best_saved and epochs_no_improve > 0:
        best_state = torch.load(model_folder(model_name) / "best_model.pt", map_location=device)
        model.load_state_dict(best_state)

    return model, tokenizer, history
278
+
279
+ # ---------------------------
280
+ # Inference helpers (batch optimized)
281
+ # ---------------------------
282
def predict_batch_from_texts(texts, model, tokenizer, max_len=128, batch_size=32, device=None):
    """Return per-label sigmoid probabilities for every text.

    Texts are tokenized and scored in chunks of ``batch_size``. The result
    is a list with one ``{label: probability}`` dict per input text, keyed
    by the entries of LABELS.
    """
    if not device:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()

    results = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        enc = tokenizer.batch_encode_plus(chunk, padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")
        ids = enc["input_ids"].to(device)
        mask = enc["attention_mask"].to(device)
        with torch.no_grad():
            scores = torch.sigmoid(model(input_ids=ids, attention_mask=mask))
            probs = scores.cpu().numpy()
        results.extend(
            {label: float(row[j]) for j, label in enumerate(LABELS)}
            for row in probs
        )
    return results
299
+
300
def predict_single_using_saved(text, max_len=128, batch_size=32):
    """Score one text with the most recently saved model.

    Returns the ``{label: probability}`` dict for ``text``.

    Raises:
        ValueError: if no saved model has been recorded.
    """
    recorded_name = load_last_model_name()
    if recorded_name is None:
        raise ValueError("No trained model found. Train first.")
    model, tokenizer, _ = load_trained(recorded_name)
    predictions = predict_batch_from_texts(
        [text], model, tokenizer, max_len=max_len, batch_size=batch_size
    )
    return predictions[0]
307
+
308
+ # ---------------------------
309
+ # Summary utility
310
+ # ---------------------------
311
def summary_top3_from_preds(preds):
    """Average a list of ``{label: prob}`` dicts and rank the labels.

    Returns a dict with "n" (number of predictions, at least 1 so the
    division is always defined), "avg_distribution" (mean probability per
    LABELS entry) and "top3" (the three highest means as
    ``{"label", "score"}`` dicts, highest first).
    """
    n = max(1, len(preds))
    totals = dict.fromkeys(LABELS, 0.0)
    for pred in preds:
        for label, prob in pred.items():
            totals[label] += float(prob)
    avg = {label: total / n for label, total in totals.items()}
    ranked = sorted(avg.items(), key=lambda item: item[1], reverse=True)
    top3 = [{"label": label, "score": float(score)} for label, score in ranked[:3]]
    return {"n": n, "avg_distribution": avg, "top3": top3}
323
+
324
+ # ---------------------------
325
+ # Wrappers for GUI
326
+ # ---------------------------
327
def wrapper_training(
    file_obj, sep=",",
    model_name="bert-base-multilingual-cased",
    epochs=3, batch_size=8, lr=2e-5, max_len=128,
    weight_decay=0.01, warmup_ratio=0.1, patience=2, freeze_layers=6
):
    """GUI entry point: load a CSV, train a model and return a JSON-able report.

    Args:
        file_obj: uploaded file object or path string of the training CSV.
        sep: column delimiter of the CSV.
        model_name: backbone identifier to fine-tune.
        epochs, batch_size, max_len, patience, freeze_layers: coerced to
            ``int`` before being passed to ``train_model``.
        lr, weight_decay, warmup_ratio: coerced to ``float``.

    Returns:
        A dict with "message", "history" (train/val loss lists) and
        "model_name".
    """
    df = load_dataset(file_obj, sep=sep)
    df = clean_labels(df)
    df = clean_text(df)

    # The numeric arguments arrive from UI widgets, so coerce them explicitly.
    _, _, history = train_model(
        df=df,
        model_name=model_name,
        epochs=int(epochs),
        batch_size=int(batch_size),
        lr=float(lr),
        max_len=int(max_len),
        weight_decay=float(weight_decay),
        warmup_ratio=float(warmup_ratio),
        patience=int(patience),
        freeze_layers=int(freeze_layers)
    )
    return {
        # Report the folder actually written: model_folder() replaces "/" in
        # the model name, so "saved_models/<model_name>" can be wrong.
        "message": f"Training finished. Best model saved under {model_folder(model_name)}",
        "history": {"train_loss": history["train_loss"], "val_loss": history["val_loss"]},
        "model_name": model_name
    }
357
+
358
def wrapper_predict_single(text, max_len=128):
    """GUI entry point: return label probabilities for one text.

    Delegates to ``predict_single_using_saved``.
    """
    return predict_single_using_saved(text, max_len=max_len)
361
+
362
def wrapper_predict_dataset(file_obj, sep=",", max_len=128, batch_size=32):
    """GUI entry point: score every row of a CSV and summarise the result.

    Args:
        file_obj: uploaded file object or path string of the CSV.
        sep: column delimiter of the CSV.
        max_len: tokenizer padding/truncation length (coerced to ``int``).
        batch_size: inference batch size (coerced to ``int``).

    Returns:
        A dict with "n", "top3" and "avg_distribution", or
        ``{"error": ...}`` when no trained model has been recorded.
    """
    csv_path = read_uploaded_file(file_obj)
    df = pd.read_csv(csv_path, sep=sep)
    df = clean_labels(df)
    df = clean_text(df)
    texts = df["text"].tolist()

    last = load_last_model_name()
    if last is None:
        return {"error":"No trained model found. Train first."}
    model, tokenizer, _ = load_trained(last)

    # UI number fields may deliver floats (e.g. 128.0). Coerce both values
    # to int, matching what wrapper_training does for its arguments.
    preds = predict_batch_from_texts(
        texts, model, tokenizer,
        max_len=int(max_len), batch_size=int(batch_size),
    )
    summary = summary_top3_from_preds(preds)
    return {"n": summary["n"], "top3": summary["top3"], "avg_distribution": summary["avg_distribution"]}
375
+
376
+ # ---------------------------
377
+ # Plot helper (optional in notebook)
378
+ # ---------------------------
379
def plot_emotion_pie_from_avg(avg_dict):
    """Show a pie chart with one slice per key of ``avg_dict``.

    Slice sizes come from the dict's values; each slice is annotated with
    its percentage.
    """
    names, shares = list(avg_dict.keys()), list(avg_dict.values())
    plt.figure(figsize=(6, 6))
    plt.pie(shares, labels=names, autopct="%1.1f%%")
    plt.title("Emotion Distribution (average)")
    plt.show()
386
+
387
+ # ---------------------------
388
+ # Gradio GUI
389
+ # ---------------------------
390
# Three-tab Gradio UI: training, single-text inference, dataset inference.
with gr.Blocks() as app:
    gr.Markdown("## Emotion Classifier — Dava (Revised)")

    with gr.Tab("Training"):
        file_in = gr.File(label="Upload training CSV")
        sep_in = gr.Textbox(label="Delimiter", value=",")
        # Hub identifiers; IndoBERT is published under the "indolem"
        # namespace, so the bare name "indobert-base-uncased" would not
        # resolve.
        model_name_in = gr.Dropdown(label="Model backbone", choices=[
            "bert-base-multilingual-cased", "indolem/indobert-base-uncased", "bert-base-uncased"
        ], value="bert-base-multilingual-cased")
        # precision=0 makes integer-valued fields deliver ints.
        epochs_in = gr.Number(label="Epochs", value=3, precision=0)
        batch_in = gr.Number(label="Batch size", value=8, precision=0)
        lr_in = gr.Number(label="Learning rate", value=2e-5)
        maxlen_in = gr.Number(label="Max length", value=128, precision=0)
        weightdecay_in = gr.Number(label="Weight decay", value=0.01)
        warmup_in = gr.Number(label="Warmup ratio", value=0.1)
        patience_in = gr.Number(label="Early stop patience", value=2, precision=0)
        freeze_in = gr.Number(label="Freeze layers (first n)", value=6, precision=0)
        train_btn = gr.Button("Start Training")
        train_out = gr.JSON(label="Training result (history + message)")

        # Input order must match the positional parameters of wrapper_training.
        train_btn.click(
            fn=wrapper_training,
            inputs=[file_in, sep_in, model_name_in, epochs_in, batch_in, lr_in, maxlen_in, weightdecay_in, warmup_in, patience_in, freeze_in],
            outputs=train_out
        )

    with gr.Tab("Single Inference"):
        text_in = gr.Textbox(label="Text to analyze")
        single_btn = gr.Button("Predict")
        single_out = gr.JSON(label="Emotion probabilities")

        single_btn.click(fn=wrapper_predict_single, inputs=[text_in], outputs=single_out)

    with gr.Tab("Dataset Inference"):
        file_test = gr.File(label="Upload CSV for inference")
        sep_test = gr.Textbox(label="Delimiter", value=",")
        maxlen_test = gr.Number(label="Max length", value=128, precision=0)
        batchsize_test = gr.Number(label="Batch size (inference)", value=32, precision=0)
        test_btn = gr.Button("Run Inference")
        test_out = gr.JSON(label="Summary result")

        test_btn.click(fn=wrapper_predict_dataset, inputs=[file_test, sep_test, maxlen_test, batchsize_test], outputs=test_out)

app.launch()