Darendra committed on
Commit
74dd21d
Β·
verified Β·
1 Parent(s): 2faddd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -444
app.py CHANGED
@@ -1,92 +1,29 @@
1
- # ==============================================================
2
- # KLASIFIKASI EMOSI
3
- # ==============================================================
4
  import os
5
- import math
6
  import torch
7
  import pandas as pd
8
  import numpy as np
9
  import gradio as gr
 
 
10
  from pathlib import Path
11
  from torch import nn
12
  from torch.utils.data import DataLoader, TensorDataset
13
- from sklearn.model_selection import train_test_split
14
- from transformers import (
15
- AutoTokenizer,
16
- AutoModel,
17
- AutoConfig,
18
- get_linear_schedule_with_warmup
19
- )
20
 
21
  # =========================================================
22
- # CONFIG
23
  # =========================================================
24
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
 
 
25
 
26
- FOLDER_MODEL = Path("saved_models")
27
- FOLDER_MODEL.mkdir(exist_ok=True)
28
 
29
- # ==============================================================
30
- # File & Utils
31
- # ==============================================================
32
- def read_file_upload(file_obj):
33
- """Handle file upload dari Gradio."""
34
- if file_obj is None:
35
- raise ValueError("File belum diupload.")
36
-
37
- if isinstance(file_obj, str):
38
- return file_obj
39
-
40
- if hasattr(file_obj, "name"):
41
- return file_obj.name
42
-
43
- if hasattr(file_obj, "read"):
44
- temp_path = Path("/tmp") / f"upload_{np.random.randint(1e9)}.csv"
45
- with open(temp_path, "wb") as f:
46
- f.write(file_obj.read())
47
- return str(temp_path)
48
-
49
- raise ValueError("Tipe file tidak didukung.")
50
-
51
- def save_last_model(name):
52
- (FOLDER_MODEL / "last_model_name.txt").write_text(name)
53
-
54
- def load_last_model():
55
- path_file = FOLDER_MODEL / "last_model_name.txt"
56
- if path_file.exists():
57
- return path_file.read_text().strip()
58
- return None
59
-
60
- def get_model_path(model_name):
61
- return FOLDER_MODEL / model_name.replace("/", "_")
62
-
63
- # ==============================================================
64
- # Data Cleaning
65
- # ==============================================================
66
- def clean_labels(df):
67
- """
68
- 1. Isi label kosong dengan 0.
69
- 2. Pastikan tipe data label adalah Numeric (Float), bukan Object/String.
70
- """
71
- for l in LIST_LABEL:
72
- if l not in df.columns:
73
- df[l] = 0
74
-
75
- # --- PERBAIKAN UTAMA DI SINI ---
76
- # Paksa konversi ke angka. Error (text/kosong) jadi NaN, lalu diisi 0.
77
- df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
78
-
79
- return df
80
-
81
- def clean_text(df, col="text"):
82
- """Hapus enter dan spasi berlebih."""
83
- if col not in df.columns:
84
- raise KeyError(f"CSV harus punya kolom '{col}'")
85
- df[col] = df[col].astype(str).str.replace("\n", " ").str.strip()
86
- return df
87
 
88
  # =========================================================
89
- # Model Architecture
90
  # =========================================================
91
  class ModelEmosi(nn.Module):
92
  def __init__(self, base_model_name, num_labels=8):
@@ -97,411 +34,229 @@ class ModelEmosi(nn.Module):
97
  self.classifier = nn.Linear(self.config.hidden_size, num_labels)
98
 
99
  def forward(self, input_ids, attention_mask):
100
- out = self.base(
101
- input_ids=input_ids,
102
- attention_mask=attention_mask
103
- )
104
  if hasattr(out, "pooler_output") and out.pooler_output is not None:
105
  x = out.pooler_output
106
  else:
107
  x = out.last_hidden_state[:, 0, :]
108
-
109
- x = self.dropout(x)
110
- return self.classifier(x)
111
-
112
- # ==============================================================
113
- # Tokenizer & Dataset
114
- # ==============================================================
115
- def tokenize_batch(texts, tokenizer, max_len=128):
116
- return tokenizer(
117
- texts,
118
- padding="max_length",
119
- truncation=True,
120
- max_length=max_len,
121
- return_tensors="pt"
122
- )
123
-
124
- def create_dataset(df, tokenizer, max_len=128):
125
- encodings = tokenize_batch(df["text"].tolist(), tokenizer, max_len)
126
-
127
- # Karena sudah dibersihkan di clean_labels, ini aman
128
- labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
129
-
130
- return TensorDataset(
131
- encodings["input_ids"],
132
- encodings["attention_mask"],
133
- labels
134
- )
135
 
136
- # ==============================================================
137
- # Weights
138
- # ==============================================================
139
- def hitung_pos_weight(df):
140
- counts = df[LIST_LABEL].sum(axis=0)
141
- N = len(df)
142
- pw = []
143
- for c in counts:
144
- pw.append((N - c) / c if c > 0 else 1.0)
145
- return torch.tensor(pw, dtype=torch.float)
146
-
147
- # ==============================================================
148
- # Save & Load Logic
149
- # ==============================================================
150
- def save_model(model, tokenizer, folder):
151
- os.makedirs(folder, exist_ok=True)
152
- model.base.save_pretrained(folder)
153
- tokenizer.save_pretrained(folder)
154
- torch.save(model.classifier.state_dict(), str(Path(folder) / "classifier_head.pt"))
155
- save_last_model(str(folder))
156
 
157
- def load_model(folder):
158
- folder = str(folder)
159
- config = AutoConfig.from_pretrained(folder)
160
- tokenizer = AutoTokenizer.from_pretrained(folder)
161
- model = ModelEmosi(folder)
162
-
163
- state = torch.load(f"{folder}/classifier_head.pt", map_location="cpu")
164
- model.classifier.load_state_dict(state)
165
- model.eval()
166
- return model, tokenizer, config
167
 
168
- # ==============================================================
169
- # TRAINING
170
- # ==============================================================
171
- def jalankan_training(
172
- df,
173
- progress_bar=None, # Tambahan untuk Gradio Progress
174
- model_name="bert-base-multilingual-cased",
175
- epochs=3,
176
- batch_size=8,
177
- lr=2e-5,
178
- max_len=128,
179
- weight_decay=0.01,
180
- warmup_ratio=0.1,
181
- patience=2,
182
- freeze_layers=6,
183
- device=None
184
- ):
185
- """
186
- Fungsi ini diubah menjadi Generator (yield) agar bisa streaming log ke UI.
187
- """
188
- # 1. Yield pesan awal
189
- yield "Mempersiapkan dataset dan tokenizer...", None
190
 
191
- device = device or ("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  tokenizer = AutoTokenizer.from_pretrained(model_name)
193
 
194
- df = df.reset_index(drop=True)
195
- full_dataset = create_dataset(df, tokenizer, max_len)
196
-
197
- idx = list(range(len(full_dataset)))
198
- train_idx, val_idx = train_test_split(idx, test_size=0.15, random_state=42)
199
-
200
- def get_subset(ds, indices):
201
- return TensorDataset(
202
- torch.stack([ds[i][0] for i in indices]),
203
- torch.stack([ds[i][1] for i in indices]),
204
- torch.stack([ds[i][2] for i in indices]),
205
- )
206
 
207
- train_ds = get_subset(full_dataset, train_idx)
208
- val_ds = get_subset(full_dataset, val_idx)
209
-
210
- train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
211
- val_loader = DataLoader(val_ds, batch_size=batch_size)
212
 
213
  model = ModelEmosi(model_name)
214
  model.to(device)
215
-
216
- # Freeze layers logic
217
- for name, param in model.base.named_parameters():
218
- if name.startswith("embeddings."):
219
- param.requires_grad = False
220
- elif name.startswith("encoder.layer"):
221
- try:
222
- layer_num = int(name.split(".")[2])
223
- if layer_num < freeze_layers:
224
- param.requires_grad = False
225
- except:
226
- pass
227
-
228
- pos_weight = hitung_pos_weight(df).to(device)
229
- loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
230
-
231
- optimizer = torch.optim.AdamW(
232
- filter(lambda p: p.requires_grad, model.parameters()),
233
- lr=lr,
234
- weight_decay=weight_decay
235
- )
236
-
237
- total_steps = len(train_loader) * epochs
238
- warmup_steps = int(warmup_ratio * total_steps)
239
-
240
- scheduler = get_linear_schedule_with_warmup(
241
- optimizer,
242
- num_warmup_steps=warmup_steps,
243
- num_training_steps=total_steps
244
- )
245
-
246
- best_val_loss = float("inf")
247
- no_improve = 0
248
- history = {"train_loss": [], "val_loss": []}
249
- save_path = str(get_model_path(model_name))
250
-
251
- yield f"Mulai Training di device: {device}\nTotal Steps: {total_steps}", None
252
 
253
- for ep in range(1, epochs+1):
254
- # Update progress bar Gradio (jika ada)
255
- if progress_bar:
256
- progress_bar(float(ep)/epochs, desc=f"Epoch {ep}/{epochs}")
257
 
258
- model.train()
259
- total_train_loss = 0
260
-
261
- # Loop batch
262
- for input_ids, mask, labels in train_loader:
263
- input_ids = input_ids.to(device)
264
- mask = mask.to(device)
265
- labels = labels.to(device)
266
-
267
  optimizer.zero_grad()
268
- logits = model(input_ids, mask)
269
- loss = loss_fn(logits, labels)
270
-
271
  loss.backward()
272
  optimizer.step()
273
- scheduler.step()
274
 
275
- total_train_loss += loss.item() * input_ids.size(0)
276
-
277
- avg_train_loss = total_train_loss / len(train_loader.dataset)
278
- history["train_loss"].append(avg_train_loss)
279
 
280
- # Validation
281
- model.eval()
282
- total_val_loss = 0
283
- with torch.no_grad():
284
- for input_ids, mask, labels in val_loader:
285
- input_ids = input_ids.to(device)
286
- mask = mask.to(device)
287
- labels = labels.to(device)
288
- logits = model(input_ids, mask)
289
- loss = loss_fn(logits, labels)
290
- total_val_loss += loss.item() * input_ids.size(0)
291
-
292
- avg_val_loss = total_val_loss / len(val_loader.dataset)
293
- history["val_loss"].append(avg_val_loss)
294
-
295
- # LOGGING MESSAGE
296
- log_msg = f"βœ… Epoch {ep} | Train Loss={avg_train_loss:.4f} | Val Loss={avg_val_loss:.4f}"
 
 
297
 
298
- if avg_val_loss < best_val_loss:
299
- best_val_loss = avg_val_loss
300
- no_improve = 0
301
- save_model(model, tokenizer, save_path)
302
- log_msg += " --> (Model Saved πŸ’Ύ)"
303
- else:
304
- no_improve += 1
305
- log_msg += f" --> (No Improve: {no_improve}/{patience})"
306
 
307
- # Yield log per epoch
308
- yield log_msg, None
 
 
 
 
 
 
 
 
 
 
309
 
310
- if no_improve >= patience:
311
- yield "β›” Early stopping triggered.", None
312
- break
313
-
314
- yield "Training Selesai! πŸŽ‰", history
315
-
316
- # ==============================================================
317
- # PREDICTION
318
- # ==============================================================
319
- def predict_satu(text, folder=None):
320
- folder = folder or load_last_model()
321
- if folder is None:
322
- return {"Error": "Belum ada model yang dilatih."}
323
-
324
- model, tokenizer, _ = load_model(folder)
325
 
326
- encoded = tokenizer(
327
- text,
328
- padding="max_length",
329
- truncation=True,
330
- max_length=128,
331
- return_tensors="pt"
332
- )
333
 
334
- with torch.no_grad():
335
- out = model(encoded["input_ids"], encoded["attention_mask"])
336
- probs = torch.sigmoid(out).numpy()[0]
337
-
338
- return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
339
-
340
- def predict_batch(text_list, folder=None, batch_size=32):
341
- folder = folder or load_last_model()
342
- if folder is None:
343
- return []
344
-
345
- model, tokenizer, _ = load_model(folder)
346
- preds = []
347
 
348
- for i in range(0, len(text_list), batch_size):
349
- batch = text_list[i:i+batch_size]
350
- encoded = tokenizer(
351
- batch,
352
- padding="max_length",
353
- truncation=True,
354
- max_length=128,
355
- return_tensors="pt"
356
- )
357
- with torch.no_grad():
358
- out = model(encoded["input_ids"], encoded["attention_mask"])
359
- probs = torch.sigmoid(out).numpy()
360
-
361
- for p in probs:
362
- preds.append({LIST_LABEL[j]: float(p[j]) for j in range(len(LIST_LABEL))})
363
-
364
- return preds
365
-
366
- def summarize_result(preds):
367
- if not preds:
368
- return {"Info": "Tidak ada hasil."}
369
 
370
- avg = {l: 0.0 for l in LIST_LABEL}
371
- n = len(preds)
372
-
373
- for p in preds:
374
- for l,v in p.items():
375
- avg[l] += v
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
- for l in avg:
378
- avg[l] /= n
 
 
379
 
380
- top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
381
- top3_fmt = [{"label":l, "score":float(s)} for l,s in top3]
382
-
383
- return {
384
- "jumlah_data": n,
385
- "distribusi_rata2": avg,
386
- "top_3": top3_fmt
387
- }
388
 
389
- # ==============================================================
390
- # GRADIO UI
391
- # ==============================================================
392
- def wrapper_training(file_obj, sep, model_name, epoch, batch, lr,
393
- max_len, wd, warmup, pat, freeze,
394
- progress=gr.Progress()): # Tambahkan progress bar object
395
-
396
- csv_path = read_file_upload(file_obj)
397
- df = pd.read_csv(csv_path, sep=sep)
398
-
399
- df = clean_labels(df)
400
- df = clean_text(df)
401
-
402
- accumulated_log = ""
403
 
404
- # Memanggil generator jalankan_training
405
- for log_msg, history_result in jalankan_training(
406
- df=df,
407
- progress_bar=progress, # Kirim progress bar ke backend
408
- model_name=model_name,
409
- epochs=int(epoch),
410
- batch_size=int(batch),
411
- lr=float(lr),
412
- max_len=int(max_len),
413
- weight_decay=float(wd),
414
- warmup_ratio=float(warmup),
415
- patience=int(pat),
416
- freeze_layers=int(freeze)
417
- ):
418
- # Update log text real-time
419
- accumulated_log += log_msg + "\n"
420
-
421
- # Jika training selesai, history_result tidak None
422
- if history_result is not None:
423
- # Yield terakhir: log penuh + JSON history
424
- yield accumulated_log, history_result
425
- else:
426
- # Yield proses: log berjalan + JSON kosong/null
427
- yield accumulated_log, None
428
-
429
- def wrapper_predict_satu(text):
430
- return predict_satu(text)
431
 
432
- def wrapper_predict_dataset(file_obj, sep, batch_size):
433
- csv_path = read_file_upload(file_obj)
434
- df = pd.read_csv(csv_path, sep=sep)
435
- df = clean_labels(df)
436
- df = clean_text(df)
437
- preds = predict_batch(df["text"].tolist(), batch_size=int(batch_size))
438
- return summarize_result(preds)
439
-
440
- # ==============================================================
441
- # INTERFACE
442
- # ==============================================================
443
- with gr.Blocks() as app:
444
- gr.Markdown("## Emotion Classifier β€” IndoBERT / Multilingual")
445
-
446
- with gr.Tab("Menu Training"):
447
- gr.Markdown("Upload dataset CSV untuk fine-tuning model.")
448
- in_file = gr.File(label="Upload File CSV")
449
- in_sep = gr.Textbox(label="Delimiter (Pemisah)", value=";")
450
-
451
- in_model = gr.Dropdown(
452
- label="Base Model",
453
- choices=["bert-base-multilingual-cased", "indobert-base-p1"],
454
- value="bert-base-multilingual-cased"
455
- )
456
-
457
- with gr.Row():
458
- in_epoch = gr.Number(label="Epochs", value=3)
459
- in_batch = gr.Number(label="Batch Size", value=8)
460
- in_lr = gr.Number(label="Learning Rate", value=2e-5)
461
-
462
- with gr.Row():
463
- in_len = gr.Number(label="Max Length", value=128)
464
- in_pat = gr.Number(label="Patience (Early Stop)", value=2)
465
- in_freeze = gr.Number(label="Freeze Layers", value=6)
466
 
467
- # Hidden advanced params
468
- in_wd = gr.Number(label="Weight Decay", value=0.01, visible=False)
469
- in_warmup = gr.Number(label="Warmup Ratio", value=0.1, visible=False)
 
 
 
 
 
470
 
471
- btn_train = gr.Button("Mulai Training", variant="primary")
472
-
473
- # OUTPUT: DUA KOLOM (Log Teks & Hasil JSON)
474
- with gr.Row():
475
- out_log = gr.Textbox(label="Log Latihan (Real-time)", lines=10, interactive=False)
476
- out_result = gr.JSON(label="Hasil Akhir (History)")
477
-
478
- btn_train.click(
479
- wrapper_training,
480
- inputs=[in_file, in_sep, in_model, in_epoch, in_batch,
481
- in_lr, in_len, in_wd, in_warmup, in_pat, in_freeze],
482
- outputs=[out_log, out_result] # Output ke dua komponen
483
- )
484
-
485
- with gr.Tab("Tes Satu Kalimat"):
486
- in_text = gr.Textbox(label="Input Teks", placeholder="Contoh: Aku senang sekali hari ini...")
487
- btn_satu = gr.Button("Prediksi")
488
- out_satu = gr.Label(label="Confidence Score")
489
-
490
- btn_satu.click(wrapper_predict_satu, inputs=[in_text], outputs=out_satu)
491
 
492
- with gr.Tab("Tes Satu File"):
493
- gr.Markdown("Upload file CSV baru untuk prediksi massal.")
494
- in_file_test = gr.File(label="Upload CSV")
495
- in_sep_test = gr.Textbox(label="Delimiter", value=";")
496
- in_bs_test = gr.Number(label="Batch Size", value=32)
497
-
498
- btn_test = gr.Button("Run Prediction")
499
- out_test = gr.JSON(label="Summary")
500
-
501
- btn_test.click(
502
- wrapper_predict_dataset,
503
- inputs=[in_file_test, in_sep_test, in_bs_test],
504
- outputs=out_test
505
- )
 
 
 
 
 
 
506
 
507
- app.launch()
 
 
 
 
1
  import os
 
2
  import torch
3
  import pandas as pd
4
  import numpy as np
5
  import gradio as gr
6
+ import zipfile
7
+ import shutil
8
  from pathlib import Path
9
  from torch import nn
10
  from torch.utils.data import DataLoader, TensorDataset
11
+ from transformers import AutoTokenizer, AutoModel, AutoConfig
 
 
 
 
 
 
12
 
13
# =========================================================
# 1. CONFIGURATION & SETUP
# =========================================================
# Multi-label emotion taxonomy; these names are also the expected label
# column names in training/testing CSVs.
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
# Two possible model sources: one trained locally on CPU by this app,
# one uploaded as a ZIP (e.g. trained on Colab GPU).
DIR_TRAINED = Path("saved_models/trained_local")
DIR_UPLOADED = Path("saved_models/uploaded_colab")

DIR_TRAINED.mkdir(parents=True, exist_ok=True)
DIR_UPLOADED.mkdir(parents=True, exist_ok=True)

# Plain-text file holding the path of whichever model directory is
# currently active for inference (written by training/upload handlers).
ACTIVE_MODEL_POINTER = "active_model_path.txt"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # =========================================================
26
+ # 2. ARSITEKTUR MODEL
27
  # =========================================================
28
  class ModelEmosi(nn.Module):
29
  def __init__(self, base_model_name, num_labels=8):
 
34
  self.classifier = nn.Linear(self.config.hidden_size, num_labels)
35
 
36
  def forward(self, input_ids, attention_mask):
37
+ out = self.base(input_ids=input_ids, attention_mask=attention_mask)
 
 
 
38
  if hasattr(out, "pooler_output") and out.pooler_output is not None:
39
  x = out.pooler_output
40
  else:
41
  x = out.last_hidden_state[:, 0, :]
42
+ return self.classifier(self.dropout(x))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
+ # =========================================================
45
+ # 3. HELPER FUNCTIONS
46
+ # =========================================================
47
def clean_data(df):
    """Normalize a raw dataframe in place and return it.

    Label columns: any missing LIST_LABEL column is created with 0; all
    label values are coerced to numeric (non-numeric -> 0.0, dtype float).
    Text column: newlines collapsed to spaces, surrounding whitespace stripped.
    """
    for label in LIST_LABEL:
        if label not in df.columns:
            df[label] = 0
        coerced = pd.to_numeric(df[label], errors='coerce')
        df[label] = coerced.fillna(0).astype(float)
    if "text" in df.columns:
        flattened = df["text"].astype(str).str.replace("\n", " ")
        df["text"] = flattened.str.strip()
    return df
 
 
 
 
 
 
 
 
 
 
54
 
55
def get_active_model_path():
    """Return the persisted active-model directory, or None.

    None is returned when the pointer file is missing or when the path it
    names no longer exists on disk (stale pointer).
    """
    if not os.path.exists(ACTIVE_MODEL_POINTER):
        return None
    with open(ACTIVE_MODEL_POINTER, "r") as f:
        candidate = f.read().strip()
    return candidate if os.path.exists(candidate) else None
 
 
 
 
61
 
62
def set_active_model_path(path):
    """Persist *path* as the model directory future inference should load."""
    Path(ACTIVE_MODEL_POINTER).write_text(str(path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ # =========================================================
67
+ # 4. LOGIKA TRAINING (CPU)
68
+ # =========================================================
69
def run_training_generator(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune the multilingual BERT classifier on CPU from an uploaded CSV.

    Generator: yields (log_text, active_model_label) pairs so Gradio can
    stream progress to the UI; the second element stays None until the
    final yield, which flips the global "active model" status textbox.

    Args:
        file_obj: Gradio file wrapper; only its .name (a filesystem path) is used.
        sep: CSV delimiter.
        epochs, batch_size, lr: hyperparameters; they arrive from gr.Number
            components, hence the int()/float() casts below.
        progress: Gradio progress tracker. The gr.Progress() default argument
            is the documented Gradio injection pattern, not an accidental
            mutable default.
    """
    yield "⏳ Membaca dataset...", None
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
    except Exception as e:
        # Surface read/parse problems in the UI log instead of crashing the app.
        yield f"❌ Error: {str(e)}", None
        return

    # Training is pinned to CPU (free Hugging Face Space has no GPU).
    device = "cpu"
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(texts):
        # Fixed-length padding keeps every batch tensor the same shape.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    encodings = tokenize_fn(df["text"].tolist())
    labels = torch.tensor(df[LIST_LABEL].values, dtype=torch.float)
    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    # NOTE(review): an empty CSV gives len(train_loader) == 0, so the
    # avg_loss division below raises ZeroDivisionError — confirm whether a
    # guard or upstream validation is wanted.
    train_loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)

    model = ModelEmosi(model_name)
    model.to(device)
    # Every parameter trains; this CPU path does no layer freezing.
    optimizer = torch.optim.AdamW(model.parameters(), lr=float(lr))
    # Multi-label objective: independent sigmoid per label via BCE-with-logits.
    loss_fn = nn.BCEWithLogitsLoss()

    log_text = f"πŸš€ Mulai Training CPU...\nData: {len(df)} baris\n"
    yield log_text, None

    model.train()
    for ep in range(int(epochs)):
        total_loss = 0
        for step, batch in enumerate(train_loader):
            b_ids, b_mask, b_lbl = batch
            optimizer.zero_grad()
            out = model(b_ids, b_mask)
            loss = loss_fn(out, b_lbl)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Throttled progress update: fraction of all steps, every 5 steps.
            if step % 5 == 0:
                progress((ep * len(train_loader) + step) / (int(epochs) * len(train_loader)))

        avg_loss = total_loss / len(train_loader)
        log_text += f"βœ… Epoch {ep+1} | Loss: {avg_loss:.4f}\n"
        yield log_text, None

    # Persist: HF-format base weights + tokenizer, plus the custom head as a
    # separate state dict, then mark this directory as the active model.
    model.base.save_pretrained(DIR_TRAINED)
    tokenizer.save_pretrained(DIR_TRAINED)
    torch.save(model.classifier.state_dict(), DIR_TRAINED / "classifier_head.pt")
    set_active_model_path(DIR_TRAINED)

    yield log_text + "\nπŸŽ‰ Selesai & Disimpan!", "Model Lokal (Baru Dilatih)"
123
+
124
+ # =========================================================
125
+ # 5. LOGIKA UPLOAD (DARI COLAB)
126
+ # =========================================================
127
def handle_zip_upload(file_obj):
    """Install a pre-trained model from an uploaded ZIP and make it active.

    Wipes DIR_UPLOADED, extracts the archive into it, flattens one wrapping
    subfolder if the ZIP has a single top-level directory, then points the
    active-model file at DIR_UPLOADED.

    Returns:
        (log_message, active_model_label) — the label is None on failure so
        the global status textbox is left unchanged.
    """
    if file_obj is None: return "❌ Tidak ada file.", None
    try:
        # Start from a clean directory so files from an earlier upload
        # cannot mix with the new model's files.
        if DIR_UPLOADED.exists(): shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir()

        # NOTE(review): extractall() on a user-supplied archive is exposed to
        # "zip slip" path traversal; consider a sanitizing extraction filter
        # (Python 3.12+: extractall(..., filter="data")).
        with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
            zip_ref.extractall(DIR_UPLOADED)

        # If the ZIP contained a single top-level folder, hoist its contents
        # so config/tokenizer files sit directly inside DIR_UPLOADED.
        files_in_dir = list(DIR_UPLOADED.iterdir())
        if len(files_in_dir) == 1 and files_in_dir[0].is_dir():
            subfolder = files_in_dir[0]
            for item in subfolder.iterdir():
                shutil.move(str(item), str(DIR_UPLOADED))
            subfolder.rmdir()

        set_active_model_path(DIR_UPLOADED)
        return f"βœ… Model berhasil dimuat dari ZIP!\nLokasi: {DIR_UPLOADED}", "Model Upload (Dari Colab)"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
148
 
149
+ # =========================================================
150
+ # 6. LOGIKA PREDIKSI
151
+ # =========================================================
152
def load_model_inference():
    """Load the currently active model and tokenizer for CPU inference.

    Returns:
        (model, tokenizer) with the model in eval mode.

    Raises:
        ValueError: if no active model has been trained/uploaded yet, or the
            recorded path no longer exists.
    """
    path = get_active_model_path()
    if not path:
        raise ValueError("Belum ada model aktif.")

    path = Path(path)
    # Fix: dropped the unused `config = AutoConfig.from_pretrained(path)`
    # local — ModelEmosi loads the config itself from the same directory.
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = ModelEmosi(path)

    # The classification head is stored as a separate state dict; an upload
    # without one leaves the head randomly initialized (predictions will be
    # meaningless but loading still succeeds).
    head_path = path / "classifier_head.pt"
    if head_path.exists():
        model.classifier.load_state_dict(torch.load(head_path, map_location="cpu"))

    model.eval()
    return model, tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
def predict_text(text):
    """Score one sentence against all emotion labels.

    Returns {label: probability} on success, {"Error": message} when the
    model cannot be loaded or inference fails, and None for empty input.
    """
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        with torch.no_grad():
            logits = model(encoded["input_ids"], encoded["attention_mask"])
        scores = torch.sigmoid(logits).numpy()[0]
        return {label: float(score) for label, score in zip(LIST_LABEL, scores)}
    except Exception as e:
        return {"Error": str(e)}
179
+
180
def predict_csv(file_obj, sep):
    """Predict emotions for every row of an uploaded CSV and summarize.

    Runs the active model over each row's text (one at a time, CPU), then
    returns total count, the top-3 average emotions, and the full per-label
    average distribution. Any failure is returned as {"Error": message}.

    Args:
        file_obj: Gradio file wrapper; only .name (a path) is used.
        sep: CSV delimiter.
    """
    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        model, tokenizer = load_model_inference()
        results = []
        for txt in df["text"]:
            inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
            with torch.no_grad():
                out = model(inputs["input_ids"], inputs["attention_mask"])
            probs = torch.sigmoid(out).numpy()[0]
            results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})

        # Fix: an empty CSV previously fell through to `avg[l] /= len(results)`
        # and surfaced as an opaque "division by zero" error — fail explicitly.
        if not results:
            return {"Error": "CSV kosong: tidak ada baris untuk diprediksi."}

        # Average probability per label across all rows.
        avg = {l: 0.0 for l in LIST_LABEL}
        for r in results:
            for l, v in r.items():
                avg[l] += v
        for l in avg:
            avg[l] /= len(results)

        top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
        return {"Total Data": len(results), "Top 3 Emosi": {k: round(v, 4) for k, v in top3}, "Rata-rata": avg}
    except Exception as e:
        return {"Error": str(e)}
 
 
 
 
202
 
203
# =========================================================
# 7. UI LAYOUT
# =========================================================
with gr.Blocks(title="Emotion AI Manager") as app:
    gr.Markdown("# 🎭 AI Emotion Classifier System")

    # Global status bar: shows which saved model inference will use.
    # Updated by both the training and the ZIP-upload handlers.
    lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model yang dipilih.", interactive=False)

    # MAIN TAB 1: model setup (upload a pretrained model, or train on CPU)
    with gr.Tab("βš™οΈ Pelatihan & Model"):
        with gr.Tabs():

            # Sub-tab 1.1: upload a ready-made model ZIP (recommended path).
            with gr.Tab("πŸ“‚ Upload Pretrained (Recommended)"):
                gr.Markdown("Gunakan model hasil training GPU (Colab) agar cepat.")
                in_zip = gr.File(label="Upload File .zip Model", file_types=[".zip"])
                btn_upload = gr.Button("Ekstrak & Aktifkan Model", variant="primary")
                out_log_upload = gr.Textbox(label="Log Sistem")

                btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])

            # Sub-tab 1.2: slow in-Space CPU training for small datasets.
            with gr.Tab("πŸ‹οΈβ€β™€οΈ Latihan Manual (CPU)"):
                gr.Markdown("⚠️ Lambat di Hugging Face Space. Gunakan data kecil saja.")
                with gr.Row():
                    in_csv = gr.File(label="Dataset CSV")
                    in_sep = gr.Textbox(label="Separator", value=";")
                with gr.Row():
                    in_ep = gr.Number(label="Epoch", value=1)
                    in_bs = gr.Number(label="Batch", value=4)
                    in_lr = gr.Number(label="LR", value=2e-5)

                btn_train = gr.Button("Mulai Latihan")
                out_log_train = gr.Textbox(label="Log Training", lines=6)

                # run_training_generator is a generator, so the log textbox
                # streams updates while training runs.
                btn_train.click(run_training_generator, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_active_model])

    # MAIN TAB 2: testing the active model
    with gr.Tab("πŸ§ͺ Pengujian (Testing)"):
        with gr.Tabs():

            # Sub-tab 2.1: single-sentence prediction.
            with gr.Tab("πŸ“ Uji Tunggal (Teks)"):
                in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...")
                btn_pred_txt = gr.Button("Prediksi Emosi")
                out_lbl = gr.Label(label="Confidence Score")

                btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)

            # Sub-tab 2.2: batch prediction over an uploaded CSV.
            with gr.Tab("πŸ“Š Uji Batch (CSV)"):
                in_csv_test = gr.File(label="Upload CSV Test")
                in_sep_test = gr.Textbox(label="Separator", value=";")
                btn_pred_csv = gr.Button("Analisis Batch")
                out_json = gr.JSON(label="Hasil Analisis")

                btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)

# queue() is required for generator-backed events (streaming training logs).
app.queue().launch()