Darendra committed on
Commit
54584f7
Β·
verified Β·
1 Parent(s): 02adcda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -103
app.py CHANGED
@@ -1,203 +1,288 @@
1
  import os
2
  import torch
3
  import pandas as pd
4
- import numpy as np
5
  import gradio as gr
6
- import zipfile
7
  import shutil
8
- import sys
9
  from pathlib import Path
10
- from torch import nn
11
- from torch.utils.data import DataLoader, TensorDataset
12
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
13
 
14
  # =========================================================
15
- # 1. KONFIGURASI & SETUP
16
  # =========================================================
17
-
18
  LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
19
 
20
- # Setup Path
21
- def get_root_path():
22
- if getattr(sys, 'frozen', False):
23
- return Path(sys.executable).parent
24
- else:
25
- return Path(__file__).parent
26
-
27
- BASE_DIR = get_root_path()
28
- DIR_TRAINED = BASE_DIR / "saved_models" / "trained_local"
29
- DIR_UPLOADED = BASE_DIR / "saved_models" / "uploaded_colab"
30
- ACTIVE_MODEL_POINTER = BASE_DIR / "active_model_path.txt"
31
 
32
- DIR_TRAINED.mkdir(parents=True, exist_ok=True)
33
  DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
 
 
 
 
34
 
35
  # =========================================================
36
- # 2. HELPER FUNCTIONS
37
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def clean_data(df):
39
- # Cek kolom label dan tipenya
40
  for l in LIST_LABEL:
41
  if l not in df.columns: df[l] = 0
42
- # Fix format koma (1,00 -> 1.00)
43
  df[l] = df[l].astype(str).str.replace(',', '.', regex=False)
44
  df[l] = pd.to_numeric(df[l], errors='coerce').fillna(0).astype(float)
45
 
46
- # Bersihkan teks
47
  col_text = next((c for c in df.columns if c.lower() in ['text', 'kalimat', 'content', 'tweet']), None)
48
  if col_text:
49
  df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
50
  elif "text" in df.columns:
51
  df["text_clean"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
52
-
53
  return df
54
 
55
- def get_active_model_path():
56
- if os.path.exists(ACTIVE_MODEL_POINTER):
57
- with open(ACTIVE_MODEL_POINTER, "r") as f:
58
- path = f.read().strip()
59
- if os.path.exists(path): return path
60
- return None
61
-
62
- def set_active_model_path(path):
63
- with open(ACTIVE_MODEL_POINTER, "w") as f:
64
- f.write(str(path))
65
-
66
  # =========================================================
67
- # 3. LOGIKA UPLOAD
68
  # =========================================================
69
  def handle_zip_upload(file_obj):
 
 
70
  if file_obj is None: return "❌ Tidak ada file.", None
71
  try:
72
- # Bersihkan folder lama
73
  if DIR_UPLOADED.exists(): shutil.rmtree(DIR_UPLOADED)
74
  DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
75
 
76
  with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
77
  zip_ref.extractall(DIR_UPLOADED)
78
-
79
- # Handle jika zip membungkus folder (nested folder)
80
- # Cari file config.json untuk menentukan root folder model
81
- config_path = list(DIR_UPLOADED.rglob("config.json"))
82
 
83
- if not config_path:
84
- return "❌ Error: Tidak ditemukan config.json di dalam zip.", None
85
-
 
 
86
  final_model_path = config_path[0].parent
 
87
 
88
- # Simpan path yang valid
89
- set_active_model_path(final_model_path)
90
- return f"βœ… Model berhasil dimuat!\nLokasi: {final_model_path}", "Model Upload (Siap)"
91
  except Exception as e:
92
  return f"❌ Error unzip: {str(e)}", None
93
 
94
  # =========================================================
95
- # 4. LOGIKA PREDIKSI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # =========================================================
97
  def load_model_inference():
98
- path = get_active_model_path()
99
- if not path: raise ValueError("Belum ada model aktif. Upload dulu!")
 
 
 
100
 
101
- path = Path(path)
 
 
 
 
 
 
 
 
102
  try:
103
- tokenizer = AutoTokenizer.from_pretrained(str(path))
104
- model = AutoModelForSequenceClassification.from_pretrained(str(path))
105
  model.eval()
106
  return model, tokenizer
107
- except Exception as e:
108
- raise ValueError(f"Gagal load model: {e}")
 
109
 
110
  def predict_text(text):
111
  if not text: return None
112
  try:
113
  model, tokenizer = load_model_inference()
114
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
115
-
116
  with torch.no_grad():
117
  out = model(**inputs)
118
  probs = torch.sigmoid(out.logits).numpy()[0]
119
-
120
  return {LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))}
121
  except Exception as e:
122
  return {"Error": str(e)}
123
 
124
  def predict_csv(file_obj, sep):
125
  try:
126
- # Cek separator
127
- try:
128
- df = pd.read_csv(file_obj.name, sep=sep)
129
- except:
130
- df = pd.read_csv(file_obj.name, sep=",")
131
-
132
  df = clean_data(df)
 
133
  model, tokenizer = load_model_inference()
 
134
 
135
  results = []
136
- # Cek kolom text
137
- if "text_clean" not in df.columns: return {"Error": "Kolom teks tidak ditemukan"}
138
-
139
  for txt in df["text_clean"]:
140
  inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
141
  with torch.no_grad():
142
  out = model(**inputs)
143
  probs = torch.sigmoid(out.logits).numpy()[0]
144
-
145
  results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})
146
 
147
- # Hitung statistik
148
  avg = {l: 0.0 for l in LIST_LABEL}
149
  for r in results:
150
  for l,v in r.items(): avg[l] += v
151
  for l in avg: avg[l] /= len(results)
152
-
153
  top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
154
-
155
- return {
156
- "Total Data": len(results),
157
- "Top 3 Emosi Dominan": {k: round(v,4) for k,v in top3},
158
- "Rata-rata Skor": avg
159
- }
160
  except Exception as e:
161
  return {"Error": str(e)}
162
 
163
  # =========================================================
164
- # 5. TAMPILAN ANTARMUKA (UI GRADIO)
165
  # =========================================================
166
- with gr.Blocks(title="Emotion AI Manager") as app:
167
- gr.Markdown("# 🧠 AI Emotion Classifier System")
168
 
169
- # Status Bar Global
170
- lbl_active_model = gr.Textbox(label="Status Model Aktif", value="Belum ada model.", interactive=False)
171
 
172
  with gr.Tabs():
173
- # TAB 1: UPLOAD
174
- with gr.Tab("πŸ“‚ Upload Model"):
175
- gr.Markdown("Upload file `.zip` model hasil training.")
176
- in_zip = gr.File(label="Upload File .zip", file_types=[".zip"])
177
- btn_upload = gr.Button("Ekstrak & Aktifkan", variant="primary")
178
- out_log_upload = gr.Textbox(label="Log Sistem")
179
-
180
- btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_active_model])
181
-
182
- # TAB 2: PENGUJIAN
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  with gr.Tab("πŸ§ͺ Testing"):
 
 
184
  with gr.Tabs():
185
- # Sub-Tab 2.1: Uji Tunggal
186
- with gr.Tab("πŸ“ Uji Tunggal"):
187
- in_txt = gr.Textbox(label="Masukkan Kalimat", placeholder="Saya merasa...", lines=2)
188
- btn_pred_txt = gr.Button("Prediksi", variant="primary")
189
  out_lbl = gr.Label(label="Hasil Prediksi")
190
-
191
- btn_pred_txt.click(predict_text, inputs=in_txt, outputs=out_lbl)
192
 
193
- # Sub-Tab 2.2: Uji Batch
194
  with gr.Tab("πŸ“Š Uji Batch (CSV)"):
195
  in_csv_test = gr.File(label="Upload CSV Test")
196
- in_sep_test = gr.Textbox(label="Separator", value=";")
197
- btn_pred_csv = gr.Button("Analisis Batch")
198
  out_json = gr.JSON(label="Hasil Analisis")
199
-
200
- btn_pred_csv.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)
201
 
202
  if __name__ == "__main__":
203
- app.queue().launch()
 
import os
import torch
import pandas as pd
import gradio as gr
import shutil
import zipfile
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# =========================================================
# 1. CONFIGURATION & VARIABLES
# =========================================================
# Emotion labels; column order here defines the output order of the classifier head.
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']

# Temporary storage folders (relative to the working directory):
# uploaded_zip holds extracted user-uploaded model ZIPs, trained_cloud holds
# checkpoints produced by in-app training.
DIR_UPLOADED = Path("temp_models/uploaded_zip")
DIR_TRAINED = Path("temp_models/trained_cloud")

DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
DIR_TRAINED.mkdir(parents=True, exist_ok=True)

# Global variable holding the filesystem path of the currently active model
# (set by handle_zip_upload / train_model_cloud; None means "no custom model yet").
active_model_path = None
26
 
27
  # =========================================================
28
+ # 2. HELPER & DATASET
29
  # =========================================================
30
class EmosiDataset(Dataset):
    """Torch dataset pairing cleaned text with multi-label emotion targets.

    Expects a dataframe that already went through clean_data(): it must have a
    "text_clean" column plus one float column per entry of LIST_LABEL.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Pre-extract targets and texts once so __getitem__ stays cheap.
        self.labels = df[LIST_LABEL].values
        self.texts = df["text_clean"].astype(str).tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        encoded = self.tokenizer(
            self.texts[item],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # Tokenizer returns (1, max_len) tensors; flatten to (max_len,) so the
        # DataLoader batches them into (batch, max_len).
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.float),
        }
        return sample
55
+
56
def clean_data(df):
    """Normalize a raw dataframe in place for training/inference.

    Ensures every emotion column in LIST_LABEL exists and is numeric
    (decimal commas such as "1,00" become "1.00"; unparsable values become
    0.0), and derives a whitespace-normalized "text_clean" column from the
    first recognized text column.

    Args:
        df: pandas DataFrame; mutated in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    for label in LIST_LABEL:
        if label not in df.columns:
            df[label] = 0
        # Accept decimal commas (e.g. "1,00") before numeric coercion.
        df[label] = df[label].astype(str).str.replace(',', '.', regex=False)
        df[label] = pd.to_numeric(df[label], errors='coerce').fillna(0).astype(float)

    # First column whose name looks like free text (case-insensitive match).
    # This already matches a literal "text" column, so the previous separate
    # `elif "text" in df.columns` fallback was unreachable and has been removed.
    col_text = next((c for c in df.columns if c.lower() in ['text', 'kalimat', 'content', 'tweet']), None)
    if col_text is not None:
        df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
    return df
68
 
 
 
 
 
 
 
 
 
 
 
 
69
  # =========================================================
70
+ # 3. UPLOAD ZIP
71
  # =========================================================
72
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP and make it the active model.

    Returns a (log message, status label) pair for the two Gradio outputs;
    the status output is left as None on failure so it is not overwritten.
    """
    global active_model_path

    if file_obj is None:
        return "❌ Tidak ada file.", None

    try:
        # Start from an empty extraction folder every time.
        if DIR_UPLOADED.exists():
            shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(file_obj.name, 'r') as archive:
            archive.extractall(DIR_UPLOADED)

        # An HF-style model folder is identified by its config.json; search
        # recursively in case the ZIP wraps the model in a nested directory.
        found = list(DIR_UPLOADED.rglob("config.json"))
        if not found:
            return "❌ Error: Tidak ditemukan config.json dalam ZIP.", None

        active_model_path = str(found[0].parent)
        return f"✅ Model ZIP Berhasil Dimuat!\nLokasi: {active_model_path}", "Status: Memakai Model Upload ZIP"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None
94
 
95
  # =========================================================
96
+ # 4. TRAINING CLOUD
97
+ # =========================================================
98
def train_model_cloud(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune IndoBERT on an uploaded CSV and activate the result.

    Generator wired to a Gradio button: each `yield` is a
    (training log text, status label) pair streamed to the UI; the status
    element is left untouched (None) until training succeeds.
    NOTE(review): `progress=gr.Progress()` as a default argument is the
    documented Gradio progress-tracking idiom, not an accidental mutable
    default — confirm against the installed Gradio version.

    Args:
        file_obj:   Gradio file wrapper for the training CSV (has `.name`).
        sep:        CSV column separator.
        epochs:     number of training epochs (coerced via int()).
        batch_size: DataLoader batch size (coerced via int()).
        lr:         AdamW learning rate (coerced via float()).
        progress:   Gradio progress tracker for the per-step progress bar.
    """
    global active_model_path

    yield "⏳ Membaca dataset...", None
    if file_obj is None:
        yield "❌ File CSV belum diupload!", None
        return

    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        if "text_clean" not in df.columns:
            # clean_data() found no recognizable text column.
            yield "❌ Kolom teks tidak ditemukan.", None
            return

        # Fresh base checkpoint with a multi-label head sized to LIST_LABEL.
        MODEL_NAME = "indobenchmark/indobert-base-p1"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=len(LIST_LABEL), problem_type="multi_label_classification"
        )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        dataset = EmosiDataset(df, tokenizer)
        loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)
        optimizer = AdamW(model.parameters(), lr=float(lr))

        log_text = f"🚀 Mulai Training di {device}...\nData: {len(df)} baris.\n"
        yield log_text, None

        # Standard supervised fine-tuning loop; the model computes its own
        # multi-label (BCE) loss because `labels` is passed to forward().
        model.train()
        for ep in range(int(epochs)):
            total_loss = 0
            steps = len(loader)
            for i, batch in enumerate(loader):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                # Update the progress bar every 5 steps with the running mean loss.
                if i % 5 == 0:
                    progress((ep * steps + i) / (int(epochs) * steps), desc=f"Ep {ep+1} Loss: {total_loss/(i+1):.4f}")

            avg_loss = total_loss / steps
            log_text += f"✅ Epoch {ep+1}/{epochs} | Loss: {avg_loss:.4f}\n"
            yield log_text, None

        # Save the fine-tuned model, replacing any previous training output.
        yield log_text + "\n💾 Menyimpan model...", None
        if DIR_TRAINED.exists(): shutil.rmtree(DIR_TRAINED)
        DIR_TRAINED.mkdir(parents=True, exist_ok=True)

        model.save_pretrained(DIR_TRAINED)
        tokenizer.save_pretrained(DIR_TRAINED)

        # Point inference at the freshly trained checkpoint.
        active_model_path = str(DIR_TRAINED)
        yield log_text + f"\n🎉 Selesai! Model training aktif.", "Status: Memakai Model Hasil Training"

    except Exception as e:
        # Surface any failure (bad CSV, OOM, download error) in the UI log.
        yield f"❌ Error: {str(e)}", None
165
+
166
+ # =========================================================
167
+ # 5. LOAD & PREDIKSI
168
  # =========================================================
169
def _load_base_model():
    """Download and return the base IndoBERT checkpoint as (model, tokenizer)."""
    # num_labels=8 matches len(LIST_LABEL); the head is randomly initialized,
    # so this fallback gives usable (but untrained) scores.
    model = AutoModelForSequenceClassification.from_pretrained(
        "indobenchmark/indobert-base-p1", num_labels=8
    )
    model.eval()  # disable dropout so fallback inference is deterministic
    tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
    return model, tokenizer

def load_model_inference():
    """Resolve and load the model/tokenizer to use for inference.

    Priority:
      1. `active_model_path` (set by a ZIP upload or a cloud training run)
      2. a manually uploaded `model_default/` folder next to the app
      3. the base IndoBERT checkpoint downloaded from the Hub

    Returns:
        (model, tokenizer) with the model in eval mode.
    """
    global active_model_path

    if active_model_path and os.path.exists(active_model_path):
        target_path = active_model_path
    elif os.path.exists("model_default") and os.path.exists("model_default/config.json"):
        target_path = "model_default"
    else:
        return _load_base_model()

    try:
        tokenizer = AutoTokenizer.from_pretrained(target_path)
        model = AutoModelForSequenceClassification.from_pretrained(target_path)
        model.eval()
        return model, tokenizer
    except Exception:
        # Corrupt or incomplete local model folder — fall back to the base
        # checkpoint instead of crashing the UI. (Was a bare `except:`, which
        # also swallowed KeyboardInterrupt/SystemExit.)
        return _load_base_model()
193
 
194
def predict_text(text):
    """Score a single sentence against all emotion labels.

    Returns a label->probability dict, None for empty input, or an
    {"Error": message} dict when inference fails.
    """
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        with torch.no_grad():
            logits = model(**encoded).logits
            # Independent sigmoids: multi-label, so scores need not sum to 1.
            scores = torch.sigmoid(logits).numpy()[0]
        return {label: float(scores[i]) for i, label in enumerate(LIST_LABEL)}
    except Exception as e:
        return {"Error": str(e)}
205
 
206
def predict_csv(file_obj, sep):
    """Run batch inference over a CSV and summarize the label scores.

    Args:
        file_obj: Gradio file wrapper for the CSV (has `.name`).
        sep:      preferred column separator; falls back to "," on parse failure.

    Returns:
        {"Info": row count, "Dominan": top-3 average labels, "Detail": all
        averages} on success, or {"Error": message} on any failure.
    """
    try:
        # Try the user-supplied separator first, then fall back to a comma.
        try:
            df = pd.read_csv(file_obj.name, sep=sep)
        except Exception:  # was a bare `except:`, which hid Ctrl-C too
            df = pd.read_csv(file_obj.name, sep=",")
        df = clean_data(df)

        # Validate the dataframe BEFORE paying for the model load.
        if "text_clean" not in df.columns:
            return {"Error": "Kolom teks tidak ditemukan"}
        if df.empty:
            # Previously this fell through to a cryptic "division by zero".
            return {"Error": "CSV kosong"}

        model, tokenizer = load_model_inference()

        results = []
        for txt in df["text_clean"]:
            inputs = tokenizer(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
            with torch.no_grad():
                out = model(**inputs)
                probs = torch.sigmoid(out.logits).numpy()[0]
            results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})

        # Average each label across rows, then report the three dominant ones.
        avg = {label: 0.0 for label in LIST_LABEL}
        for row_scores in results:
            for label, score in row_scores.items():
                avg[label] += score
        for label in avg:
            avg[label] /= len(results)
        top3 = sorted(avg.items(), key=lambda kv: kv[1], reverse=True)[:3]

        return {"Info": f"Total {len(results)} data", "Dominan": {k: round(v, 4) for k, v in top3}, "Detail": avg}
    except Exception as e:
        return {"Error": str(e)}
231
 
232
  # =========================================================
233
+ # 6. UI GRADIO
234
  # =========================================================
235
# Gradio UI: one status bar plus tabs for model configuration (upload/train)
# and testing (single sentence / batch CSV).
with gr.Blocks(title="IndoBERT Emotion Cloud") as app:
    gr.Markdown("# ☁️ IndoBERT Emotion Classifier")

    # Global status label updated by upload/training handlers.
    lbl_status = gr.Textbox(label="Status Model Aktif", value="Default (IndoBERT Base / Uploaded Manual)", interactive=False)

    with gr.Tabs():
        # === TAB 1: MODEL CONFIGURATION ===
        with gr.Tab("⚙️ Konfigurasi Model"):
            with gr.Tabs():

                # --- Sub tab 1: upload a pre-trained model ZIP ---
                with gr.Tab("📂 Unggah Model"):
                    gr.Markdown("Upload file `.zip` berisi model yang sudah dilatih (dari Komputer).")
                    in_zip = gr.File(label="File ZIP Model")
                    btn_upload = gr.Button("Ekstrak & Pakai Model", variant="primary")
                    out_log_upload = gr.Textbox(label="Log Sistem")

                    btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_status])

                # --- Sub tab 2: train in the cloud from a CSV ---
                with gr.Tab("🏋️‍♀️ Latih Model"):
                    gr.Markdown("Latih model baru menggunakan Dataset CSV sendiri di Cloud.")
                    with gr.Row():
                        in_csv = gr.File(label="Dataset CSV")
                        in_sep = gr.Textbox(label="Separator", value=";")
                    with gr.Row():
                        in_ep = gr.Number(label="Epoch", value=1, precision=0)
                        in_bs = gr.Number(label="Batch Size", value=4, precision=0)
                        in_lr = gr.Number(label="Learning Rate", value=2e-5)
                    btn_train = gr.Button("Mulai Training", variant="stop")
                    out_log_train = gr.Textbox(label="Log Training", lines=5)

                    btn_train.click(train_model_cloud, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_status])

        # === TAB 2: TESTING ===
        with gr.Tab("🧪 Testing"):
            gr.Markdown("Uji model yang sedang aktif.")

            with gr.Tabs():
                with gr.Tab("📝 Uji Satu Kalimat"):
                    in_txt = gr.Textbox(label="Masukkan Kalimat", lines=2, placeholder="Contoh: Saya sangat bahagia hari ini...")
                    btn_pred = gr.Button("Prediksi Emosi")
                    out_lbl = gr.Label(label="Hasil Prediksi")
                    btn_pred.click(predict_text, inputs=in_txt, outputs=out_lbl)

                with gr.Tab("📊 Uji Batch (CSV)"):
                    in_csv_test = gr.File(label="Upload CSV Test")
                    # Fix: previously this tab reused `in_sep` from the
                    # training tab, silently coupling the batch separator to a
                    # field on a different tab. Give it its own input.
                    in_sep_test = gr.Textbox(label="Separator", value=";")
                    btn_batch = gr.Button("Analisis Batch")
                    out_json = gr.JSON(label="Hasil Analisis")
                    btn_batch.click(predict_csv, inputs=[in_csv_test, in_sep_test], outputs=out_json)

if __name__ == "__main__":
    # NOTE(review): train_model_cloud is a generator; streaming its yields
    # requires the Gradio queue, which recent Gradio enables by default on
    # launch() — confirm against the deployed Gradio version.
    app.launch()