File size: 12,085 Bytes
062b593
 
 
 
74dd21d
54584f7
062b593
54584f7
 
 
d0cc31c
 
54584f7
d0cc31c
02adcda
062b593
54584f7
 
 
d0cc31c
02adcda
54584f7
 
 
 
d0cc31c
74dd21d
54584f7
74dd21d
54584f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74dd21d
 
 
02adcda
74dd21d
dafa625
02adcda
 
 
 
 
74dd21d
d0cc31c
74dd21d
54584f7
74dd21d
 
54584f7
 
74dd21d
 
 
02adcda
1a47d90
74dd21d
 
02adcda
54584f7
 
 
 
 
02adcda
54584f7
02adcda
54584f7
74dd21d
 
2faddd5
74dd21d
54584f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74dd21d
 
54584f7
 
 
 
 
1a47d90
54584f7
 
 
 
 
 
 
 
 
02adcda
54584f7
 
dafa625
02adcda
54584f7
 
 
d0cc31c
74dd21d
 
 
 
 
 
02adcda
 
74dd21d
 
 
 
 
 
54584f7
 
74dd21d
54584f7
74dd21d
54584f7
dafa625
74dd21d
02adcda
74dd21d
 
02adcda
 
74dd21d
1a47d90
74dd21d
 
 
 
 
54584f7
74dd21d
 
d0cc31c
74dd21d
54584f7
74dd21d
54584f7
 
2faddd5
54584f7
 
02adcda
 
54584f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02adcda
54584f7
 
02adcda
54584f7
 
 
02adcda
54584f7
74dd21d
02adcda
 
54584f7
02adcda
54584f7
02adcda
 
54584f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import os
import torch
import pandas as pd
import gradio as gr
import shutil
import zipfile
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# =========================================================
# 1. KONFIGURASI & VARIABEL
# =========================================================
# Emotion labels; index order must match the model's 8-way output head.
LIST_LABEL = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']

# Temporary storage folders: extracted uploaded ZIPs and freshly trained models.
DIR_UPLOADED = Path("temp_models/uploaded_zip")
DIR_TRAINED = Path("temp_models/trained_cloud")

DIR_UPLOADED.mkdir(parents=True, exist_ok=True)
DIR_TRAINED.mkdir(parents=True, exist_ok=True)

# Global: filesystem path of the currently active model (None = use default/base).
active_model_path = None 

# =========================================================
# 2. HELPER & DATASET
# =========================================================
class EmosiDataset(Dataset):
    """Torch Dataset pairing cleaned texts with multi-label emotion scores.

    Expects ``df`` to carry a ``text_clean`` column plus one float column
    per label in ``LIST_LABEL``. Each item is tokenized on the fly to a
    fixed ``max_len`` so batches stack without a collate function.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Extract the label matrix and text list once, up front.
        self.labels = df[LIST_LABEL].values
        self.texts = df["text_clean"].astype(str).tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # Drop the batch dimension the tokenizer adds with return_tensors='pt'.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def clean_data(df, labels=('anger', 'anticipation', 'disgust', 'fear',
                           'joy', 'sadness', 'surprise', 'trust')):
    """Normalize label columns to floats and derive a cleaned text column.

    For every name in ``labels``: create the column (filled with 0) if it is
    missing, accept decimal commas ("0,5" -> 0.5), and coerce anything
    non-numeric to 0.0. Then locate the text column case-insensitively among
    common names and store a whitespace-normalized copy as ``text_clean``.

    Args:
        df: input DataFrame (mutated in place and also returned).
        labels: label column names; defaults to the module's emotion set,
            so existing single-argument callers are unaffected.

    Returns:
        The same DataFrame, with float label columns and (when a text
        column was found) a ``text_clean`` column.
    """
    for label in labels:
        if label not in df.columns:
            df[label] = 0
        # Replace decimal commas first, then coerce; bad values become 0.0.
        df[label] = df[label].astype(str).str.replace(',', '.', regex=False)
        df[label] = pd.to_numeric(df[label], errors='coerce').fillna(0).astype(float)

    # Case-insensitive search covers "text" too, so no separate fallback is
    # needed (the old `elif "text" in df.columns` branch was unreachable).
    col_text = next((c for c in df.columns
                     if c.lower() in ('text', 'kalimat', 'content', 'tweet')), None)
    if col_text is not None:
        df["text_clean"] = df[col_text].astype(str).str.replace("\n", " ").str.strip()
    return df

# =========================================================
# 3. UPLOAD ZIP
# =========================================================
def handle_zip_upload(file_obj):
    """Extract an uploaded model ZIP and mark it as the active model.

    Args:
        file_obj: Gradio file object (exposes a ``.name`` path) or None.

    Returns:
        (log message, status label) pair for the UI; the status label is
        None when nothing was loaded.
    """
    global active_model_path

    if file_obj is None:
        return "❌ Tidak ada file.", None
    try:
        # Start from an empty extraction directory each time.
        if DIR_UPLOADED.exists():
            shutil.rmtree(DIR_UPLOADED)
        DIR_UPLOADED.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(file_obj.name, 'r') as archive:
            archive.extractall(DIR_UPLOADED)

        # The model root is wherever config.json lives inside the archive.
        configs = list(DIR_UPLOADED.rglob("config.json"))
        if not configs:
            return "❌ Error: Tidak ditemukan config.json dalam ZIP.", None

        active_model_path = str(configs[0].parent)
        return f"βœ… Model ZIP Berhasil Dimuat!\nLokasi: {active_model_path}", "Status: Memakai Model Upload ZIP"
    except Exception as e:
        return f"❌ Error unzip: {str(e)}", None

# =========================================================
# 4. TRAINING CLOUD
# =========================================================
def train_model_cloud(file_obj, sep, epochs, batch_size, lr, progress=gr.Progress()):
    """Fine-tune IndoBERT for multi-label emotion classification on a CSV.

    Generator used as a Gradio event handler: each ``yield`` streams a
    (training log, status label) pair to the UI. On success the model and
    tokenizer are saved under DIR_TRAINED and ``active_model_path`` is
    switched to it so subsequent predictions use the new weights.

    Args:
        file_obj: uploaded CSV file object (exposes a ``.name`` path) or None.
        sep: CSV column separator.
        epochs: number of training epochs (coerced to int).
        batch_size: DataLoader batch size (coerced to int).
        lr: AdamW learning rate (coerced to float).
        progress: Gradio progress tracker. NOTE: ``gr.Progress()`` as a
            default is the documented Gradio injection pattern, not an
            accidental mutable default.
    """
    global active_model_path
    
    yield "⏳ Membaca dataset...", None
    if file_obj is None:
        yield "❌ File CSV belum diupload!", None
        return

    try:
        df = pd.read_csv(file_obj.name, sep=sep)
        df = clean_data(df)
        if "text_clean" not in df.columns:
            yield "❌ Kolom teks tidak ditemukan.", None
            return

        MODEL_NAME = "indobenchmark/indobert-base-p1"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # problem_type selects BCE-with-logits loss for multi-label training.
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=len(LIST_LABEL), problem_type="multi_label_classification"
        )
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        dataset = EmosiDataset(df, tokenizer)
        loader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True)
        optimizer = AdamW(model.parameters(), lr=float(lr))
        
        log_text = f"πŸš€ Mulai Training di {device}...\nData: {len(df)} baris.\n"
        yield log_text, None
        
        model.train()
        for ep in range(int(epochs)):
            total_loss = 0
            steps = len(loader)
            for i, batch in enumerate(loader):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                # Refresh the progress bar every 5 batches with the running mean loss.
                if i % 5 == 0: 
                    progress((ep * steps + i) / (int(epochs) * steps), desc=f"Ep {ep+1} Loss: {total_loss/(i+1):.4f}")
            
            avg_loss = total_loss / steps
            log_text += f"βœ… Epoch {ep+1}/{epochs} | Loss: {avg_loss:.4f}\n"
            yield log_text, None
        
        # Save the fine-tuned model, replacing any previous training output.
        yield log_text + "\nπŸ’Ύ Menyimpan model...", None
        if DIR_TRAINED.exists(): shutil.rmtree(DIR_TRAINED)
        DIR_TRAINED.mkdir(parents=True, exist_ok=True)
        
        model.save_pretrained(DIR_TRAINED)
        tokenizer.save_pretrained(DIR_TRAINED)
        
        # Point inference at the freshly trained weights.
        active_model_path = str(DIR_TRAINED)
        yield log_text + f"\nπŸŽ‰ Selesai! Model training aktif.", "Status: Memakai Model Hasil Training"
        
    except Exception as e:
        yield f"❌ Error: {str(e)}", None

# =========================================================
# 5. LOAD & PREDIKSI
# =========================================================
def load_model_inference():
    """Resolve and load the model/tokenizer pair to use for inference.

    Resolution order:
      1. ``active_model_path`` — model uploaded or trained in this session;
      2. a manually provisioned ``model_default`` folder (with config.json);
      3. the base IndoBERT checkpoint downloaded from the Hub (untrained
         8-label head).

    Returns:
        (model, tokenizer) tuple; the model is put in eval mode when it
        was loaded from a local path.
    """
    def _base_pair():
        # Last-resort fallback: base checkpoint with a fresh 8-label head.
        return (AutoModelForSequenceClassification.from_pretrained(
                    "indobenchmark/indobert-base-p1", num_labels=8),
                AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1"))

    # Priority 1: model activated by the upload/training handlers.
    if active_model_path and os.path.exists(active_model_path):
        target_path = active_model_path
    # Priority 2: default folder uploaded manually via the HF Files tab.
    elif os.path.exists(os.path.join("model_default", "config.json")):
        target_path = "model_default"
    # Priority 3: download the base model.
    else:
        return _base_pair()

    try:
        tokenizer = AutoTokenizer.from_pretrained(target_path)
        model = AutoModelForSequenceClassification.from_pretrained(target_path)
        model.eval()
        return model, tokenizer
    except Exception:
        # Corrupt or partial local model: fall back instead of crashing the
        # UI (was a bare `except:`, which also swallowed KeyboardInterrupt).
        return _base_pair()

def predict_text(text):
    """Score one sentence; returns {label: probability} for the gr.Label output.

    Returns None for empty input, or {"Error": message} when inference fails.
    """
    if not text:
        return None
    try:
        model, tokenizer = load_model_inference()
        encoded = tokenizer(text, return_tensors="pt", truncation=True,
                            padding="max_length", max_length=128)
        with torch.no_grad():
            logits = model(**encoded).logits
        # Independent sigmoids: multi-label probabilities per emotion.
        scores = torch.sigmoid(logits)[0]
        return dict(zip(LIST_LABEL, (float(s) for s in scores)))
    except Exception as e:
        return {"Error": str(e)}

def predict_csv(file_obj, sep):
    """Run batch prediction over a CSV and summarize average emotion scores.

    Args:
        file_obj: uploaded CSV file object (exposes a ``.name`` path).
        sep: preferred column separator; falls back to "," on parse failure.

    Returns:
        dict for the gr.JSON output: row count, top-3 dominant emotions
        (rounded), and full per-label averages — or {"Error": ...} on
        failure or empty input.
    """
    try:
        try:
            df = pd.read_csv(file_obj.name, sep=sep)
        except Exception:
            # Separator mismatch is the common failure mode; retry with comma
            # (was a bare `except:`, which also swallowed KeyboardInterrupt).
            df = pd.read_csv(file_obj.name, sep=",")
        df = clean_data(df)

        # Validate before loading the model so we fail fast and cheaply.
        if "text_clean" not in df.columns:
            return {"Error": "Kolom teks tidak ditemukan"}

        model, tokenizer = load_model_inference()

        results = []
        for txt in df["text_clean"]:
            inputs = tokenizer(txt, return_tensors="pt", truncation=True,
                               padding="max_length", max_length=128)
            with torch.no_grad():
                out = model(**inputs)
                probs = torch.sigmoid(out.logits).numpy()[0]
            results.append({LIST_LABEL[i]: float(probs[i]) for i in range(len(LIST_LABEL))})

        if not results:
            # An empty dataset previously hit a ZeroDivisionError below.
            return {"Error": "Dataset kosong"}

        avg = {l: sum(r[l] for r in results) / len(results) for l in LIST_LABEL}
        top3 = sorted(avg.items(), key=lambda x: x[1], reverse=True)[:3]
        return {"Info": f"Total {len(results)} data",
                "Dominan": {k: round(v, 4) for k, v in top3},
                "Detail": avg}
    except Exception as e:
        return {"Error": str(e)}

# =========================================================
# 6. UI GRADIO
# =========================================================
# Build the Gradio UI: a global status bar plus two top-level tabs
# (model configuration, testing). Event handlers are wired to the
# functions defined above.
with gr.Blocks(title="IndoBERT Emotion Cloud") as app:
    gr.Markdown("# ☁️ IndoBERT Emotion Classifier")
    
    # Global status label; updated by the upload and training handlers.
    lbl_status = gr.Textbox(label="Status Model Aktif", value="Default (IndoBERT Base / Uploaded Manual)", interactive=False)

    with gr.Tabs():
        # === TAB 1: MODEL CONFIGURATION ===
        with gr.Tab("βš™οΈ Konfigurasi Model"):
            with gr.Tabs():
                
                # --- Sub-tab 1: upload a pre-trained model as a ZIP ---
                with gr.Tab("πŸ“‚ Unggah Model"):
                    gr.Markdown("Upload file `.zip` berisi model yang sudah dilatih (dari Komputer).")
                    in_zip = gr.File(label="File ZIP Model")
                    btn_upload = gr.Button("Ekstrak & Pakai Model", variant="primary")
                    out_log_upload = gr.Textbox(label="Log Sistem")
                    
                    btn_upload.click(handle_zip_upload, inputs=in_zip, outputs=[out_log_upload, lbl_status])
                
                # --- Sub-tab 2: train a new model from a CSV in the cloud ---
                with gr.Tab("πŸ‹οΈβ€β™€οΈ Latih Model"):
                    gr.Markdown("Latih model baru menggunakan Dataset CSV sendiri di Cloud.")
                    with gr.Row():
                        in_csv = gr.File(label="Dataset CSV")
                        in_sep = gr.Textbox(label="Separator", value=";")
                    with gr.Row():
                        in_ep = gr.Number(label="Epoch", value=1, precision=0)
                        in_bs = gr.Number(label="Batch Size", value=4, precision=0)
                        in_lr = gr.Number(label="Learning Rate", value=2e-5)
                    btn_train = gr.Button("Mulai Training", variant="stop")
                    out_log_train = gr.Textbox(label="Log Training", lines=5)
                    
                    # train_model_cloud is a generator, so the log streams live.
                    btn_train.click(train_model_cloud, inputs=[in_csv, in_sep, in_ep, in_bs, in_lr], outputs=[out_log_train, lbl_status])

        # === TAB 2: TESTING ===
        with gr.Tab("πŸ§ͺ Testing"):
            gr.Markdown("Uji model yang sedang aktif.")
            
            with gr.Tabs():
                with gr.Tab("πŸ“ Uji Satu Kalimat"):
                    in_txt = gr.Textbox(label="Masukkan Kalimat", lines=2, placeholder="Contoh: Saya sangat bahagia hari ini...")
                    btn_pred = gr.Button("Prediksi Emosi")
                    out_lbl = gr.Label(label="Hasil Prediksi")
                    btn_pred.click(predict_text, inputs=in_txt, outputs=out_lbl)
                
                with gr.Tab("πŸ“Š Uji Batch (CSV)"):
                    in_csv_test = gr.File(label="Upload CSV Test")
                    btn_batch = gr.Button("Analisis Batch")
                    out_json = gr.JSON(label="Hasil Analisis")
                    # NOTE(review): this reuses `in_sep` from the training tab as
                    # the batch separator — confirm the cross-tab wiring is
                    # intentional rather than a dedicated separator field here.
                    btn_batch.click(predict_csv, inputs=[in_csv_test, in_sep], outputs=out_json)

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    app.launch()