# train_emotion.py
"""Train a single-label emotion classifier on GoEmotions and save it.

The script downloads the "simplified" GoEmotions training split, reduces
each row's ``labels`` list to its first entry, cleans the text, fits a
TF-IDF + logistic-regression pipeline on 80% of the rows, prints
evaluation metrics for the held-out 20%, and writes the fitted pipeline
to ``MODEL_OUTPUT`` with joblib.
"""
import os
import re
import sys

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ==========================================
# CONFIGURATION
# ==========================================
MODEL_OUTPUT = 'api/data/model_emotion.pkl'
# ==========================================

# Compiled once; clean_text is applied to every row of the dataset.
_URL_RE = re.compile(r'http\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return *text* lower-cased with URLs and non-letter characters removed.

    The value is coerced with ``str()`` first, so non-string inputs are
    accepted. Runs of whitespace are collapsed to a single space and the
    result is stripped.
    """
    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def _load_data():
    """Download GoEmotions and return ``(texts, labels)`` as pandas Series.

    Each row's ``labels`` list is mapped to the name of its first entry;
    rows with an empty list fall back to ``"neutral"``.
    """
    dataset = load_dataset(
        "google-research-datasets/go_emotions", "simplified", split="train"
    )
    df = pd.DataFrame(dataset)
    labels_list = dataset.features['labels'].feature.names

    def get_first_label(label_ids):
        if len(label_ids) > 0:
            return labels_list[label_ids[0]]
        return "neutral"

    df['emotion_label'] = df['labels'].apply(get_first_label)
    return df['text'], df['emotion_label']


def _build_pipeline():
    """Return the unfitted TF-IDF + logistic-regression pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=12000,      # large vocabulary to keep more detail
            stop_words='english',
            ngram_range=(1, 2),      # single words and two-word phrases
            sublinear_tf=True,       # logarithmic term-frequency scaling
        )),
        ('clf', LogisticRegression(
            max_iter=1000,
            solver='lbfgs',          # chosen to avoid multiclass errors
            C=1.2,                   # slightly above 1.0 to raise accuracy
        )),
    ])


def _report(y_true, y_pred):
    """Print accuracy and weighted precision/recall/F1 for the predictions."""
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    print("\n" + "="*40)
    print(" HASIL EVALUASI MODEL EMOSI (FINAL)")
    print("="*40)
    print(f"{'Metrik':<15} | {'Skor':<10}")
    print("-" * 30)
    print(f"{'Akurasi':<15} | {accuracy:.3f} ({accuracy*100:.1f}%)")
    print(f"{'Precision':<15} | {precision:.3f}")
    print(f"{'Recall':<15} | {recall:.3f}")
    print(f"{'F1-Score':<15} | {f1:.3f}")
    print("="*40 + "\n")


def main():
    """Run the full download -> clean -> train -> evaluate -> save flow."""
    print("🔍 Mengunduh dataset GoEmotions...")
    try:
        X, y = _load_data()
    except Exception as e:
        # Top-level boundary: report the failure and exit with a non-zero
        # status so shells and CI can detect that training did not run.
        print(f"❌ Error: {e}")
        sys.exit(1)
    else:
        print(f"✅ Data siap: {len(X)} baris.")

    # --- DATA CLEANING ---
    print("🧹 Membersihkan data emosi...")
    X = X.apply(clean_text)

    # --- TRAINING ---
    print("🚀 Melatih Model Emosi (Logistic Regression Fixed)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    pipeline = _build_pipeline()
    pipeline.fit(X_train, y_train)

    # --- EVALUATION ---
    print("📊 Menghitung Metrik Evaluasi...")
    _report(y_test, pipeline.predict(X_test))

    # os.makedirs('') raises, so only create a directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(MODEL_OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(pipeline, MODEL_OUTPUT)
    print(f"💾 SUKSES! Model Emosi disimpan di: {MODEL_OUTPUT}")


if __name__ == "__main__":
    main()