|
|
|
|
|
import os
import re
import sys

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Path the fitted pipeline is serialized to with joblib at the end of the run.
MODEL_OUTPUT = 'api/data/model_emotion.pkl'
|
|
|
|
|
|
|
|
# NOTE(review): the leading "π" looks like a mis-decoded emoji; the original
# glyph cannot be recovered from this text, so the string is kept as found.
print("π Mengunduh dataset GoEmotions...")

try:
    # "simplified" configuration of GoEmotions, training split only.
    dataset = load_dataset("google-research-datasets/go_emotions", "simplified", split="train")
    df = pd.DataFrame(dataset)
    # Class names, indexed by the integer ids stored in the `labels` column.
    labels_list = dataset.features['labels'].feature.names

    def get_first_label(label_ids):
        """Return the emotion name for the first id in *label_ids*.

        Only the first id of the sequence is used, which turns the task into
        single-label classification. An empty sequence maps to "neutral".
        """
        if len(label_ids) > 0:
            return labels_list[label_ids[0]]
        return "neutral"

    df['emotion_label'] = df['labels'].apply(get_first_label)
    X = df['text']
    y = df['emotion_label']
    print(f"✅ Data siap: {len(df)} baris.")

except Exception as e:
    # Top-level boundary of the script: report the failure and abort.
    # The message goes to stderr and the exit status is non-zero so callers
    # (shells, CI) can detect the failure; the bare `exit()` helper exits
    # with status 0 and is only present when the `site` module is loaded.
    # NOTE(review): the leading "β" looks like a mis-decoded emoji; kept as found.
    print(f"β Error: {e}", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
|
|
# Compiled once at import time: clean_text is applied to every row of the
# dataset, so the patterns are hoisted out of the per-call path.
_URL_RE = re.compile(r'http\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize one document before vectorization.

    Steps, in order: lowercase, drop anything starting with ``http`` up to
    the next whitespace, drop every character that is not an ASCII letter or
    whitespace, collapse whitespace runs to a single space, and strip the
    ends.

    Args:
        text: The raw document. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and leak into the vocabulary as real words.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()
|
|
|
|
|
print("🧹 Membersihkan data emosi...")
# Run the text normalization over every document; X is rebound to the
# cleaned Series that the train/test split consumes.
X = X.apply(clean_text)
|
|
|
|
|
|
|
|
# NOTE(review): the leading "π" looks like a mis-decoded emoji; the original
# glyph cannot be recovered from this text, so the string is kept as found.
print("π Melatih Model Emosi (Logistic Regression Fixed)...")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizer and classifier are bundled in one Pipeline so the object that is
# fitted here (and saved later) accepts raw cleaned text directly.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=12000,     # cap on vocabulary size
        stop_words='english',
        ngram_range=(1, 2),     # unigrams and bigrams
        sublinear_tf=True,      # scale term frequency as 1 + log(tf)
    )),
    ('clf', LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        C=1.2,                  # inverse regularization strength
    )),
])

pipeline.fit(X_train, y_train)
|
|
|
|
|
|
|
|
# NOTE(review): the leading "π" looks like a mis-decoded emoji; the original
# glyph cannot be recovered from this text, so the string is kept as found.
print("π Menghitung Metrik Evaluasi...")
predictions = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
# Weighted average: per-class scores are weighted by class support.
# zero_division=0 scores classes that were never predicted as 0 instead of
# emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='weighted', zero_division=0)

# Fixed-width report of the hold-out metrics.
print("\n" + "="*40)
print("   HASIL EVALUASI MODEL EMOSI (FINAL)")
print("="*40)
print(f"{'Metrik':<15} | {'Skor':<10}")
print("-" * 30)
print(f"{'Akurasi':<15} | {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"{'Precision':<15} | {precision:.3f}")
print(f"{'Recall':<15} | {recall:.3f}")
print(f"{'F1-Score':<15} | {f1:.3f}")
print("="*40 + "\n")
|
|
|
|
|
# Create the target directory only when the path actually has one:
# os.path.dirname() returns '' for a bare filename and os.makedirs('') raises.
output_dir = os.path.dirname(MODEL_OUTPUT)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) as one artifact.
joblib.dump(pipeline, MODEL_OUTPUT)
print(f"💾 SUKSES! Model Emosi disimpan di: {MODEL_OUTPUT}")