import os
import re
import sys

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Path where the fitted pipeline is serialized at the end of the script.
MODEL_OUTPUT = 'api/data/model_mbti.pkl'

# NOTE(review): the leading "π" / "β" glyphs in some status messages look like
# mis-decoded emoji whose original bytes cannot be recovered from this copy;
# they are kept as found. Restore them from a pristine copy of the file.
print("π Mengunduh dataset MBTI (7000 Data)...")

try:
    # Load the "train" split and convert it to a DataFrame.
    dataset = load_dataset("gmnsong/MBTI.csv", split="train")
    df = pd.DataFrame(dataset)

    # Fall back to the alternative 'label'/'text' column names when the
    # expected 'type' column is absent.
    if 'type' not in df.columns:
        df.rename(columns={'label': 'type', 'text': 'posts'}, inplace=True)

    X = df['posts']
    y = df['type']
    print(f"✅ Data siap: {len(df)} baris.")
except Exception as e:
    # Top-level boundary: report the failure and stop. Exit with a non-zero
    # status so callers (shell scripts, CI) can detect that loading failed;
    # the bare site-provided exit() used before reported success (status 0).
    print(f"β Error: {e}")
    sys.exit(1)


def clean_text(text):
    """Normalize a raw post into lowercase, letters-only, single-spaced text.

    Args:
        text: Value to clean. It is coerced with ``str()`` first, so
            non-string inputs are accepted (``None`` becomes ``"none"``).

    Returns:
        The cleaned string: lowercased, with ``http...`` tokens removed,
        every character other than ASCII letters and whitespace dropped,
        whitespace runs collapsed to a single space, and both ends trimmed.
    """
    text = str(text).lower()
    # Remove URLs before stripping punctuation; otherwise their letters
    # would survive as one long pseudo-word.
    text = re.sub(r'http\S+', '', text)
    # Non-letters are deleted (not replaced by a space), so tokens separated
    # only by punctuation are joined together.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Normalize every post with clean_text before it reaches the vectorizer.
print("🧹 Membersihkan data...")
X = X.apply(clean_text)

# NOTE(review): the leading "π" looks like a mis-decoded emoji; kept as found.
print("π Melatih Model MBTI (SVM Optimized)...")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features (unigrams + bigrams, capped vocabulary) feeding a linear SVM.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=15000,
        stop_words='english',
        ngram_range=(1, 2),
        sublinear_tf=True,
    )),
    ('clf', LinearSVC(
        dual=False,
        C=0.6,
        # Reweight classes to compensate for uneven label frequencies.
        class_weight='balanced',
    )),
])

pipeline.fit(X_train, y_train)

# NOTE(review): the leading "π" looks like a mis-decoded emoji; kept as found.
print("π Menghitung Metrik Evaluasi...")
predictions = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
# Weighted averaging aggregates per-class scores by class support;
# zero_division=0 scores classes that were never predicted as 0.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, predictions, average='weighted', zero_division=0
)

# Print the held-out metrics as a small fixed-width table.
print("\n" + "="*40)
print(" HASIL EVALUASI MODEL MBTI (FINAL)")
print("="*40)
print(f"{'Metrik':<15} | {'Skor':<10}")
print("-" * 30)
print(f"{'Akurasi':<15} | {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"{'Precision':<15} | {precision:.3f}")
print(f"{'Recall':<15} | {recall:.3f}")
print(f"{'F1-Score':<15} | {f1:.3f}")
print("="*40 + "\n")

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
# Only create the parent directory when the path actually has one:
# os.makedirs('') raises if MODEL_OUTPUT is ever changed to a bare filename.
output_dir = os.path.dirname(MODEL_OUTPUT)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
joblib.dump(pipeline, MODEL_OUTPUT)
print(f"💾 SUKSES! Model MBTI disimpan di: {MODEL_OUTPUT}")