Spaces:
Running
Running
| import pandas as pd | |
| import pickle | |
| import os | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.svm import LinearSVC | |
| from sklearn.calibration import CalibratedClassifierCV | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import accuracy_score, classification_report | |
# Filesystem locations, all anchored at the directory that contains this script
# so the script behaves the same regardless of the current working directory.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
# Training CSV lives in a sibling "data" directory one level above this script.
DATA_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# Directory where the fitted model pickle is written.
MODEL_DIR = os.path.join(BASE_DIR, 'models')
# --- 1. CUSTOM TAGLISH STOP WORDS ---
# Removing these prevents the model from cheating by memorizing common grammar
# words. Kept as a plain list (each word exactly once) because it is handed
# directly to TfidfVectorizer(stop_words=...).
TAGLISH_STOP_WORDS = [
    # Tagalog particles, pronouns and fillers
    'ang', 'mga', 'ng', 'sa', 'na', 'ko', 'mo', 'ba', 'ka', 'yung',
    'ni', 'no', 'at', 'o', 'kay', 'to', 'po', 'pa', 'din', 'rin',
    'naman', 'nyo', 'nila', 'namin', 'kasi', 'kame', 'kami', 'tayo',
    'sana', 'lang', 'talaga', 'di', 'eh', 'oh', 'ah', 'yun', 'yan',
    # English function words ('to' is already listed above)
    'the', 'is', 'a', 'an', 'and', 'or', 'of', 'in', 'on', 'for',
]
def train_tfidf():
    """Train, evaluate and pickle the TF-IDF + calibrated LinearSVC pipeline.

    Reads the CSV at ``DATA_PATH`` (uses its ``text`` and ``label`` columns),
    holds out 20% of the rows for evaluation, prints the accuracy and a
    classification report, and writes the fitted pipeline to
    ``MODEL_DIR/tfidf_ensemble.pkl``.

    Returns:
        None. If the CSV does not exist, an error message is printed and the
        function returns without training.
    """
    print(f"Loading data from: {DATA_PATH}")
    if not os.path.exists(DATA_PATH):
        print("Error: CSV not found.")
        return

    df = pd.read_csv(DATA_PATH)
    # Fill missing text BEFORE casting to str: astype(str) turns NaN into the
    # literal string 'nan', after which fillna('') has nothing left to fill
    # and the model would learn "nan" as a real token.
    df['text'] = df['text'].fillna('').astype(str)
    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=['label'])

    X = df['text']
    y = df['label'].astype(int)

    print(f"Training TF-IDF + SVM Ensemble on {len(df)} posts...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # --- 2. THE UPGRADED PIPELINE ---
    model = make_pipeline(
        TfidfVectorizer(
            stop_words=TAGLISH_STOP_WORDS,
            max_features=5000,
            ngram_range=(1, 3),
            sublinear_tf=True,
        ),
        # LinearSVC has no predict_proba; the calibration wrapper adds
        # sigmoid-calibrated probabilities on top of it.
        CalibratedClassifierCV(
            LinearSVC(dual=False, class_weight='balanced'),
            method='sigmoid',
        ),
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ TF-IDF Model Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save; create the output directory first so a fresh checkout does not
    # fail with FileNotFoundError after the whole training run has finished.
    os.makedirs(MODEL_DIR, exist_ok=True)
    save_path = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to: {save_path}")
# Script entry point: train the model only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    train_tfidf()