import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
MODEL_DIR = os.path.join(BASE_DIR, 'models')

# --- 1. CUSTOM TAGLISH STOP WORDS ---
# Removing these prevents the model from cheating by memorizing common
# grammar words.
TAGLISH_STOP_WORDS = [
    'ang', 'mga', 'ng', 'sa', 'na', 'ko', 'mo', 'ba', 'ka', 'yung',
    'ni', 'no', 'at', 'o', 'kay', 'to', 'po', 'pa', 'din', 'rin',
    'naman', 'nyo', 'nila', 'namin', 'kasi', 'kame', 'kami', 'tayo',
    'sana', 'lang', 'talaga', 'di', 'eh', 'oh', 'ah', 'yun', 'yan',
    'the', 'is', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'on', 'for'
]


def train_tfidf():
    """Train the TF-IDF + calibrated LinearSVC pipeline and pickle it.

    Reads the CSV at ``DATA_PATH`` (columns ``text`` and ``label`` are
    used), drops rows without a label, holds out 20% of the rows for
    evaluation, prints the accuracy and a classification report, and writes
    the fitted pipeline to ``MODEL_DIR/tfidf_ensemble.pkl``.

    Returns:
        None. If the CSV does not exist, an error message is printed and the
        function returns without training.
    """
    print(f"Loading data from: {DATA_PATH}")
    if not os.path.exists(DATA_PATH):
        print("Error: CSV not found.")
        return

    df = pd.read_csv(DATA_PATH)

    # Fill missing text BEFORE casting to str: casting first can turn NaN
    # into the literal string 'nan', which fillna() then no longer matches,
    # so the token "nan" would leak into the vocabulary.
    df['text'] = df['text'].fillna('').astype(str)
    df = df.dropna(subset=['label'])

    X = df['text']
    y = df['label'].astype(int)

    print(f"Training TF-IDF + SVM Ensemble on {len(df)} posts...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # --- 2. THE UPGRADED PIPELINE ---
    # Word 1- to 3-gram TF-IDF features feed a class-balanced LinearSVC that
    # is wrapped in CalibratedClassifierCV using the sigmoid method.
    model = make_pipeline(
        TfidfVectorizer(
            stop_words=TAGLISH_STOP_WORDS,
            max_features=5000,
            ngram_range=(1, 3),
            sublinear_tf=True
        ),
        CalibratedClassifierCV(
            LinearSVC(dual=False, class_weight='balanced'),
            method='sigmoid'
        )
    )
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ TF-IDF Model Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save
    # Create the output directory first so a fresh checkout does not lose a
    # finished training run to FileNotFoundError when opening the file.
    os.makedirs(MODEL_DIR, exist_ok=True)
    save_path = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to: {save_path}")


if __name__ == "__main__":
    train_tfidf()