# Scraped file-viewer header (not part of the program): Spaces, status "Running".
# File size: 2,392 Bytes; revision bdb271a.
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Filesystem locations, all anchored at the directory that holds this script
# so the script behaves the same regardless of the current working directory.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
# Training CSV lives in a sibling "data" directory, one level above BASE_DIR.
DATA_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# Directory that receives the pickled model.
MODEL_DIR = os.path.join(BASE_DIR, 'models')
# --- 1. CUSTOM TAGLISH STOP WORDS ---
# High-frequency function words handed to the vectorizer as stop words, so the
# classifier has to rely on content words instead of common grammar tokens.
# NOTE: 'to' appears in both groups below; the duplicate is kept as-is.
TAGLISH_STOP_WORDS = (
    # Tagalog particles, pronouns and fillers
    "ang mga ng sa na ko mo ba ka yung "
    "ni no at o kay to po pa din rin "
    "naman nyo nila namin kasi kame kami tayo "
    "sana lang talaga di eh oh ah yun yan "
    # English function words
    "the is a an and or of to in on for"
).split()
def train_tfidf():
    """Train, evaluate and pickle the TF-IDF + calibrated LinearSVC pipeline.

    Reads the CSV at ``DATA_PATH`` (columns ``text`` and ``label``), holds out
    20% of the rows for evaluation, prints the accuracy and a classification
    report, then pickles the fitted pipeline to
    ``MODEL_DIR/tfidf_ensemble.pkl``.

    Returns:
        None. If the CSV does not exist, an error is printed and nothing is
        trained or saved.
    """
    print(f"Loading data from: {DATA_PATH}")
    if not os.path.exists(DATA_PATH):
        print("Error: CSV not found.")
        return

    df = pd.read_csv(DATA_PATH)
    # Fill missing text BEFORE casting: astype(str) turns NaN into the literal
    # string 'nan', which fillna can no longer detect and which would then be
    # vectorized as if it were a real word.
    df['text'] = df['text'].fillna('').astype(str)
    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=['label'])

    X = df['text']
    y = df['label'].astype(int)
    print(f"Training TF-IDF + SVM Ensemble on {len(df)} posts...")

    # Fixed random_state keeps the train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # --- 2. THE UPGRADED PIPELINE ---
    model = make_pipeline(
        TfidfVectorizer(
            stop_words=TAGLISH_STOP_WORDS,
            max_features=5000,
            ngram_range=(1, 3),  # unigrams through trigrams
            sublinear_tf=True
        ),
        # LinearSVC has no predict_proba; the sigmoid-calibrated wrapper
        # provides probability estimates on top of it.
        CalibratedClassifierCV(
            LinearSVC(dual=False, class_weight='balanced'),
            method='sigmoid'
        )
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out 20%.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ TF-IDF Model Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save. Create the output directory first so a fresh checkout does not
    # fail with FileNotFoundError after all the training work is done.
    os.makedirs(MODEL_DIR, exist_ok=True)
    save_path = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to: {save_path}")
# Run the training job only when executed as a script, not on import.
if __name__ == "__main__":
    train_tfidf()