File size: 2,392 Bytes
bdb271a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Filesystem locations, all resolved relative to this script rather than the
# current working directory, so the script can be launched from anywhere.
_SCRIPT_PATH = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_SCRIPT_PATH)
# Training CSV, located in the sibling ``data`` directory one level above BASE_DIR.
DATA_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# Directory that receives the pickled model.
MODEL_DIR = os.path.join(BASE_DIR, 'models')

# --- 1. CUSTOM TAGLISH STOP WORDS ---
# Very common function words are stripped before vectorizing so the classifier
# has to rely on content words instead of memorizing generic grammar tokens.
# The result must remain a plain ``list`` (it is passed as ``stop_words``).

# Filipino-side entries (particles, pronouns, fillers).
_FILIPINO_STOP_WORDS = [
    'ang', 'mga', 'ng', 'sa', 'na', 'ko', 'mo', 'ba', 'ka', 'yung',
    'ni', 'no', 'at', 'o', 'kay', 'to', 'po', 'pa', 'din', 'rin',
    'naman', 'nyo', 'nila', 'namin', 'kasi', 'kame', 'kami', 'tayo',
    'sana', 'lang', 'talaga', 'di', 'eh', 'oh', 'ah', 'yun', 'yan',
]

# English-side entries. 'to' also appears in the group above; the duplicate is
# kept so the combined list is unchanged.
_ENGLISH_STOP_WORDS = [
    'the', 'is', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'on', 'for',
]

TAGLISH_STOP_WORDS = [*_FILIPINO_STOP_WORDS, *_ENGLISH_STOP_WORDS]

def train_tfidf():
    """Train, evaluate, and pickle the TF-IDF + calibrated LinearSVC classifier.

    Reads the CSV at ``DATA_PATH`` (the ``text`` and ``label`` columns are
    used), holds out 20% of the rows for evaluation, prints the accuracy and a
    classification report, and writes the fitted pipeline to
    ``MODEL_DIR/tfidf_ensemble.pkl``.

    Returns:
        None. If the CSV does not exist, an error message is printed and the
        function returns without training anything.
    """
    print(f"Loading data from: {DATA_PATH}")
    if not os.path.exists(DATA_PATH):
        print("Error: CSV not found.")
        return

    df = pd.read_csv(DATA_PATH)
    # Fill missing text BEFORE casting to str: casting first turns NaN into the
    # literal string 'nan', which fillna() can no longer see and which would
    # then be learned as a real token.
    df['text'] = df['text'].fillna('').astype(str)
    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=['label'])

    X = df['text']
    y = df['label'].astype(int)

    print(f"Training TF-IDF + SVM Ensemble on {len(df)} posts...")

    # Fixed seed keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 2. THE UPGRADED PIPELINE ---
    model = make_pipeline(
        TfidfVectorizer(
            stop_words=TAGLISH_STOP_WORDS,
            max_features=5000,      # cap the vocabulary size
            ngram_range=(1, 3),     # unigrams, bigrams and trigrams
            sublinear_tf=True       # use 1 + log(tf) instead of raw counts
        ),
        # LinearSVC has no predict_proba; the calibration wrapper adds
        # probability estimates on top of the SVM decision scores.
        CalibratedClassifierCV(LinearSVC(dual=False, class_weight='balanced'), method='sigmoid')
    )

    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ TF-IDF Model Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save. Create the output directory first so a fresh checkout without a
    # 'models' folder does not fail after the whole training run has finished.
    os.makedirs(MODEL_DIR, exist_ok=True)
    save_path = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)

    print(f"Model saved to: {save_path}")

# Run the training job only when this file is executed as a script, so that
# importing the module does not trigger a training run.
if __name__ == "__main__":
    train_tfidf()