File size: 4,246 Bytes
9d454eb
 
58e2d3b
9d454eb
 
 
 
 
 
 
 
 
 
 
58e2d3b
9d454eb
58e2d3b
9d454eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58e2d3b
9d454eb
 
 
 
58e2d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d454eb
 
58e2d3b
9d454eb
 
58e2d3b
 
 
 
 
 
 
 
 
9d454eb
 
58e2d3b
 
 
 
 
 
 
9d454eb
 
 
 
 
 
 
 
 
 
58e2d3b
9d454eb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# file: train_demo_models.py
from __future__ import annotations
import os
import pickle
import numpy as np
from typing import Dict, Tuple, List
import nltk
from nltk.corpus import twitter_samples, stopwords
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from feature_extract import build_freqs, extract_features_2, extract_features_6

def _ensure_nltk():
    """Download the NLTK corpora this module needs, if they are missing.

    Probes each corpus with a cheap access; a LookupError means the data
    is not installed locally, so we fetch it quietly.
    """
    probes = (
        (lambda: twitter_samples.fileids(), "twitter_samples"),
        (lambda: stopwords.words("english"), "stopwords"),
    )
    for probe, corpus_name in probes:
        try:
            probe()
        except LookupError:
            nltk.download(corpus_name, quiet=True)

def load_twitter_data() -> Tuple[List[str], np.ndarray]:
    """Load the NLTK twitter_samples corpus as (tweets, labels).

    Returns:
        A pair ``(tweets, y)`` where positives come first with label 1,
        followed by negatives with label 0.
    """
    positives = twitter_samples.strings("positive_tweets.json")
    negatives = twitter_samples.strings("negative_tweets.json")
    labels = np.concatenate(
        [np.ones(len(positives), dtype=int), np.zeros(len(negatives), dtype=int)]
    )
    return positives + negatives, labels

def vectorize(tweets: List[str], freqs: Dict[Tuple[str, float], float], mode: str = "2f") -> np.ndarray:
    """Turn a list of tweets into a dense feature matrix.

    Args:
        tweets: Raw tweet strings.
        freqs: Word-frequency lookup built by ``build_freqs``.
        mode: "2f" selects the 2-feature extractor; anything else the 6-feature one.

    Returns:
        A (len(tweets), width) array; an empty (0, width) array for no tweets.
    """
    if mode == "2f":
        extractor, width = extract_features_2, 2
    else:
        extractor, width = extract_features_6, 6
    if not tweets:
        return np.zeros((0, width))
    return np.vstack([extractor(tweet, freqs) for tweet in tweets])

# Registry of unfitted estimator "spec" instances keyed by display name.
# NOTE(review): these are shared module-level singletons — calling .fit on
# one of them mutates the shared object in place; copy before fitting.
ALL_MODEL_SPECS: Dict[str, object] = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # use_label_encoder is deprecated/ignored in recent xgboost — TODO confirm installed version
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(random_state=42),
    # probability=True enables predict_proba (needed for the log-loss report)
    "SVM": SVC(kernel="linear", probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(solver="liblinear", random_state=42),
}

def make_models(include: List[str] | None = None) -> Dict[str, object]:
    """Return fresh, unfitted estimator instances keyed by display name.

    Args:
        include: Optional list of model names to select; ``None`` selects all.

    Returns:
        Dict mapping model name -> a deep copy of the registry estimator.

    Raises:
        KeyError: If a name in ``include`` is not in ``ALL_MODEL_SPECS``.
    """
    # Deep-copy each spec so callers never share the module-level singletons.
    # Without this, fitting the "2f" and "6f" variants trains the *same*
    # object twice and the first fit is silently clobbered (and both result
    # dicts end up aliasing one estimator).
    from copy import deepcopy  # stdlib; local import keeps this edit self-contained

    names = ALL_MODEL_SPECS if include is None else include
    return {name: deepcopy(ALL_MODEL_SPECS[name]) for name in names}

def _fit_and_log(name: str, clf, X: np.ndarray, y: np.ndarray):
    """Fit ``clf`` on (X, y), print its training accuracy (plus log-loss
    when the estimator supports ``predict_proba``), and return it fitted.

    Args:
        name: Display name used in the printed report.
        clf: Any estimator exposing ``fit``/``predict`` (sklearn-style).
        X: Feature matrix.
        y: Label vector (raveled before fitting).

    Returns:
        The fitted estimator (same object, mutated in place).
    """
    clf.fit(X, y.ravel())
    acc = accuracy_score(y, clf.predict(X))
    # Narrow try: only the predict_proba capability check belongs here.
    # The original caught bare Exception, which also hid genuine log_loss
    # failures behind the "(no predict_proba)" message.
    try:
        y_proba = clf.predict_proba(X)
    except AttributeError:
        print(f"[{name}] Accuracy: {acc:.4f} | (no predict_proba)")
    else:
        loss = log_loss(y, y_proba)
        print(f"[{name}] Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
    return clf

def train_models(X: np.ndarray, y: np.ndarray, include: List[str] | None = None) -> Dict[str, object]:
    """Fit every selected model on (X, y) and return name -> fitted estimator.

    Args:
        X: Feature matrix.
        y: Label vector.
        include: Optional subset of model names; ``None`` trains all.
    """
    return {
        model_name: _fit_and_log(model_name, estimator, X, y)
        for model_name, estimator in make_models(include).items()
    }

def ensure_logreg_only(save_path: str = "demo_models.pkl"):
    """Ensure ``save_path`` holds freqs plus a trained Logistic Regression
    for both the 2-feature and 6-feature pipelines, training only what is
    missing, then persist and return the bundle.

    Args:
        save_path: Path of the pickle file to read/update.

    Returns:
        Dict with keys "freqs", "2f", "6f".
    """
    _ensure_nltk()
    tweets, y = load_twitter_data()
    if os.path.exists(save_path):
        # NOTE(security): pickle.load executes arbitrary code on load —
        # only open files this script itself produced.
        with open(save_path, "rb") as f:
            data = pickle.load(f)
        freqs = data.get("freqs")
        models_2f: Dict[str, object] = data.get("2f", {})
        models_6f: Dict[str, object] = data.get("6f", {})
    else:
        freqs = None
        models_2f, models_6f = {}, {}
    if freqs is None:
        # Covers both a missing file and a stale pickle without "freqs";
        # previously a freqs-less pickle crashed later in vectorize().
        freqs = build_freqs(tweets, y.reshape(-1, 1))
    X2 = vectorize(tweets, freqs, mode="2f")
    X6 = vectorize(tweets, freqs, mode="6f")
    if "Logistic Regression" not in models_2f:
        models_2f.update(train_models(X2, y, include=["Logistic Regression"]))
    if "Logistic Regression" not in models_6f:
        models_6f.update(train_models(X6, y, include=["Logistic Regression"]))
    data_to_save = {"freqs": freqs, "2f": models_2f, "6f": models_6f}
    with open(save_path, "wb") as f:
        pickle.dump(data_to_save, f)
    return data_to_save

def load_demo_models(save_path: str = "demo_models.pkl"):
    """Unpickle and return the saved demo-model bundle from ``save_path``.

    NOTE(security): pickle.load runs arbitrary code; only load trusted files.
    """
    with open(save_path, "rb") as handle:
        return pickle.load(handle)

if __name__ == "__main__":
    # Train/load the demo bundle, then report which models each pipeline holds.
    bundle = ensure_logreg_only()
    for key, label in (("2f", "Các mô hình 2f:"), ("6f", "Các mô hình 6f:")):
        print(label, list(bundle[key].keys()))