# train_model.py
"""Train a TF-IDF + logistic-regression text classifier on AG News.

Running this file as a script loads the ``train`` split of the ``ag_news``
dataset, normalizes the ``text`` column with :func:`clean_text`, fits a
``TfidfVectorizer`` and a ``LogisticRegression`` on it, and writes both fitted
objects to ``CACHE_DIR`` with joblib.

Importing the module has no side effects, so :func:`clean_text` and the cache
path constants can be reused elsewhere without retraining.
"""
import os
import re

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

CACHE_DIR = "cache"
MODEL_PATH = os.path.join(CACHE_DIR, "model.joblib")
VEC_PATH = os.path.join(CACHE_DIR, "vectorizer.joblib")

# Matches every character that is neither a word character nor whitespace.
# Compiled once because clean_text is applied to every row of the dataset.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text: str) -> str:
    """Return *text* lower-cased with punctuation removed.

    Characters matched by ``\\w`` (letters, digits, underscore) and whitespace
    are kept; everything else is deleted, not replaced by a space.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    return _NON_WORD_RE.sub("", text.lower())


def main() -> None:
    """Run the full pipeline: load data, vectorize, train, and save artifacts.

    Writes the fitted classifier to ``MODEL_PATH`` and the fitted vectorizer
    to ``VEC_PATH``, creating ``CACHE_DIR`` first if it does not exist.
    """
    # Create the output directory up front so a permissions problem surfaces
    # before the dataset load and training, not after.
    os.makedirs(CACHE_DIR, exist_ok=True)

    print("📥 Loading AG News dataset...")
    dataset = load_dataset("ag_news", split="train")
    df = pd.DataFrame(dataset)
    df["cleaned"] = df["text"].apply(clean_text)

    X = df["cleaned"]
    y = df["label"]

    print("🔠 Vectorizing text...")
    vectorizer = TfidfVectorizer(max_features=1000)
    X_vec = vectorizer.fit_transform(X)

    print("🤖 Training model...")
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_vec, y)

    print("💾 Saving model and vectorizer...")
    joblib.dump(clf, MODEL_PATH)
    joblib.dump(vectorizer, VEC_PATH)

    print("✅ Training complete.")


if __name__ == "__main__":
    main()