Spaces:
Runtime error
Runtime error
File size: 1,006 Bytes
# train_model.py
import pandas as pd
import joblib
import os
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Trained artifacts are persisted under a local cache directory; create it
# up front so the dumps at the end of the script have somewhere to write.
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# The serialized classifier and its fitted vectorizer live side by side.
MODEL_PATH = os.path.join(CACHE_DIR, "model.joblib")
VEC_PATH = os.path.join(CACHE_DIR, "vectorizer.joblib")
def clean_text(text):
    """Return *text* lowercased with punctuation removed.

    Every character that is neither a word character (``\\w``: letters,
    digits and underscore) nor whitespace is dropped. Whitespace itself is
    left untouched, so runs of spaces and newlines are preserved.

    Args:
        text: The raw string to normalize. ``None`` and float NaN (how
            pandas represents a missing cell) are treated as empty text.

    Returns:
        The normalized string, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself; checking it this
    # way avoids crashing on ``.lower()`` when a row has no text.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"[^\w\s]", "", text.lower())
# Training pipeline: runs top to bottom whenever this file is executed.
# Every intermediate result stays bound at module level.

# Fetch the training split of AG News via the `datasets` library.
print("📥 Loading AG News dataset...")
dataset = load_dataset("ag_news", split="train")

# Use the dataset's built-in pandas conversion instead of handing the
# Dataset object to the DataFrame constructor.
df = dataset.to_pandas()

# Normalize every article with clean_text (lowercase, punctuation removed)
# and keep the result in its own column next to the raw text.
df["cleaned"] = df["text"].apply(clean_text)
X = df["cleaned"]
y = df["label"]

print("🔠 Vectorizing text...")
# max_features keeps only the 1000 most frequent terms in the vocabulary.
vectorizer = TfidfVectorizer(max_features=1000)
X_vec = vectorizer.fit_transform(X)

print("🤖 Training model...")
# NOTE(review): max_iter is raised above scikit-learn's default of 100,
# presumably so the solver converges on this corpus -- confirm.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_vec, y)

# Persist both artifacts: the fitted vectorizer is saved alongside the
# classifier under the cache directory.
print("💾 Saving model and vectorizer...")
joblib.dump(clf, MODEL_PATH)
joblib.dump(vectorizer, VEC_PATH)
print("✅ Training complete.")