# Spaces:
# Runtime error
# Runtime error
# train_model.py
import os
import re

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Locations of the persisted artifacts. Both files live under CACHE_DIR,
# which is created at import time so the joblib dumps below cannot fail on a
# missing directory.
CACHE_DIR = "cache"
MODEL_PATH = os.path.join(CACHE_DIR, "model.joblib")
VEC_PATH = os.path.join(CACHE_DIR, "vectorizer.joblib")

# exist_ok=True makes re-running the script safe when the directory exists.
os.makedirs(CACHE_DIR, exist_ok=True)
# Compiled once at import time; clean_text is applied to every dataset row.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")


def clean_text(text: str) -> str:
    """Return *text* lower-cased with punctuation removed.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is dropped. Whitespace is left
    untouched, so runs of spaces and newlines are preserved.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string; an empty input yields an empty string.
    """
    return _PUNCTUATION_RE.sub("", text.lower())
# Training pipeline: load the AG News training split, normalize the text,
# fit a TF-IDF vectorizer and a logistic-regression classifier, then persist
# both to MODEL_PATH / VEC_PATH.
#
# Status messages are plain ASCII: the previous prefixes were mis-encoded
# emoji bytes that printed as garbage characters.
print("Loading AG News dataset...")
dataset = load_dataset("ag_news", split="train")
# Convert the split to a DataFrame directly instead of letting pandas iterate
# the dataset row by row.
df = dataset.to_pandas()
df["cleaned"] = df["text"].apply(clean_text)
X = df["cleaned"]
y = df["label"]

print("Vectorizing text...")
# max_features caps the vocabulary at the 1000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=1000)
X_vec = vectorizer.fit_transform(X)

print("Training model...")
# max_iter is raised above the library default to give the solver room to
# converge on this dataset.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_vec, y)

print("Saving model and vectorizer...")
# The vectorizer is saved alongside the model because inference must apply
# the same fitted vocabulary and IDF weights.
joblib.dump(clf, MODEL_PATH)
joblib.dump(vectorizer, VEC_PATH)
print("Training complete.")