Dask_NLP_PipeLine / train_model.py
MukeshKapoor25's picture
Add training script and model/vectorizer files for AG News classification
444ebf5
# train_model.py
import pandas as pd
import joblib
import os
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Directory that holds the persisted training artifacts; created up front so
# the dumps at the end of the script never fail on a missing folder.
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# The fitted classifier and the fitted vectorizer are stored side by side
# inside CACHE_DIR.
MODEL_PATH, VEC_PATH = (
    os.path.join(CACHE_DIR, artifact_name)
    for artifact_name in ("model.joblib", "vectorizer.joblib")
)
# Matches any single character that is neither a word character nor
# whitespace. Compiled once because clean_text runs for every dataset row.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text: str) -> str:
    """Lowercase *text* and strip punctuation from it.

    Args:
        text: The raw string to normalize.

    Returns:
        The lowercased string with every character removed that is neither
        a word character (letter, digit, underscore) nor whitespace.
        Whitespace itself is left untouched, so runs of spaces are not
        collapsed.
    """
    return _NON_WORD_RE.sub("", text.lower())
# Load the training split of the "ag_news" dataset into a DataFrame.
print("📥 Loading AG News dataset...")
dataset = load_dataset("ag_news", split="train")
df = pd.DataFrame(dataset)

# Normalize the raw "text" column with clean_text; the "label" column is the
# classification target.
df["cleaned"] = df["text"].apply(clean_text)
X = df["cleaned"]
y = df["label"]

# Fit the TF-IDF vocabulary on the cleaned text and transform it in one pass.
# max_features=1000 caps the vocabulary size.
print("🔠 Vectorizing text...")
vectorizer = TfidfVectorizer(max_features=1000)
X_vec = vectorizer.fit_transform(X)

# max_iter=1000 raises the solver's iteration limit.
print("🤖 Training model...")
clf = LogisticRegression(max_iter=1000)
clf.fit(X_vec, y)

# Persist both artifacts: the vectorizer must be saved alongside the model
# because new text has to be transformed with the same fitted vocabulary.
print("💾 Saving model and vectorizer...")
joblib.dump(clf, MODEL_PATH)
joblib.dump(vectorizer, VEC_PATH)
print("✅ Training complete.")