# document-classifier-xgb / train_model.py
# vidyasagar786's picture
# Upload train_model.py with huggingface_hub
# 907307c verified
import time
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from scipy.sparse import hstack, csr_matrix
# ===============================
# PATHS
# ===============================
TRAIN_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/train.csv"
VAL_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/val.csv"
TEST_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/test.csv"
MODEL_SAVE_PATH = "document_classifier_xgb.pkl"
# ===============================
# LOAD DATA
# ===============================
print("📂 Loading data...")
# Read the three pre-processed splits produced upstream.
train_df, val_df, test_df = (
    pd.read_csv(path) for path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)
# Raw document text per split; missing entries become empty strings so the
# TF-IDF vectorizers below never see NaN.
X_train_text = train_df["text"].fillna("")
X_val_text = val_df["text"].fillna("")
X_test_text = test_df["text"].fillna("")
# Target labels for each split.
y_train, y_val, y_test = train_df["label"], val_df["label"], test_df["label"]
print("✅ Data loaded successfully")
# ===============================
# TF-IDF FEATURES
# ===============================
print("🧠 Creating TF-IDF features...")
# Word-level unigrams/bigrams capture vocabulary; char 3-5-grams add
# robustness to typos/OCR noise and sub-word patterns.
word_vectorizer = TfidfVectorizer(
    max_features=40000, ngram_range=(1, 2), stop_words="english"
)
char_vectorizer = TfidfVectorizer(
    analyzer="char", ngram_range=(3, 5), max_features=20000
)
# Fit on the training text only; val/test are transform-only so no
# information leaks across splits.
train_word = word_vectorizer.fit_transform(X_train_text)
train_char = char_vectorizer.fit_transform(X_train_text)
# Stack word + char matrices side by side for each split.
X_train_text_features = hstack([train_word, train_char])
X_val_text_features = hstack(
    [word_vectorizer.transform(X_val_text), char_vectorizer.transform(X_val_text)]
)
X_test_text_features = hstack(
    [word_vectorizer.transform(X_test_text), char_vectorizer.transform(X_test_text)]
)
print("✅ Text features ready")
# ===============================
# NUMERIC FEATURES
# ===============================
print("🔢 Adding numeric features...")
# Hand-crafted count features already present in the processed CSVs.
numeric_cols = [
    "char_count", "digit_count", "uppercase_count", "currency_count", "line_count"
]
# Standardize (statistics fit on train only), then convert to sparse so the
# numeric columns can sit next to the TF-IDF matrices.
scaler = StandardScaler()
X_train_num = csr_matrix(scaler.fit_transform(train_df[numeric_cols]))
X_val_num = csr_matrix(scaler.transform(val_df[numeric_cols]))
X_test_num = csr_matrix(scaler.transform(test_df[numeric_cols]))
# Final design matrices: [word tf-idf | char tf-idf | scaled numeric].
X_train = hstack([X_train_text_features, X_train_num])
X_val = hstack([X_val_text_features, X_val_num])
X_test = hstack([X_test_text_features, X_test_num])
print("✅ Feature matrix ready")
# ===============================
# MODEL
# ===============================
print("πŸš€ Starting training...")
N_ESTIMATORS = 400
class TqdmCallback(xgb.callback.TrainingCallback):
    """XGBoost training callback that drives a tqdm progress bar.

    Ticks the bar once per boosting round and closes it when training ends.
    """

    def __init__(self, total):
        # `total` is the planned number of boosting rounds (n_estimators).
        self._bar = tqdm(total=total, desc="Training Progress", unit="trees")

    def after_iteration(self, model, epoch, evals_log):
        self._bar.update(1)
        # False tells XGBoost to continue training.
        return False

    def after_training(self, model):
        self._bar.close()
        # The callback API requires the model to be returned here.
        return model
# Gradient-boosted trees over the sparse hybrid feature matrix.
booster_params = {
    "n_estimators": N_ESTIMATORS,
    "max_depth": 6,
    "learning_rate": 0.1,
    "tree_method": "hist",        # histogram algorithm: fast on wide sparse data
    "eval_metric": "mlogloss",
    "early_stopping_rounds": 30,  # stop once val loss stalls for 30 rounds
}
model = xgb.XGBClassifier(callbacks=[TqdmCallback(N_ESTIMATORS)], **booster_params)
start_time = time.time()
# Track loss on both splits; the LAST eval_set entry (val) drives early stopping.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,
)
print(f"\n⏱ Training completed in {round(time.time() - start_time, 2)} seconds")
# ===============================
# EVALUATION
# ===============================
# Per-class precision/recall/F1 on the held-out splits.
print("\n📊 Validation Performance:")
print(classification_report(y_val, model.predict(X_val)))
print("\n📊 Test Performance:")
print(classification_report(y_test, model.predict(X_test)))
# ===============================
# TRAINING CURVE
# ===============================
# evals_result() holds per-round metrics keyed by eval_set order:
# validation_0 = train split, validation_1 = val split.
history = model.evals_result()
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history["validation_0"]["mlogloss"], label="Train Loss")
ax.plot(history["validation_1"]["mlogloss"], label="Validation Loss")
ax.set_xlabel("Boosting Rounds")
ax.set_ylabel("Log Loss")
ax.set_title("Training Curve")
ax.legend()
fig.savefig("training_curve.png", dpi=150, bbox_inches="tight")
plt.close(fig)
print("📈 Training curve saved to training_curve.png")
# ===============================
# FEATURE IMPORTANCE
# ===============================
# Fix: the original created a figure via plt.figure(figsize=(10, 8)) and then
# called xgb.plot_importance() WITHOUT an ax — plot_importance creates its own
# new figure in that case, so the (10, 8) sizing never applied and the empty
# figure was never closed (leaked). Pass the axes explicitly instead.
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=20, ax=ax)
ax.set_title("Top 20 Important Features")
fig.savefig("feature_importance.png", dpi=150, bbox_inches="tight")
plt.close(fig)
print("📊 Feature importance saved to feature_importance.png")
# ===============================
# SAVE MODEL
# ===============================
# The TqdmCallback holds an open file handle (TextIOWrapper) that
# joblib/pickle cannot serialize — drop the callbacks before dumping.
model.set_params(callbacks=[])
# Bundle everything inference needs: the booster plus the fitted
# text vectorizers and the numeric-feature scaler.
artifacts = {
    "model": model,
    "word_vectorizer": word_vectorizer,
    "char_vectorizer": char_vectorizer,
    "scaler": scaler,
}
joblib.dump(artifacts, MODEL_SAVE_PATH)
print(f"\n💾 Model saved to {MODEL_SAVE_PATH}")
print("🔥 All done!")