"""Train an XGBoost document classifier on TF-IDF and numeric features.

The script runs top to bottom:

1. Load the train/validation/test CSV splits.
2. Build word-level and character-level TF-IDF features from the ``text``
   column and standard-scale a fixed set of numeric columns.
3. Fit an ``XGBClassifier`` with early stopping, using the validation split
   as the last evaluation set.
4. Print classification reports for the validation and test splits.
5. Save the training-curve and feature-importance plots as PNG files.
6. Persist the model together with the fitted vectorizers and scaler.
"""

import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# ===============================
# PATHS
# ===============================

TRAIN_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/train.csv"
VAL_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/val.csv"
TEST_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/test.csv"

MODEL_SAVE_PATH = "document_classifier_xgb.pkl"

# ===============================
# LOAD DATA
# ===============================

print("📂 Loading data...")

train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

# Missing text becomes an empty string so the vectorizers never see NaN.
X_train_text = train_df["text"].fillna("")
X_val_text = val_df["text"].fillna("")
X_test_text = test_df["text"].fillna("")

# NOTE(review): recent XGBoost releases expect class labels encoded as
# consecutive integers starting at 0 -- confirm the "label" column already
# has that form, otherwise encode it before fitting.
y_train = train_df["label"]
y_val = val_df["label"]
y_test = test_df["label"]

print("✅ Data loaded successfully")

# ===============================
# TF-IDF FEATURES
# ===============================

print("🧠 Creating TF-IDF features...")

word_vectorizer = TfidfVectorizer(
    max_features=40000,
    ngram_range=(1, 2),
    stop_words="english"
)

char_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=20000
)

# Vocabularies are fitted on the training split only; the validation and
# test splits are transformed with the already-fitted vectorizers.
X_train_word = word_vectorizer.fit_transform(X_train_text)
X_val_word = word_vectorizer.transform(X_val_text)
X_test_word = word_vectorizer.transform(X_test_text)

X_train_char = char_vectorizer.fit_transform(X_train_text)
X_val_char = char_vectorizer.transform(X_val_text)
X_test_char = char_vectorizer.transform(X_test_text)

X_train_text_features = hstack([X_train_word, X_train_char])
X_val_text_features = hstack([X_val_word, X_val_char])
X_test_text_features = hstack([X_test_word, X_test_char])

print("✅ Text features ready")

# ===============================
# NUMERIC FEATURES
# ===============================

print("🔢 Adding numeric features...")

numeric_cols = [
    "char_count",
    "digit_count",
    "uppercase_count",
    "currency_count",
    "line_count"
]

# As with the vectorizers, the scaler is fitted on the training split only.
scaler = StandardScaler()

X_train_num = scaler.fit_transform(train_df[numeric_cols])
X_val_num = scaler.transform(val_df[numeric_cols])
X_test_num = scaler.transform(test_df[numeric_cols])

# Wrap the dense scaled arrays as sparse matrices so they can be stacked
# next to the sparse TF-IDF blocks.
X_train_num = csr_matrix(X_train_num)
X_val_num = csr_matrix(X_val_num)
X_test_num = csr_matrix(X_test_num)

# Combine text + numeric
X_train = hstack([X_train_text_features, X_train_num])
X_val = hstack([X_val_text_features, X_val_num])
X_test = hstack([X_test_text_features, X_test_num])

print("✅ Feature matrix ready")

# ===============================
# MODEL
# ===============================

print("🚀 Starting training...")

N_ESTIMATORS = 400


class TqdmCallback(xgb.callback.TrainingCallback):
    """Advance a tqdm progress bar by one step per boosting round."""

    def __init__(self, total):
        """Create the progress bar with ``total`` expected boosting rounds."""
        super().__init__()
        self.pbar = tqdm(total=total, desc="Training Progress", unit="trees")

    def after_iteration(self, model, epoch, evals_log):
        """Tick the bar once; returning False never requests a stop."""
        self.pbar.update(1)
        return False

    def after_training(self, model):
        """Close the bar and hand the model back unchanged."""
        self.pbar.close()
        return model


model = xgb.XGBClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=6,
    learning_rate=0.1,
    tree_method="hist",
    eval_metric="mlogloss",
    early_stopping_rounds=30,
    callbacks=[TqdmCallback(N_ESTIMATORS)]
)

start_time = time.time()

# The validation split is listed last in eval_set; the training split is
# included as well so both loss curves can be plotted afterwards.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False
)

print(f"\n⏱ Training completed in {round(time.time() - start_time, 2)} seconds")

# ===============================
# EVALUATION
# ===============================

print("\n📊 Validation Performance:")
val_preds = model.predict(X_val)
print(classification_report(y_val, val_preds))

print("\n📊 Test Performance:")
test_preds = model.predict(X_test)
print(classification_report(y_test, test_preds))

# ===============================
# TRAINING CURVE
# ===============================

# "validation_0" / "validation_1" follow the order of eval_set above:
# training split first, validation split second.
results = model.evals_result()

train_loss = results["validation_0"]["mlogloss"]
val_loss = results["validation_1"]["mlogloss"]

plt.figure(figsize=(8, 5))
plt.plot(train_loss, label="Train Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Boosting Rounds")
plt.ylabel("Log Loss")
plt.title("Training Curve")
plt.legend()
plt.savefig("training_curve.png", dpi=150, bbox_inches="tight")
plt.close()
print("📈 Training curve saved to training_curve.png")

# ===============================
# FEATURE IMPORTANCE
# ===============================

# plot_importance creates its own figure when no axes are supplied, which
# would leave a separately created figure empty and unclosed and drop the
# requested size. Passing the axes explicitly keeps the plot on this figure.
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=20, ax=importance_ax)
importance_ax.set_title("Top 20 Important Features")
importance_fig.savefig("feature_importance.png", dpi=150, bbox_inches="tight")
plt.close(importance_fig)
print("📊 Feature importance saved to feature_importance.png")

# ===============================
# SAVE MODEL
# ===============================

# Clear callbacks before saving — TqdmCallback holds an open file handle
# (TextIOWrapper) that joblib/pickle cannot serialize.
model.set_params(callbacks=[])

# The fitted vectorizers and scaler are stored alongside the model so the
# same preprocessing can be reapplied when the artifact is loaded.
joblib.dump({
    "model": model,
    "word_vectorizer": word_vectorizer,
    "char_vectorizer": char_vectorizer,
    "scaler": scaler
}, MODEL_SAVE_PATH)

print(f"\n💾 Model saved to {MODEL_SAVE_PATH}")
print("🔥 All done!")