| | import time |
| | import joblib |
| | import pandas as pd |
| | import numpy as np |
| | import xgboost as xgb |
| | import matplotlib.pyplot as plt |
| |
|
| | from tqdm.auto import tqdm |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.metrics import classification_report |
| | from sklearn.preprocessing import StandardScaler |
| | from sklearn.metrics import confusion_matrix |
| | from scipy.sparse import hstack, csr_matrix |
| |
|
| | |
| | |
| | |
| |
|
# Pre-split, preprocessed dataset locations. Each CSV must contain a "text"
# column, a "label" column, and the engineered numeric columns used below.
TRAIN_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/train.csv"
VAL_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/val.csv"
TEST_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/test.csv"

# Output path for the serialized bundle (model + fitted vectorizers + scaler).
MODEL_SAVE_PATH = "document_classifier_xgb.pkl"
| |
|
| | |
| | |
| | |
| |
|
print("Loading data...")

# Load the pre-split train/validation/test CSVs produced by the
# preprocessing pipeline.
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

# Replace missing documents with the empty string so the TF-IDF
# vectorizers downstream never receive NaN.
X_train_text = train_df["text"].fillna("")
X_val_text = val_df["text"].fillna("")
X_test_text = test_df["text"].fillna("")

y_train = train_df["label"]
y_val = val_df["label"]
y_test = test_df["label"]

# Fix: the original message was mojibake and its string literal was broken
# across two lines (a SyntaxError); repaired to plain ASCII.
print("Data loaded successfully")
| |
|
| | |
| | |
| | |
| |
|
print("Creating TF-IDF features...")

# Word-level TF-IDF: unigrams + bigrams, English stop words removed.
word_vectorizer = TfidfVectorizer(
    max_features=40000,
    ngram_range=(1, 2),
    stop_words="english",
)

# Character-level TF-IDF (3-5 grams): captures sub-word patterns that
# survive typos / OCR noise in the documents.
char_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=20000,
)

# Fit vocabularies on train only; val/test are transformed with the fitted
# vocabularies to avoid leakage.
X_train_word = word_vectorizer.fit_transform(X_train_text)
X_val_word = word_vectorizer.transform(X_val_text)
X_test_word = word_vectorizer.transform(X_test_text)

X_train_char = char_vectorizer.fit_transform(X_train_text)
X_val_char = char_vectorizer.transform(X_val_text)
X_test_char = char_vectorizer.transform(X_test_text)

# Concatenate word- and char-level features column-wise (sparse, no copy
# to dense).
X_train_text_features = hstack([X_train_word, X_train_char])
X_val_text_features = hstack([X_val_word, X_val_char])
X_test_text_features = hstack([X_test_word, X_test_char])

# Fix: original success message was mojibake with the string literal split
# across two lines (SyntaxError); repaired to plain ASCII.
print("Text features ready")
| |
|
| | |
| | |
| | |
| |
|
print("Adding numeric features...")

# Engineered document-statistics columns expected in all three CSVs.
numeric_cols = [
    "char_count",
    "digit_count",
    "uppercase_count",
    "currency_count",
    "line_count",
]

scaler = StandardScaler()

# Fit the scaler on train only; val/test reuse the train mean/variance.
X_train_num = scaler.fit_transform(train_df[numeric_cols])
X_val_num = scaler.transform(val_df[numeric_cols])
X_test_num = scaler.transform(test_df[numeric_cols])

# Convert the dense scaled arrays to CSR so they can be hstacked with the
# sparse TF-IDF matrices without densifying everything.
X_train_num = csr_matrix(X_train_num)
X_val_num = csr_matrix(X_val_num)
X_test_num = csr_matrix(X_test_num)

# Final feature matrices: [word TF-IDF | char TF-IDF | scaled numeric].
X_train = hstack([X_train_text_features, X_train_num])
X_val = hstack([X_val_text_features, X_val_num])
X_test = hstack([X_test_text_features, X_test_num])

# Fix: original message was mojibake with the string literal split across
# two lines (SyntaxError); repaired to plain ASCII.
print("Feature matrix ready")
| | |
| | |
| | |
| |
|
# Fix: original banner contained a mojibake character; repaired to ASCII.
print("Starting training...")

# Upper bound on boosting rounds; early stopping (below) may end sooner.
N_ESTIMATORS = 400
| |
|
class TqdmCallback(xgb.callback.TrainingCallback):
    """XGBoost training callback that drives a tqdm progress bar.

    The bar advances one tick per boosting round and is closed when
    training finishes. If early stopping fires, the bar simply stops
    short of its total.
    """

    def __init__(self, total):
        # Bar length equals the requested number of boosting rounds.
        self._bar = tqdm(total=total, desc="Training Progress", unit="trees")

    def after_iteration(self, model, epoch, evals_log):
        """Tick the bar once; returning False tells XGBoost to keep going."""
        self._bar.update(1)
        return False

    def after_training(self, model):
        """Close the bar and return the model unchanged, as the API requires."""
        self._bar.close()
        return model
| |
|
model = xgb.XGBClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=6,
    learning_rate=0.1,
    tree_method="hist",  # histogram algorithm: fast on large sparse input
    eval_metric="mlogloss",
    early_stopping_rounds=30,  # stop if val loss fails to improve for 30 rounds
    callbacks=[TqdmCallback(N_ESTIMATORS)],
)

# perf_counter() is monotonic and higher-resolution than time.time(),
# so it is the right clock for measuring elapsed wall time.
start_time = time.perf_counter()

model.fit(
    X_train,
    y_train,
    # evals_result keys: "validation_0" = train, "validation_1" = val.
    # The LAST eval set is the one early stopping monitors.
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,  # tqdm callback provides progress; silence per-round logs
)

# Fix: original message contained mojibake; repaired to plain ASCII.
print(f"\nTraining completed in {time.perf_counter() - start_time:.2f} seconds")
| |
|
| | |
| | |
| | |
| |
|
# Fix: section headers contained mojibake characters; repaired to ASCII.
# With early_stopping_rounds set, predict() automatically uses the best
# iteration found during training.
print("\nValidation Performance:")
val_preds = model.predict(X_val)
print(classification_report(y_val, val_preds))

print("\nTest Performance:")
test_preds = model.predict(X_test)
print(classification_report(y_test, test_preds))
| |
|
| | |
| | |
| | |
| |
|
# Per-round multiclass log-loss recorded during fit() for each eval set.
results = model.evals_result()

train_loss = results["validation_0"]["mlogloss"]  # eval_set[0] = train
val_loss = results["validation_1"]["mlogloss"]    # eval_set[1] = val

plt.figure(figsize=(8, 5))
plt.plot(train_loss, label="Train Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Boosting Rounds")
plt.ylabel("Log Loss")
plt.title("Training Curve")
plt.legend()
plt.savefig("training_curve.png", dpi=150, bbox_inches="tight")
plt.close()
# Fix: original message contained a mojibake character; repaired to ASCII.
print("Training curve saved to training_curve.png")
| |
|
| | |
| | |
| | |
| |
|
# Bug fix: xgb.plot_importance() creates its OWN axes when `ax` is not
# supplied, so the original plt.figure(figsize=(10, 8)) produced a leaked,
# empty figure — the figsize and title were applied to the wrong figure.
# Create the axes explicitly and pass them in instead.
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, ax=ax, max_num_features=20)
ax.set_title("Top 20 Important Features")
fig.savefig("feature_importance.png", dpi=150, bbox_inches="tight")
plt.close(fig)
print("Feature importance saved to feature_importance.png")
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | model.set_params(callbacks=[]) |
| |
|
| | joblib.dump({ |
| | "model": model, |
| | "word_vectorizer": word_vectorizer, |
| | "char_vectorizer": char_vectorizer, |
| | "scaler": scaler |
| | }, MODEL_SAVE_PATH) |
| |
|
| | print(f"\nπΎ Model saved to {MODEL_SAVE_PATH}") |
| | print("π₯ All done!") |