"""
Resume Section Classifier – Training Script

Fine-tunes distilbert-base-uncased for classifying resume text sections
into 8 categories: education, experience, skills, projects, summary,
certifications, contact, awards.

Author: Lorenzo Scaturchio (gr8monk3ys)

Usage:
    python train.py                           # Train with defaults
    python train.py --epochs 5 --batch-size 32
    python train.py --push-to-hub             # Push to HuggingFace Hub
    python train.py --output-dir ./my_model
"""

import json
import logging
import sys
from pathlib import Path
from typing import Optional

import evaluate
import numpy as np
import torch
from datasets import DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from data_generator import generate_dataset, get_label_mapping, load_as_hf_dataset

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MODEL_NAME = "distilbert-base-uncased"
DEFAULT_OUTPUT_DIR = "./model_output"
DEFAULT_LOGGING_DIR = "./logs"
HUB_MODEL_ID = "gr8monk3ys/resume-section-classifier"
MAX_LENGTH = 256


# ---------------------------------------------------------------------------
# Metrics computation
# ---------------------------------------------------------------------------
def build_compute_metrics():
    """Build a compute_metrics closure; the ``evaluate`` metrics are loaded once up front."""
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        acc = accuracy_metric.compute(predictions=predictions, references=labels)
        f1_macro = f1_metric.compute(predictions=predictions, references=labels, average="macro")
        f1_weighted = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
        precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
        recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")

        return {
            "accuracy": acc["accuracy"],
            "f1_macro": f1_macro["f1"],
            "f1_weighted": f1_weighted["f1"],
            "precision": precision["precision"],
            "recall": recall["recall"],
        }

    return compute_metrics
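
# Quick sanity check for the closure (a sketch, not part of training; the
# identity "logits" below are synthetic, not real model output):
#
#     fn = build_compute_metrics()
#     fn((np.eye(8), np.arange(8)))  # perfect predictions -> every metric 1.0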


# ---------------------------------------------------------------------------
# Tokenization
# ---------------------------------------------------------------------------
def tokenize_dataset(dataset_dict: DatasetDict, tokenizer, label2id: dict, max_length: int = MAX_LENGTH):
    """Tokenize all splits and encode labels as integers."""

    def preprocess(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding=False,  # Dynamic padding via DataCollator
        )
        tokenized["labels"] = [label2id[label] for label in examples["label"]]
        return tokenized

    tokenized = dataset_dict.map(
        preprocess,
        batched=True,
        remove_columns=["text", "label"],
        desc="Tokenizing",
    )

    return tokenized
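
# Illustrative sketch (not executed): one resume line through the tokenizer;
# the sample text is made up, and real token ids depend on the vocab:
#
#     tok = AutoTokenizer.from_pretrained(MODEL_NAME)
#     tok("Led a team of 5 engineers", truncation=True, max_length=MAX_LENGTH)
#     # -> {"input_ids": [101, ...], "attention_mask": [1, ...]}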


# ---------------------------------------------------------------------------
# Main training function
# ---------------------------------------------------------------------------
def train(
    output_dir: str = DEFAULT_OUTPUT_DIR,
    model_name: str = MODEL_NAME,
    epochs: int = 4,
    batch_size: int = 16,
    learning_rate: float = 2e-5,
    weight_decay: float = 0.01,
    warmup_ratio: float = 0.1,
    max_length: int = MAX_LENGTH,
    examples_per_category: int = 80,
    augmented_copies: int = 2,
    seed: int = 42,
    push_to_hub: bool = False,
    hub_model_id: str = HUB_MODEL_ID,
    fp16: Optional[bool] = None,
    gradient_accumulation_steps: int = 1,
    early_stopping_patience: int = 3,
):
    """
    Full training pipeline.

    Args:
        output_dir: Directory to save model and artifacts.
        model_name: Pretrained model identifier.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        learning_rate: Peak learning rate.
        weight_decay: Weight decay for AdamW.
        warmup_ratio: Fraction of steps for warmup.
        max_length: Maximum token sequence length.
        examples_per_category: Base synthetic examples per category.
        augmented_copies: Augmented copies per base example.
        seed: Random seed.
        push_to_hub: Whether to push to HuggingFace Hub.
        hub_model_id: Hub model repository ID.
        fp16: Use mixed precision (auto-detected if None).
        gradient_accumulation_steps: Gradient accumulation steps.
        early_stopping_patience: Early stopping patience (epochs).
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Auto-detect fp16: mixed precision is only enabled on CUDA (MPS/CPU train in fp32)
    if fp16 is None:
        fp16 = torch.cuda.is_available()

    logger.info("=" * 60)
    logger.info("Resume Section Classifier – Training")
    logger.info("=" * 60)
    logger.info(f"Model: {model_name}")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Epochs: {epochs}, Batch size: {batch_size}, LR: {learning_rate}")
    logger.info(f"Device: {'CUDA' if torch.cuda.is_available() else 'MPS' if torch.backends.mps.is_available() else 'CPU'}")
    logger.info(f"FP16: {fp16}")

    # ------------------------------------------------------------------
    # 1. Generate synthetic data
    # ------------------------------------------------------------------
    logger.info("\n[1/5] Generating synthetic training data...")
    raw_dataset = generate_dataset(
        examples_per_category=examples_per_category,
        augmented_copies=augmented_copies,
        seed=seed,
    )
    label2id, id2label = get_label_mapping(raw_dataset)
    num_labels = len(label2id)

    logger.info(f"  Total examples: {len(raw_dataset)}")
    logger.info(f"  Labels ({num_labels}): {list(label2id.keys())}")

    # Create HF DatasetDict with train/val/test splits
    dataset_dict = load_as_hf_dataset(raw_dataset)
    logger.info(f"  Train: {len(dataset_dict['train'])}")
    logger.info(f"  Validation: {len(dataset_dict['validation'])}")
    logger.info(f"  Test: {len(dataset_dict['test'])}")

    # ------------------------------------------------------------------
    # 2. Tokenize
    # ------------------------------------------------------------------
    logger.info("\n[2/5] Loading tokenizer and tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = tokenize_dataset(dataset_dict, tokenizer, label2id, max_length)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # ------------------------------------------------------------------
    # 3. Load model
    # ------------------------------------------------------------------
    logger.info("\n[3/5] Loading pretrained model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    logger.info(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"  Trainable: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ------------------------------------------------------------------
    # 4. Training
    # ------------------------------------------------------------------
    logger.info("\n[4/5] Training...")

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        # Training hyperparameters
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type="cosine",
        # Evaluation
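        # NOTE: "eval_strategy" is the transformers >= 4.41 spelling; earlier
        # releases expect "evaluation_strategy" (adjust to your pinned version)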
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        # Logging
        logging_dir=DEFAULT_LOGGING_DIR,
        logging_strategy="steps",
        logging_steps=50,
        report_to="none",
        # Efficiency
        fp16=fp16,
        dataloader_num_workers=0,
        # Reproducibility
        seed=seed,
        data_seed=seed,
        # Hub
        push_to_hub=False,  # We'll push manually after evaluation
        # Misc
        save_total_limit=3,
        disable_tqdm=False,
    )

    callbacks = []
    if early_stopping_patience > 0:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=build_compute_metrics(),
        callbacks=callbacks,
    )

    train_result = trainer.train()

    # Log training metrics
    logger.info("\nTraining Results:")
    for key, value in train_result.metrics.items():
        logger.info(f"  {key}: {value}")

    # ------------------------------------------------------------------
    # 5. Evaluation
    # ------------------------------------------------------------------
    logger.info("\n[5/5] Evaluating on test set...")
    test_results = trainer.evaluate(tokenized_dataset["test"])

    logger.info("\nTest Results:")
    for key, value in test_results.items():
        logger.info(f"  {key}: {value:.4f}" if isinstance(value, float) else f"  {key}: {value}")

    # ------------------------------------------------------------------
    # Save artifacts
    # ------------------------------------------------------------------
    logger.info("\nSaving model and artifacts...")

    # Save model + tokenizer
    final_path = output_path / "final_model"
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    # Save label mapping
    label_mapping = {
        "label2id": label2id,
        "id2label": {str(k): v for k, v in id2label.items()},
        "labels": list(label2id.keys()),
    }
    with open(final_path / "label_mapping.json", "w") as f:
        json.dump(label_mapping, f, indent=2)
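
    # Resulting JSON shape (a sketch; the actual label order comes from
    # get_label_mapping):
    #   {"label2id": {"education": 0, ...},
    #    "id2label": {"0": "education", ...},
    #    "labels": ["education", ...]}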

    # Save training config
    train_config = {
        "model_name": model_name,
        "max_length": max_length,
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
        "examples_per_category": examples_per_category,
        "augmented_copies": augmented_copies,
        "seed": seed,
        "num_labels": num_labels,
        "train_size": len(dataset_dict["train"]),
        "val_size": len(dataset_dict["validation"]),
        "test_size": len(dataset_dict["test"]),
    }
    with open(final_path / "training_config.json", "w") as f:
        json.dump(train_config, f, indent=2)

    # Save metrics
    all_metrics = {
        "train": train_result.metrics,
        "test": test_results,
    }
    with open(final_path / "metrics.json", "w") as f:
        json.dump(all_metrics, f, indent=2)

    logger.info(f"\nAll artifacts saved to: {final_path}")

    # ------------------------------------------------------------------
    # Optional: Push to Hub
    # ------------------------------------------------------------------
    if push_to_hub:
        logger.info(f"\nPushing to HuggingFace Hub: {hub_model_id}")
        try:
            # Trainer.push_to_hub() takes no repo_id argument (the target repo
            # comes from TrainingArguments.hub_model_id), so push the best
            # model weights directly to the requested repo instead.
            trainer.model.push_to_hub(
                hub_model_id,
                commit_message="Upload fine-tuned resume section classifier",
            )
            tokenizer.push_to_hub(hub_model_id)
            logger.info("Successfully pushed to Hub!")
        except Exception as e:
            logger.error(f"Failed to push to Hub: {e}")
            logger.info("You can push manually later with:")
            logger.info(f"  huggingface-cli upload {hub_model_id} {final_path}")

    logger.info("\nTraining complete!")
    return test_results


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Fine-tune DistilBERT for resume section classification",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Model & output
    parser.add_argument("--model-name", type=str, default=MODEL_NAME,
                        help="Pretrained model name or path")
    parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR,
                        help="Output directory for model and artifacts")

    # Training hyperparameters
    parser.add_argument("--epochs", type=int, default=4,
                        help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, default=16,
                        help="Training batch size per device")
    parser.add_argument("--learning-rate", type=float, default=2e-5,
                        help="Peak learning rate")
    parser.add_argument("--weight-decay", type=float, default=0.01,
                        help="Weight decay for AdamW")
    parser.add_argument("--warmup-ratio", type=float, default=0.1,
                        help="Fraction of total steps for linear warmup")
    parser.add_argument("--max-length", type=int, default=MAX_LENGTH,
                        help="Maximum token sequence length")
    parser.add_argument("--gradient-accumulation-steps", type=int, default=1,
                        help="Number of gradient accumulation steps")

    # Data
    parser.add_argument("--examples-per-category", type=int, default=80,
                        help="Base synthetic examples per category")
    parser.add_argument("--augmented-copies", type=int, default=2,
                        help="Augmented copies per base example")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")

    # Training config
    parser.add_argument("--fp16", action="store_true", default=None,
                        help="Force FP16 training")
    parser.add_argument("--no-fp16", action="store_true",
                        help="Disable FP16 training")
    parser.add_argument("--early-stopping-patience", type=int, default=3,
                        help="Early stopping patience (0 to disable)")

    # Hub
    parser.add_argument("--push-to-hub", action="store_true",
                        help="Push trained model to HuggingFace Hub")
    parser.add_argument("--hub-model-id", type=str, default=HUB_MODEL_ID,
                        help="HuggingFace Hub model ID")

    args = parser.parse_args()

    # Handle fp16 flags
    fp16 = args.fp16
    if args.no_fp16:
        fp16 = False

    results = train(
        output_dir=args.output_dir,
        model_name=args.model_name,
        epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        max_length=args.max_length,
        examples_per_category=args.examples_per_category,
        augmented_copies=args.augmented_copies,
        seed=args.seed,
        push_to_hub=args.push_to_hub,
        hub_model_id=args.hub_model_id,
        fp16=fp16,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        early_stopping_patience=args.early_stopping_patience,
    )
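
    # train() returns the test-set metrics; Trainer prefixes them with "eval_"
    logger.info(f"Final test f1_macro: {results.get('eval_f1_macro', 'n/a')}")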