gr8monk3ys's picture
Upload folder using huggingface_hub
9b02fb4 verified
"""
Resume Section Classifier – Training Script
Fine-tunes distilbert-base-uncased for classifying resume text sections
into 8 categories: education, experience, skills, projects, summary,
certifications, contact, awards.
Author: Lorenzo Scaturchio (gr8monk3ys)
Usage:
python train.py # Train with defaults
python train.py --epochs 5 --batch-size 32
python train.py --push-to-hub # Push to HuggingFace Hub
python train.py --output-dir ./my_model
"""
import json
import logging
import os
import sys
from pathlib import Path
import evaluate
import numpy as np
import torch
from datasets import DatasetDict
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
EarlyStoppingCallback,
Trainer,
TrainingArguments,
)
from data_generator import generate_dataset, get_label_mapping, load_as_hf_dataset
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Send every record at INFO and above to stdout, so progress messages appear
# on the same stream as the rest of the script's console output.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[_stdout_handler],
    level=logging.INFO,
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Pretrained checkpoint that is fine-tuned; default for train(model_name=...) and --model-name.
MODEL_NAME = "distilbert-base-uncased"
# Default directory for checkpoints and the saved final model (--output-dir).
DEFAULT_OUTPUT_DIR = "./model_output"
# Passed to TrainingArguments as logging_dir.
DEFAULT_LOGGING_DIR = "./logs"
# Default Hugging Face Hub repository ID used when pushing (--hub-model-id).
HUB_MODEL_ID = "gr8monk3ys/resume-section-classifier"
# Default max_length, in tokens, handed to the tokenizer with truncation enabled.
MAX_LENGTH = 256
# ---------------------------------------------------------------------------
# Metrics computation
# ---------------------------------------------------------------------------
def build_compute_metrics(id2label: dict):
    """Create the ``compute_metrics`` callback that is handed to the Trainer.

    The four metrics (accuracy, f1, precision, recall) are loaded once via
    ``evaluate.load`` when this factory is called; the returned closure
    reuses them on every evaluation pass.

    Args:
        id2label: Mapping from class index to label name. Part of the
            interface, but not read by the metrics computed here.

    Returns:
        A function that takes an ``(logits, labels)`` pair and returns a dict
        with the keys ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision`` and ``recall`` (precision and recall are
        weighted averages).
    """
    scorers = {
        metric_name: evaluate.load(metric_name)
        for metric_name in ("accuracy", "f1", "precision", "recall")
    }

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # The predicted class is the index of the largest logit on the last axis.
        predicted = np.argmax(logits, axis=-1)

        def score(metric_name, **options):
            # Each metric reports its value under a key equal to its own name.
            outcome = scorers[metric_name].compute(
                predictions=predicted, references=labels, **options
            )
            return outcome[metric_name]

        return {
            "accuracy": score("accuracy"),
            "f1_macro": score("f1", average="macro"),
            "f1_weighted": score("f1", average="weighted"),
            "precision": score("precision", average="weighted"),
            "recall": score("recall", average="weighted"),
        }

    return compute_metrics
# ---------------------------------------------------------------------------
# Tokenization
# ---------------------------------------------------------------------------
def tokenize_dataset(dataset_dict: DatasetDict, tokenizer, label2id: dict, max_length: int = MAX_LENGTH):
    """Tokenize every split and replace string labels with integer ids.

    Args:
        dataset_dict: Splits whose rows carry a ``text`` and a ``label`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        label2id: Mapping from label string to integer class id.
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        The result of ``dataset_dict.map``: the ``text`` and ``label`` columns
        are removed and replaced by the tokenizer's outputs plus an integer
        ``labels`` column.
    """

    def encode_batch(batch):
        # No padding here: sequences are padded per batch later by the collator.
        encoded = tokenizer(
            batch["text"],
            truncation=True,
            max_length=max_length,
            padding=False,
        )
        encoded["labels"] = [label2id[name] for name in batch["label"]]
        return encoded

    return dataset_dict.map(
        encode_batch,
        batched=True,
        remove_columns=["text", "label"],
        desc="Tokenizing",
    )
# ---------------------------------------------------------------------------
# Main training function
# ---------------------------------------------------------------------------
def train(
    output_dir: str = DEFAULT_OUTPUT_DIR,
    model_name: str = MODEL_NAME,
    epochs: int = 4,
    batch_size: int = 16,
    learning_rate: float = 2e-5,
    weight_decay: float = 0.01,
    warmup_ratio: float = 0.1,
    max_length: int = MAX_LENGTH,
    examples_per_category: int = 80,
    augmented_copies: int = 2,
    seed: int = 42,
    push_to_hub: bool = False,
    hub_model_id: str = HUB_MODEL_ID,
    fp16: bool = None,
    gradient_accumulation_steps: int = 1,
    early_stopping_patience: int = 3,
):
    """
    Full training pipeline.

    Steps: generate synthetic data, tokenize it, load the pretrained model,
    train with per-epoch evaluation (best checkpoint selected by macro F1),
    evaluate on the test split, write the model plus JSON artifacts to
    ``<output_dir>/final_model`` and optionally push to the Hub.

    Args:
        output_dir: Directory to save model and artifacts (created if missing).
        model_name: Pretrained model identifier.
        epochs: Number of training epochs.
        batch_size: Training batch size (evaluation uses twice this value).
        learning_rate: Peak learning rate.
        weight_decay: Weight decay for AdamW.
        warmup_ratio: Fraction of steps for warmup.
        max_length: Maximum token sequence length.
        examples_per_category: Base synthetic examples per category.
        augmented_copies: Augmented copies per base example.
        seed: Random seed.
        push_to_hub: Whether to push to HuggingFace Hub after evaluation.
        hub_model_id: Hub model repository ID.
        fp16: Use mixed precision. ``None`` means auto-detect: enabled
            exactly when CUDA is available.
        gradient_accumulation_steps: Gradient accumulation steps.
        early_stopping_patience: Early stopping patience (epochs); a value
            of 0 or less disables early stopping.

    Returns:
        The metrics dict returned by ``trainer.evaluate`` on the test split.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Auto-detect fp16
    if fp16 is None:
        fp16 = torch.cuda.is_available()

    # Guard the MPS probe: torch builds without ``torch.backends.mps`` would
    # otherwise raise AttributeError from a purely informational log line.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device_name = "CUDA"
    elif mps_backend is not None and mps_backend.is_available():
        device_name = "MPS"
    else:
        device_name = "CPU"

    logger.info("=" * 60)
    logger.info("Resume Section Classifier – Training")
    logger.info("=" * 60)
    logger.info(f"Model: {model_name}")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Epochs: {epochs}, Batch size: {batch_size}, LR: {learning_rate}")
    logger.info(f"Device: {device_name}")
    logger.info(f"FP16: {fp16}")

    # ------------------------------------------------------------------
    # 1. Generate synthetic data
    # ------------------------------------------------------------------
    logger.info("\n[1/5] Generating synthetic training data...")
    raw_dataset = generate_dataset(
        examples_per_category=examples_per_category,
        augmented_copies=augmented_copies,
        seed=seed,
    )
    label2id, id2label = get_label_mapping(raw_dataset)
    num_labels = len(label2id)
    logger.info(f" Total examples: {len(raw_dataset)}")
    logger.info(f" Labels ({num_labels}): {list(label2id.keys())}")

    # Create HF DatasetDict with train/val/test splits
    dataset_dict = load_as_hf_dataset(raw_dataset)
    logger.info(f" Train: {len(dataset_dict['train'])}")
    logger.info(f" Validation: {len(dataset_dict['validation'])}")
    logger.info(f" Test: {len(dataset_dict['test'])}")

    # ------------------------------------------------------------------
    # 2. Tokenize
    # ------------------------------------------------------------------
    logger.info("\n[2/5] Loading tokenizer and tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = tokenize_dataset(dataset_dict, tokenizer, label2id, max_length)
    # Sequences are left unpadded by tokenize_dataset; pad per batch here.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # ------------------------------------------------------------------
    # 3. Load model
    # ------------------------------------------------------------------
    logger.info("\n[3/5] Loading pretrained model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    logger.info(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f" Trainable: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ------------------------------------------------------------------
    # 4. Training
    # ------------------------------------------------------------------
    logger.info("\n[4/5] Training...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        # Training hyperparameters
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type="cosine",
        # Evaluation
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        # Logging
        logging_dir=DEFAULT_LOGGING_DIR,
        logging_strategy="steps",
        logging_steps=50,
        report_to="none",
        # Efficiency
        fp16=fp16,
        dataloader_num_workers=0,
        # Reproducibility
        seed=seed,
        data_seed=seed,
        # Hub
        push_to_hub=False,  # We'll push manually after evaluation
        # The target repo must be configured here: Trainer.push_to_hub()
        # reads it from the training arguments and has no repo_id parameter.
        hub_model_id=hub_model_id,
        # Misc
        save_total_limit=3,
        disable_tqdm=False,
    )

    callbacks = []
    if early_stopping_patience > 0:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))

    # NOTE(review): recent transformers releases prefer ``processing_class``
    # over ``tokenizer`` here — confirm against the pinned version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=build_compute_metrics(id2label),
        callbacks=callbacks,
    )
    train_result = trainer.train()

    # Log training metrics
    logger.info("\nTraining Results:")
    for key, value in train_result.metrics.items():
        logger.info(f" {key}: {value}")

    # ------------------------------------------------------------------
    # 5. Evaluation
    # ------------------------------------------------------------------
    logger.info("\n[5/5] Evaluating on test set...")
    test_results = trainer.evaluate(tokenized_dataset["test"])
    logger.info("\nTest Results:")
    for key, value in test_results.items():
        logger.info(f" {key}: {value:.4f}" if isinstance(value, float) else f" {key}: {value}")

    # ------------------------------------------------------------------
    # Save artifacts
    # ------------------------------------------------------------------
    logger.info("\nSaving model and artifacts...")

    # Save model + tokenizer
    final_path = output_path / "final_model"
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    # Save label mapping (JSON object keys must be strings, hence str(k))
    label_mapping = {
        "label2id": label2id,
        "id2label": {str(k): v for k, v in id2label.items()},
        "labels": list(label2id.keys()),
    }
    with open(final_path / "label_mapping.json", "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, indent=2)

    # Save training config
    train_config = {
        "model_name": model_name,
        "max_length": max_length,
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
        "examples_per_category": examples_per_category,
        "augmented_copies": augmented_copies,
        "seed": seed,
        "num_labels": num_labels,
        "train_size": len(dataset_dict["train"]),
        "val_size": len(dataset_dict["validation"]),
        "test_size": len(dataset_dict["test"]),
    }
    with open(final_path / "training_config.json", "w", encoding="utf-8") as f:
        json.dump(train_config, f, indent=2)

    # Save metrics
    all_metrics = {
        "train": train_result.metrics,
        "test": test_results,
    }
    with open(final_path / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(all_metrics, f, indent=2)
    logger.info(f"\nAll artifacts saved to: {final_path}")

    # ------------------------------------------------------------------
    # Optional: Push to Hub
    # ------------------------------------------------------------------
    if push_to_hub:
        logger.info(f"\nPushing to HuggingFace Hub: {hub_model_id}")
        # Best effort: a failed upload must not discard a finished training
        # run, so the error is logged and the local artifacts remain usable.
        try:
            # The destination comes from training_args.hub_model_id. Extra
            # keyword arguments are forwarded to model-card creation, so
            # passing repo_id here would raise a TypeError.
            trainer.push_to_hub(
                commit_message="Upload fine-tuned resume section classifier",
            )
            tokenizer.push_to_hub(hub_model_id)
            logger.info("Successfully pushed to Hub!")
        except Exception as e:
            logger.error(f"Failed to push to Hub: {e}")
            logger.info("You can push manually later with:")
            logger.info(f" huggingface-cli upload {hub_model_id} {final_path}")

    logger.info("\nTraining complete!")
    return test_results
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Fine-tune DistilBERT for resume section classification",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # (flag, add_argument keyword arguments), in the order they appear in --help.
    # Every flag's dest equals the train() parameter of the same name, except
    # --no-fp16, which is folded into ``fp16`` below.
    option_table = [
        # Model & output
        ("--model-name", dict(type=str, default=MODEL_NAME,
                              help="Pretrained model name or path")),
        ("--output-dir", dict(type=str, default=DEFAULT_OUTPUT_DIR,
                              help="Output directory for model and artifacts")),
        # Training hyperparameters
        ("--epochs", dict(type=int, default=4,
                          help="Number of training epochs")),
        ("--batch-size", dict(type=int, default=16,
                              help="Training batch size per device")),
        ("--learning-rate", dict(type=float, default=2e-5,
                                 help="Peak learning rate")),
        ("--weight-decay", dict(type=float, default=0.01,
                                help="Weight decay for AdamW")),
        ("--warmup-ratio", dict(type=float, default=0.1,
                                help="Fraction of total steps for linear warmup")),
        ("--max-length", dict(type=int, default=MAX_LENGTH,
                              help="Maximum token sequence length")),
        ("--gradient-accumulation-steps", dict(type=int, default=1,
                                               help="Number of gradient accumulation steps")),
        # Data
        ("--examples-per-category", dict(type=int, default=80,
                                         help="Base synthetic examples per category")),
        ("--augmented-copies", dict(type=int, default=2,
                                    help="Augmented copies per base example")),
        ("--seed", dict(type=int, default=42,
                        help="Random seed for reproducibility")),
        # Training config
        ("--fp16", dict(action="store_true", default=None,
                        help="Force FP16 training")),
        ("--no-fp16", dict(action="store_true",
                           help="Disable FP16 training")),
        ("--early-stopping-patience", dict(type=int, default=3,
                                           help="Early stopping patience (0 to disable)")),
        # Hub
        ("--push-to-hub", dict(action="store_true",
                               help="Push trained model to HuggingFace Hub")),
        ("--hub-model-id", dict(type=str, default=HUB_MODEL_ID,
                                help="HuggingFace Hub model ID")),
    ]
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    train_kwargs = vars(parser.parse_args())

    # fp16 stays None (auto-detect) unless --fp16 was given; --no-fp16 wins
    # over --fp16 when both are present.
    if train_kwargs.pop("no_fp16"):
        train_kwargs["fp16"] = False

    results = train(**train_kwargs)