|
|
""" |
|
|
Resume Section Classifier – Training Script |
|
|
|
|
|
Fine-tunes distilbert-base-uncased for classifying resume text sections |
|
|
into 8 categories: education, experience, skills, projects, summary, |
|
|
certifications, contact, awards. |
|
|
|
|
|
Author: Lorenzo Scaturchio (gr8monk3ys) |
|
|
|
|
|
Usage: |
|
|
python train.py # Train with defaults |
|
|
python train.py --epochs 5 --batch-size 32 |
|
|
python train.py --push-to-hub # Push to HuggingFace Hub |
|
|
python train.py --output-dir ./my_model |
|
|
""" |
|
|
|
|
|
import json |
|
|
import logging |
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
import evaluate |
|
|
import numpy as np |
|
|
import torch |
|
|
from datasets import DatasetDict |
|
|
from transformers import ( |
|
|
AutoModelForSequenceClassification, |
|
|
AutoTokenizer, |
|
|
DataCollatorWithPadding, |
|
|
EarlyStoppingCallback, |
|
|
Trainer, |
|
|
TrainingArguments, |
|
|
) |
|
|
|
|
|
from data_generator import generate_dataset, get_label_mapping, load_as_hf_dataset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Send every log record at INFO level and above to stdout with a timestamped prefix.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


# Pretrained checkpoint used when no model name is supplied.
MODEL_NAME = "distilbert-base-uncased"
# Default directory for checkpoints and the final model artifacts.
DEFAULT_OUTPUT_DIR = "./model_output"
# Directory handed to TrainingArguments as `logging_dir`.
DEFAULT_LOGGING_DIR = "./logs"
# Default HuggingFace Hub repository for the trained model.
HUB_MODEL_ID = "gr8monk3ys/resume-section-classifier"
# Default maximum token sequence length (longer inputs are truncated).
MAX_LENGTH = 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_compute_metrics(id2label: dict):
    """Create the metrics callback handed to the Trainer.

    The four ``evaluate`` metrics are loaded once, up front, so that every
    evaluation pass reuses the same metric objects.

    Args:
        id2label: Mapping from class id to label name. Accepted for interface
            compatibility; the returned callback does not read it.

    Returns:
        A function taking an ``(logits, labels)`` pair and returning a dict
        with the keys ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision`` and ``recall``.
    """
    scorers = {
        metric_name: evaluate.load(metric_name)
        for metric_name in ("accuracy", "f1", "precision", "recall")
    }

    def compute_metrics(eval_pred):
        scores, gold = eval_pred
        # The predicted class is the index of the largest logit.
        predicted = np.argmax(scores, axis=-1)

        def score(metric_name, **options):
            # Each metric reports its value under its own name.
            outcome = scorers[metric_name].compute(
                predictions=predicted, references=gold, **options
            )
            return outcome[metric_name]

        return {
            "accuracy": score("accuracy"),
            "f1_macro": score("f1", average="macro"),
            "f1_weighted": score("f1", average="weighted"),
            "precision": score("precision", average="weighted"),
            "recall": score("recall", average="weighted"),
        }

    return compute_metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tokenize_dataset(dataset_dict: DatasetDict, tokenizer, label2id: dict, max_length: int = MAX_LENGTH):
    """Tokenize every split and replace string labels with integer ids.

    Args:
        dataset_dict: Splits whose rows carry a ``text`` and a ``label`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        label2id: Mapping from label name to integer class id.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The mapped dataset with the ``text`` and ``label`` columns removed and
        a ``labels`` column added alongside the tokenizer outputs.
    """

    def encode_batch(batch):
        # No padding here: sequences stay at their own length after truncation.
        encoded = tokenizer(
            batch["text"],
            truncation=True,
            max_length=max_length,
            padding=False,
        )
        encoded["labels"] = [label2id[name] for name in batch["label"]]
        return encoded

    return dataset_dict.map(
        encode_batch,
        batched=True,
        remove_columns=["text", "label"],
        desc="Tokenizing",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_json(path, payload):
    """Write *payload* to *path* as indented, UTF-8 encoded JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def train(
    output_dir: str = DEFAULT_OUTPUT_DIR,
    model_name: str = MODEL_NAME,
    epochs: int = 4,
    batch_size: int = 16,
    learning_rate: float = 2e-5,
    weight_decay: float = 0.01,
    warmup_ratio: float = 0.1,
    max_length: int = MAX_LENGTH,
    examples_per_category: int = 80,
    augmented_copies: int = 2,
    seed: int = 42,
    push_to_hub: bool = False,
    hub_model_id: str = HUB_MODEL_ID,
    fp16: bool = None,
    gradient_accumulation_steps: int = 1,
    early_stopping_patience: int = 3,
):
    """
    Full training pipeline.

    Generates the synthetic dataset, tokenizes it, fine-tunes the model,
    evaluates on the test split, writes the model plus JSON artifacts to
    ``<output_dir>/final_model`` and optionally uploads to the Hub.

    Args:
        output_dir: Directory to save model and artifacts.
        model_name: Pretrained model identifier.
        epochs: Number of training epochs.
        batch_size: Training batch size (evaluation uses twice this value).
        learning_rate: Peak learning rate.
        weight_decay: Weight decay for AdamW.
        warmup_ratio: Fraction of steps for warmup.
        max_length: Maximum token sequence length.
        examples_per_category: Base synthetic examples per category.
        augmented_copies: Augmented copies per base example.
        seed: Random seed.
        push_to_hub: Whether to push to HuggingFace Hub.
        hub_model_id: Hub model repository ID.
        fp16: Use mixed precision (auto-detected from CUDA availability if None).
        gradient_accumulation_steps: Gradient accumulation steps.
        early_stopping_patience: Early stopping patience (epochs); values
            of 0 or less disable early stopping.

    Returns:
        The metrics dict produced by evaluating on the test split.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Mixed precision defaults to on exactly when a CUDA device is present.
    if fp16 is None:
        fp16 = torch.cuda.is_available()

    logger.info("=" * 60)
    logger.info("Resume Section Classifier – Training")
    logger.info("=" * 60)
    logger.info(f"Model: {model_name}")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Epochs: {epochs}, Batch size: {batch_size}, LR: {learning_rate}")
    logger.info(f"Device: {'CUDA' if torch.cuda.is_available() else 'MPS' if torch.backends.mps.is_available() else 'CPU'}")
    logger.info(f"FP16: {fp16}")

    # ------------------------------------------------------------------
    # 1. Data
    # ------------------------------------------------------------------
    logger.info("\n[1/5] Generating synthetic training data...")
    raw_dataset = generate_dataset(
        examples_per_category=examples_per_category,
        augmented_copies=augmented_copies,
        seed=seed,
    )
    label2id, id2label = get_label_mapping(raw_dataset)
    num_labels = len(label2id)

    logger.info(f" Total examples: {len(raw_dataset)}")
    logger.info(f" Labels ({num_labels}): {list(label2id.keys())}")

    dataset_dict = load_as_hf_dataset(raw_dataset)
    logger.info(f" Train: {len(dataset_dict['train'])}")
    logger.info(f" Validation: {len(dataset_dict['validation'])}")
    logger.info(f" Test: {len(dataset_dict['test'])}")

    # ------------------------------------------------------------------
    # 2. Tokenization
    # ------------------------------------------------------------------
    logger.info("\n[2/5] Loading tokenizer and tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = tokenize_dataset(dataset_dict, tokenizer, label2id, max_length)
    # Tokenization leaves sequences unpadded; the collator pads each batch.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # ------------------------------------------------------------------
    # 3. Model
    # ------------------------------------------------------------------
    logger.info("\n[3/5] Loading pretrained model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    logger.info(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f" Trainable: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ------------------------------------------------------------------
    # 4. Training
    # ------------------------------------------------------------------
    logger.info("\n[4/5] Training...")

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,

        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type="cosine",

        # Evaluate and checkpoint once per epoch so the best checkpoint
        # (by macro F1) can be restored when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,

        logging_dir=DEFAULT_LOGGING_DIR,
        logging_strategy="steps",
        logging_steps=50,
        report_to="none",

        fp16=fp16,
        dataloader_num_workers=0,

        seed=seed,
        data_seed=seed,

        # Automatic pushing stays off: the single upload happens explicitly
        # after evaluation below. The target repo is still registered here
        # because Trainer.push_to_hub() resolves its destination from
        # `args.hub_model_id` and does not accept a repo id argument.
        push_to_hub=False,
        hub_model_id=hub_model_id if push_to_hub else None,

        save_total_limit=3,
        disable_tqdm=False,
    )

    callbacks = []
    if early_stopping_patience > 0:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))

    # NOTE(review): newer transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=` — confirm against the pinned version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=build_compute_metrics(id2label),
        callbacks=callbacks,
    )

    train_result = trainer.train()

    logger.info("\nTraining Results:")
    for key, value in train_result.metrics.items():
        logger.info(f" {key}: {value}")

    # ------------------------------------------------------------------
    # 5. Evaluation
    # ------------------------------------------------------------------
    logger.info("\n[5/5] Evaluating on test set...")
    test_results = trainer.evaluate(tokenized_dataset["test"])

    logger.info("\nTest Results:")
    for key, value in test_results.items():
        logger.info(f" {key}: {value:.4f}" if isinstance(value, float) else f" {key}: {value}")

    # ------------------------------------------------------------------
    # Save artifacts
    # ------------------------------------------------------------------
    logger.info("\nSaving model and artifacts...")

    final_path = output_path / "final_model"
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    # JSON object keys must be strings, so the integer class ids are stringified.
    _write_json(
        final_path / "label_mapping.json",
        {
            "label2id": label2id,
            "id2label": {str(k): v for k, v in id2label.items()},
            "labels": list(label2id.keys()),
        },
    )

    _write_json(
        final_path / "training_config.json",
        {
            "model_name": model_name,
            "max_length": max_length,
            "epochs": epochs,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "warmup_ratio": warmup_ratio,
            "examples_per_category": examples_per_category,
            "augmented_copies": augmented_copies,
            "seed": seed,
            "num_labels": num_labels,
            "train_size": len(dataset_dict["train"]),
            "val_size": len(dataset_dict["validation"]),
            "test_size": len(dataset_dict["test"]),
        },
    )

    _write_json(
        final_path / "metrics.json",
        {
            "train": train_result.metrics,
            "test": test_results,
        },
    )

    logger.info(f"\nAll artifacts saved to: {final_path}")

    # ------------------------------------------------------------------
    # Optional Hub upload
    # ------------------------------------------------------------------
    if push_to_hub:
        logger.info(f"\nPushing to HuggingFace Hub: {hub_model_id}")
        # Best effort by design: a failed upload must not discard a finished
        # training run, so the error is logged and manual steps are printed.
        try:
            # The destination repo comes from `training_args.hub_model_id`.
            # Extra keyword arguments would be forwarded to the model-card
            # builder, so a `repo_id=` argument here raises TypeError.
            trainer.push_to_hub(
                commit_message="Upload fine-tuned resume section classifier",
            )
            tokenizer.push_to_hub(hub_model_id)
            logger.info("Successfully pushed to Hub!")
        except Exception as e:
            logger.error(f"Failed to push to Hub: {e}")
            logger.info("You can push manually later with:")
            logger.info(f" huggingface-cli upload {hub_model_id} {final_path}")

    logger.info("\nTraining complete!")
    return test_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Fine-tune DistilBERT for resume section classification",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Value-taking options, listed in the order they appear in --help:
    # model/output, training hyperparameters, then data generation.
    valued_options = (
        ("--model-name", str, MODEL_NAME, "Pretrained model name or path"),
        ("--output-dir", str, DEFAULT_OUTPUT_DIR, "Output directory for model and artifacts"),
        ("--epochs", int, 4, "Number of training epochs"),
        ("--batch-size", int, 16, "Training batch size per device"),
        ("--learning-rate", float, 2e-5, "Peak learning rate"),
        ("--weight-decay", float, 0.01, "Weight decay for AdamW"),
        ("--warmup-ratio", float, 0.1, "Fraction of total steps for linear warmup"),
        ("--max-length", int, MAX_LENGTH, "Maximum token sequence length"),
        ("--gradient-accumulation-steps", int, 1, "Number of gradient accumulation steps"),
        ("--examples-per-category", int, 80, "Base synthetic examples per category"),
        ("--augmented-copies", int, 2, "Augmented copies per base example"),
        ("--seed", int, 42, "Random seed for reproducibility"),
    )
    for flag, caster, fallback, blurb in valued_options:
        cli.add_argument(flag, type=caster, default=fallback, help=blurb)

    # Precision and early-stopping controls. --fp16 defaults to None so that
    # train() can auto-detect when neither precision flag is given.
    cli.add_argument("--fp16", action="store_true", default=None,
                     help="Force FP16 training")
    cli.add_argument("--no-fp16", action="store_true",
                     help="Disable FP16 training")
    cli.add_argument("--early-stopping-patience", type=int, default=3,
                     help="Early stopping patience (0 to disable)")

    # Hub upload options.
    cli.add_argument("--push-to-hub", action="store_true",
                     help="Push trained model to HuggingFace Hub")
    cli.add_argument("--hub-model-id", type=str, default=HUB_MODEL_ID,
                     help="HuggingFace Hub model ID")

    # Every remaining option name matches a train() keyword one-to-one once
    # the helper flag --no-fp16 has been folded into `fp16`.
    options = dict(vars(cli.parse_args()))
    if options.pop("no_fp16"):
        # --no-fp16 takes precedence over --fp16 when both are supplied.
        options["fp16"] = False

    results = train(**options)
|
|
|