# File size: 7,033 Bytes

# 0b83afd

# /// script
# dependencies = [
#   "torch",
#   "transformers>=4.51.0",
#   "datasets>=3.0.0",
#   "accelerate>=1.0.0",
#   "scikit-learn>=1.4.0",
#   "trackio>=0.25.0",
#   "huggingface_hub>=0.30.0",
# ]
# ///

import os
from collections import Counter

import numpy as np
import torch
import trackio
from datasets import load_dataset
from huggingface_hub import HfApi
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainerCallback,
    TrainingArguments,
    set_seed,
)

# Hub identifiers: the source dataset, the base checkpoint to fine-tune, and
# the model repository the trained weights are pushed to.
DATASET_ID = "biglam/on_the_books"
MODEL_ID = "distilbert-base-uncased"
HUB_MODEL_ID = "evalstate/jim-crow-test2323"
# Trackio project and run names passed to trackio.init.
PROJECT = "jim-crow-law-classifier"
RUN_NAME = "distilbert-on-the-books"
# Token limit handed to the tokenizer together with truncation=True.
MAX_LENGTH = 512
# Single seed reused by set_seed, the train/eval split, and TrainingArguments.
SEED = 42

# Seed before any data shuffling or model initialisation happens.
set_seed(SEED)

# Fail fast: the token is read again at the end of the script to upload the
# training script, so a missing value would otherwise only surface after training.
if not os.environ.get("HF_TOKEN"):
    raise RuntimeError("HF_TOKEN is required so the trained model can be pushed to the Hub.")

# Start the Trackio run and attach the experiment configuration to it.
run = trackio.init(
    project=PROJECT,
    name=RUN_NAME,
    config={
        "dataset": DATASET_ID,
        "base_model": MODEL_ID,
        "hub_model_id": HUB_MODEL_ID,
        "task": "binary sequence classification: Jim Crow law identification",
        "max_length": MAX_LENGTH,
        "seed": SEED,
    },
    private=False,
    auto_log_gpu=True,
)
print(f"Trackio run: {run}")

# Load the dataset's only split and derive both label mappings from the class
# names declared on the `jim_crow` feature.
raw = load_dataset(DATASET_ID, split="train")
label_names = raw.features["jim_crow"].names
id2label = dict(enumerate(label_names))
label2id = {name: i for i, name in id2label.items()}
print(raw)
print("Label distribution:", Counter(raw["jim_crow"]))

# Stratified split because the dataset has only one split and a modest class imbalance.
splits = raw.train_test_split(test_size=0.2, seed=SEED, stratify_by_column="jim_crow")
train_ds = splits["train"]
eval_ds = splits["test"]

# Count the training labels once instead of re-scanning the column per logged value.
train_label_counts = Counter(train_ds["jim_crow"])
trackio.log({
    "data/train_examples": len(train_ds),
    "data/eval_examples": len(eval_ds),
    # NOTE(review): assumes label id 1 is the Jim Crow class and id 0 the
    # negative class -- confirm against label_names.
    "data/train_jim_crow": train_label_counts[1],
    "data/train_no_jim_crow": train_label_counts[0],
})

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def make_text(example):
    """Build the classifier input string for one record.

    The result is a one-line metadata header (source, type, chapter number,
    section number) followed by the chapter text and then the section text.

    Args:
        example: Mapping for a single record. Missing keys and ``None``
            values are rendered as empty strings; other values (including
            ``0``) are formatted as-is.

    Returns:
        The concatenated text handed to the tokenizer.

    NOTE(review): the section text comes last, so when the tokenizer truncates
    a long input it is the section text that gets cut first -- confirm this
    ordering is intended.
    """
    def field(name):
        # A key that is present but holds None must not become the text "None".
        value = example.get(name)
        return "" if value is None else value

    chapter = example.get("chapter_text") or ""
    section = example.get("section_text") or ""
    meta = (
        f"Source: {field('source')}; Type: {field('type')}; "
        f"Chapter: {field('chapter_num')}; Section: {field('section_num')}"
    )
    return meta + "\n\nChapter text:\n" + chapter + "\n\nSection text:\n" + section

def preprocess(batch):
    """Tokenize a column-oriented batch and attach its labels.

    Each row is rebuilt as a per-record mapping, rendered with ``make_text``,
    and the resulting strings are tokenized with truncation at ``MAX_LENGTH``.

    Args:
        batch: Mapping of column name to a list of values; must contain
            ``"section_text"`` (used for the row count) and ``"jim_crow"``.

    Returns:
        The tokenizer output with a ``"labels"`` entry copied from
        ``batch["jim_crow"]``.
    """
    columns = list(batch.keys())
    row_count = len(batch["section_text"])
    rendered = [
        make_text({column: batch[column][row] for column in columns})
        for row in range(row_count)
    ]
    encoded = tokenizer(rendered, truncation=True, max_length=MAX_LENGTH)
    encoded["labels"] = batch["jim_crow"]
    return encoded

# Tokenize both splits; every original column is dropped so only the
# tokenizer output and "labels" remain.
remove_cols = raw.column_names
train_tok = train_ds.map(preprocess, batched=True, remove_columns=remove_cols)
eval_tok = eval_ds.map(preprocess, batched=True, remove_columns=remove_cols)

# Balanced class weights: total / (num_classes * class_count), so the rarer
# class contributes more to the loss. The class count comes from the label
# list rather than a literal so the divisor always matches the loop range.
counts = Counter(train_ds["jim_crow"])
total = sum(counts.values())
num_labels = len(label_names)
class_weights = torch.tensor(
    [total / (num_labels * counts[i]) for i in range(num_labels)],
    dtype=torch.float,
)
print("Class weights:", class_weights.tolist())

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run. It may be a wrapper around the underlying
                model rather than the bare model itself.
            inputs: Batch mapping; its ``"labels"`` entry is removed before
                the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Extra arguments supplied by the caller; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weights = class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        # Read the class dimension from the logits instead of model.config:
        # a wrapped model (e.g. nn.DataParallel) does not expose `config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

class TrackioCallback(TrainerCallback):
    """Forward the numeric entries of Trainer logs and eval metrics to Trackio."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log numeric values from ``logs`` under ``trainer/`` at the current global step."""
        if not logs:
            return
        payload = {}
        for key, value in logs.items():
            if isinstance(value, (int, float)):
                payload[f"trainer/{key}"] = value
        trackio.log(payload, step=state.global_step)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log numeric values from ``metrics`` under ``eval/`` at the current global step."""
        if not metrics:
            return
        payload = {}
        for key, value in metrics.items():
            if isinstance(value, (int, float)):
                payload[f"eval/{key}"] = value
        trackio.log(payload, step=state.global_step)

def compute_metrics(eval_pred):
    """Compute classification metrics from an ``(logits, labels)`` pair.

    Predictions are the argmax over the last axis of the logits. Precision,
    recall and F1 are reported for positive label 1 and as macro averages,
    along with accuracy and the four confusion-matrix cells for labels 0 and 1.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``.

    Returns:
        Dict of metric name to value; confusion-matrix counts are plain ints.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    pos_precision, pos_recall, pos_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="binary", pos_label=1, zero_division=0
    )
    avg_precision, avg_recall, avg_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    accuracy = accuracy_score(labels, predicted)
    # Row-major flattening of the 2x2 matrix yields tn, fp, fn, tp in that order.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(
        labels, predicted, labels=[0, 1]
    ).ravel()

    return {
        "accuracy": accuracy,
        "precision": pos_precision,
        "recall": pos_recall,
        "f1": pos_f1,
        "macro_precision": avg_precision,
        "macro_recall": avg_recall,
        "macro_f1": avg_f1,
        "tn": int(true_neg),
        "fp": int(false_pos),
        "fn": int(false_neg),
        "tp": int(true_pos),
    }

# Training configuration: evaluate and checkpoint once per epoch, keep at most
# two checkpoints, reload the one with the highest "f1" when training ends,
# and push to the Hub repository named by HUB_MODEL_ID.
args = TrainingArguments(
    output_dir="jim-crow-test2323",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_private_repo=False,
    report_to=[],  # no reporting integrations requested here; TrackioCallback does the logging
    run_name=RUN_NAME,
    seed=SEED,
)

# Assemble the class-weighted trainer from the model, the tokenized splits,
# a padding collator built on the tokenizer, the metric function, and the
# Trackio logging callback.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[TrackioCallback()],
)

# Train, then evaluate once more and record the numeric results under "final/".
trainer.train()
metrics = trainer.evaluate()
print("Final eval metrics:", metrics)
trackio.log({f"final/{k}": v for k, v in metrics.items() if isinstance(v, (int, float))})
# Close the run explicitly so the logged values are finalised before the uploads below.
trackio.finish()

# Ensure useful metadata and a model card are present on the final Hub repo.
trainer.save_model()
tokenizer.save_pretrained(args.output_dir)
# Trainer.push_to_hub regenerates the model card from the keyword arguments it
# receives, so the card metadata is passed here; a card written by a separate
# create_model_card call beforehand would be overwritten by this push.
trainer.push_to_hub(
    commit_message="Fine-tune DistilBERT to identify Jim Crow laws",
    model_name="Jim Crow law classifier",
    dataset_tags=DATASET_ID,
    finetuned_from=MODEL_ID,
    tasks="text-classification",
    language="en",
    tags=["legal", "history", "jim-crow", "sequence-classification", "distilbert"],
)

# Upload this script next to the model so the training run can be reproduced.
api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_file(
    path_or_fileobj=__file__,
    path_in_repo="training_script.py",
    repo_id=HUB_MODEL_ID,
    repo_type="model",
    commit_message="Add training script",
)
print(f"Pushed trained model to https://huggingface.co/{HUB_MODEL_ID}")