# insureos-models / doc_classifier.py
# Uploaded via huggingface_hub by piyushptiwari (revision 2cc32a5, verified).
"""
InsureOS β€” Document Classifier Training
Fine-tunes ModernBERT (or a fallback BERT-base) for 12-class insurance document classification.
"""
import os
import json
import argparse
from pathlib import Path
import torch
import numpy as np
from datasets import Dataset
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer,
)
from sklearn.metrics import accuracy_score, f1_score, classification_report
from data.constants import DOCUMENT_TYPES
# ── Defaults ──
# ModernBERT is preferred; fall back to bert-base if unavailable
MODEL_NAME = "answerdotai/ModernBERT-base"  # primary HF checkpoint
FALLBACK_MODEL = "google-bert/bert-base-uncased"  # used when ModernBERT can't be loaded
DATA_PATH = "data/output/insurance_docs_10k.jsonl"  # one JSON object per line: {"text", "label_id"}
OUTPUT_DIR = "models/doc-classifier"  # where model, tokenizer, and training_meta.json are written
MAX_LEN = 512  # tokenizer truncation / fixed-padding length, in tokens
EPOCHS = 5  # default training epochs (overridable via --epochs)
BATCH_SIZE = 16  # per-device train batch size (eval batches are 2x this)
LR = 2e-5  # default learning rate (overridable via --lr)
WARMUP_RATIO = 0.1  # fraction of total steps used for LR warmup
EVAL_SPLIT = 0.1  # held-out fraction for the stratified eval split
LABELS = DOCUMENT_TYPES # 12 classes
def load_data(path: str) -> Dataset:
    """Load a JSONL file of classification records into a HF Dataset.

    Each non-blank line must be a JSON object carrying at least "text" and
    "label_id". Blank lines (e.g. a trailing newline) are skipped — previously
    they crashed ``json.loads("")``. The file is read as UTF-8 explicitly so
    behavior does not depend on the platform's locale encoding.

    Args:
        path: Path to the JSONL data file.

    Returns:
        A ``datasets.Dataset`` with "text" (str) and "label" (int) columns.

    Raises:
        KeyError: If a record lacks "text" or "label_id".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank / trailing lines
                continue
            obj = json.loads(line)
            records.append({
                "text": obj["text"],
                "label": obj["label_id"],
            })
    return Dataset.from_list(records)
def compute_metrics(pred):
    """Return accuracy plus macro and weighted F1 for a Trainer EvalPrediction."""
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1_macro": f1_score(y_true, y_hat, average="macro"),
        "f1_weighted": f1_score(y_true, y_hat, average="weighted"),
    }
def main():
    """CLI entry point: fine-tune a 12-class insurance document classifier.

    Pipeline: parse args -> load tokenizer/model (with BERT-base fallback) ->
    load & tokenize JSONL data -> stratified train/eval split -> train with
    the HF Trainer -> final evaluation + per-class report -> save model,
    tokenizer, and a training_meta.json with labels and metrics.
    """
    parser = argparse.ArgumentParser(description="Train document classifier")
    parser.add_argument("--model-name", default=MODEL_NAME)
    parser.add_argument("--data-path", default=DATA_PATH)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--epochs", type=int, default=EPOCHS)
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--lr", type=float, default=LR)
    args = parser.parse_args()

    print(f"{'='*60}")
    print(f" InsureOS β€” Document Classifier Training")
    print(f" Model: {args.model_name}")
    print(f" Classes: {len(LABELS)}")
    print(f"{'='*60}\n")

    # ── 1. Load tokenizer & model ──
    print("[1/4] Loading model and tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name,
            num_labels=len(LABELS),
            # Wire label names into the config so saved checkpoints map
            # predicted ids to readable labels at inference time.
            id2label={i: l for i, l in enumerate(LABELS)},
            label2id={l: i for i, l in enumerate(LABELS)},
        )
    except Exception:
        # Broad catch is intentional: any load failure (missing model class,
        # network, version) falls back to the widely-available BERT-base.
        print(f" ⚠ {args.model_name} unavailable, falling back to {FALLBACK_MODEL}")
        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(
            FALLBACK_MODEL,
            num_labels=len(LABELS),
            id2label={i: l for i, l in enumerate(LABELS)},
            label2id={l: i for i, l in enumerate(LABELS)},
        )

    # ── 2. Load & tokenize data ──
    print("[2/4] Loading data...")
    dataset = load_data(args.data_path)
    print(f" Total: {len(dataset)}")

    def tokenize_fn(examples):
        # Fixed-length padding keeps batches rectangular without a data collator.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_LEN,
            padding="max_length",
        )

    dataset = dataset.map(tokenize_fn, batched=True)
    # Cast "label" to ClassLabel — required for stratify_by_column below.
    dataset = dataset.class_encode_column("label")
    split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42, stratify_by_column="label")
    train_ds = split["train"]
    eval_ds = split["test"]
    print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")

    # ── 3. Training ──
    print("[3/4] Training...")
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size * 2,  # no grads at eval, so a larger batch fits
        learning_rate=args.lr,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,  # keep only the 2 most recent checkpoints on disk
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",  # macro F1 weighs rare classes equally
        greater_is_better=True,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        report_to="none",
        logging_steps=50,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # ── 4. Evaluate & save ──
    print("[4/4] Final evaluation...")
    results = trainer.evaluate()
    print(f" Accuracy: {results['eval_accuracy']:.4f}")
    print(f" F1 (macro): {results['eval_f1_macro']:.4f}")
    print(f" F1 (weighted): {results['eval_f1_weighted']:.4f}")

    # Detailed per-class precision/recall/F1 on the held-out split.
    preds = trainer.predict(eval_ds)
    y_pred = np.argmax(preds.predictions, axis=-1)
    y_true = preds.label_ids
    report = classification_report(y_true, y_pred, target_names=LABELS)
    print(f"\n{report}")

    # Save weights + tokenizer so the output dir is directly loadable
    # via from_pretrained().
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Persist label map and final metrics alongside the model for provenance.
    meta = {
        "labels": LABELS,
        "id2label": {i: l for i, l in enumerate(LABELS)},
        "results": {k: float(v) for k, v in results.items()},
    }
    with open(os.path.join(args.output_dir, "training_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)
    print(f"\nβœ“ Document classifier saved β†’ {args.output_dir}")
# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()