| """ |
| InsureOS β Document Classifier Training |
| Fine-tunes ModernBERT (or a fallback BERT-base) for 12-class insurance document classification. |
| """ |
|
|
| import os |
| import json |
| import argparse |
| from pathlib import Path |
|
|
import torch
import numpy as np
from datasets import ClassLabel, Dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from data.constants import DOCUMENT_TYPES
|
|
|
|
| |
|
|
| |
# ---- Training configuration defaults (each overridable via the CLI below) ----
MODEL_NAME = "answerdotai/ModernBERT-base"  # preferred encoder; main() falls back if loading fails
FALLBACK_MODEL = "google-bert/bert-base-uncased"  # used when MODEL_NAME cannot be loaded
DATA_PATH = "data/output/insurance_docs_10k.jsonl"  # one JSON object per line: {"text", "label_id"}
OUTPUT_DIR = "models/doc-classifier"  # fine-tuned model + tokenizer + metadata land here
MAX_LEN = 512  # tokenizer truncation / padding length
EPOCHS = 5
BATCH_SIZE = 16
LR = 2e-5  # typical fine-tuning learning rate for BERT-family encoders
WARMUP_RATIO = 0.1  # fraction of steps spent warming up the cosine LR schedule
EVAL_SPLIT = 0.1  # held-out fraction for evaluation
LABELS = DOCUMENT_TYPES  # class names; index in this list == label id fed to the model
|
|
|
|
def load_data(path: str) -> Dataset:
    """Load a JSONL file of {"text", "label_id"} records into a ``Dataset``.

    Each non-empty line must be a JSON object with a ``"text"`` field and an
    integer ``"label_id"`` field. Blank lines (e.g. a trailing newline at the
    end of the file) are skipped instead of crashing ``json.loads``.
    """
    records = []
    # Explicit UTF-8 so loading does not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank / trailing lines
                continue
            obj = json.loads(line)
            records.append({
                "text": obj["text"],
                "label": obj["label_id"],
            })
    return Dataset.from_list(records)
|
|
|
|
def compute_metrics(pred):
    """Return accuracy plus macro/weighted F1 for a Trainer ``EvalPrediction``."""
    y_true = pred.label_ids
    # Logits -> predicted class ids via argmax over the last axis.
    y_hat = np.argmax(pred.predictions, axis=-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1_macro": f1_score(y_true, y_hat, average="macro"),
        "f1_weighted": f1_score(y_true, y_hat, average="weighted"),
    }
|
|
|
|
def main():
    """CLI entry point: fine-tune a sequence classifier on insurance documents.

    Steps: load model/tokenizer (with a fallback model), load and tokenize the
    JSONL dataset, train with the HF ``Trainer``, report final metrics, and
    save the model, tokenizer, and a ``training_meta.json`` sidecar.
    """
    parser = argparse.ArgumentParser(description="Train document classifier")
    parser.add_argument("--model-name", default=MODEL_NAME)
    parser.add_argument("--data-path", default=DATA_PATH)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--epochs", type=int, default=EPOCHS)
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--lr", type=float, default=LR)
    args = parser.parse_args()

    print(f"{'='*60}")
    print(f" InsureOS β Document Classifier Training")
    print(f" Model: {args.model_name}")
    print(f" Classes: {len(LABELS)}")
    print(f"{'='*60}\n")

    # Label <-> id maps, built once and shared by both model-load paths so the
    # primary and fallback models cannot drift apart.
    id2label = {i: l for i, l in enumerate(LABELS)}
    label2id = {l: i for i, l in enumerate(LABELS)}

    print("[1/4] Loading model and tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name,
            num_labels=len(LABELS),
            id2label=id2label,
            label2id=label2id,
        )
    except Exception:
        # Broad on purpose: any load failure (missing weights, no network,
        # incompatible transformers version) falls back to plain BERT.
        print(f" β {args.model_name} unavailable, falling back to {FALLBACK_MODEL}")
        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(
            FALLBACK_MODEL,
            num_labels=len(LABELS),
            id2label=id2label,
            label2id=label2id,
        )

    print("[2/4] Loading data...")
    dataset = load_data(args.data_path)
    print(f" Total: {len(dataset)}")

    def tokenize_fn(examples):
        # Fixed-length padding keeps batches rectangular without a data collator.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_LEN,
            padding="max_length",
        )

    dataset = dataset.map(tokenize_fn, batched=True)
    # BUGFIX: class_encode_column() re-encodes by sorting the *stringified*
    # label values, so with 12 classes "10" < "2" and label ids get scrambled
    # relative to LABELS and the model's id2label. Cast to an explicit
    # ClassLabel instead: id i always means LABELS[i], and the column is still
    # a ClassLabel as required by stratify_by_column below.
    dataset = dataset.cast_column("label", ClassLabel(names=LABELS))
    split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42, stratify_by_column="label")
    train_ds = split["train"]
    eval_ds = split["test"]
    print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")

    print("[3/4] Training...")
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size * 2,  # no grads at eval, larger batches fit
        learning_rate=args.lr,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",  # macro F1 guards against class imbalance
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    print("[4/4] Final evaluation...")
    results = trainer.evaluate()
    print(f" Accuracy: {results['eval_accuracy']:.4f}")
    print(f" F1 (macro): {results['eval_f1_macro']:.4f}")
    print(f" F1 (weighted): {results['eval_f1_weighted']:.4f}")

    # Per-class breakdown on the eval split. Pass explicit label ids so
    # target_names stays aligned even if a class happens to be absent from
    # the (small) eval split.
    preds = trainer.predict(eval_ds)
    y_pred = np.argmax(preds.predictions, axis=-1)
    y_true = preds.label_ids
    report = classification_report(
        y_true, y_pred, labels=list(range(len(LABELS))), target_names=LABELS
    )
    print(f"\n{report}")

    # Save model + tokenizer together so the output directory is self-contained.
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Sidecar metadata for downstream consumers (labels + final metrics).
    meta = {
        "labels": LABELS,
        "id2label": id2label,  # JSON serializes the int keys as strings
        "results": {k: float(v) for k, v in results.items()},
    }
    with open(os.path.join(args.output_dir, "training_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\nβ Document classifier saved β {args.output_dir}")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|