File size: 4,686 Bytes
644a42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from __future__ import annotations

import argparse
import json
import random
from pathlib import Path

import torch
from huggingface_hub import HfApi
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Routing classes; a label's position in this list is its integer class id.
LABELS = [
    "ingredients",
    "nutrition",
    "license",
    "dates",
    "refuse_absolute",
]
# Reverse lookup: label name -> class id.
LABEL_TO_ID = dict(zip(LABELS, range(len(LABELS))))


class RouterDataset(Dataset):
    """Map-style dataset that tokenizes routing records on access.

    Each record is a mapping with a ``"text"`` entry, which is handed to the
    tokenizer, and a ``"label"`` entry, which is looked up in ``LABEL_TO_ID``.

    Args:
        records: Sized, indexable collection of record mappings.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``. Its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every example is padded or truncated to. Defaults
            to 32, the value that used to be hard-coded.
    """

    def __init__(self, records, tokenizer, max_length: int = 32):
        self.records = records
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.records)

    def __getitem__(self, index):
        """Tokenize record ``index`` and return its model inputs and label.

        Returns:
            Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.

        Raises:
            KeyError: If the record's label is not a key of ``LABEL_TO_ID``.
        """
        record = self.records[index]
        encoded = self.tokenizer(
            record["text"],
            # Fixed-length padding lets the default collate function stack
            # examples without a custom collator.
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes the size-1 leading dimension so the
            # DataLoader can add the real batch dimension itself.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(LABEL_TO_ID[record["label"]]),
        }


def evaluate(model, loader, device):
    """Return the fraction of examples in ``loader`` that ``model`` gets right.

    The model is switched to evaluation mode (and left in it) and gradient
    tracking is disabled for the duration of the pass.

    Args:
        model: Module called with each batch's non-label entries as keyword
            arguments; the result must expose a ``logits`` attribute.
        loader: Iterable of dict batches, each holding a ``"labels"`` tensor
            alongside the model inputs. Batches are not modified.
        device: Device that inputs and labels are moved to.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when the loader yields no
        examples (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for batch in loader:
            labels = batch["labels"].to(device)
            # Build the inputs without popping from ``batch`` so a reusable
            # collection of batches can be evaluated more than once.
            inputs = {
                key: value.to(device)
                for key, value in batch.items()
                if key != "labels"
            }
            logits = model(**inputs).logits
            correct += (logits.argmax(dim=-1) == labels).sum().item()
            total += labels.numel()
    if total == 0:
        return 0.0
    return correct / total


def _load_records(path):
    """Parse one JSON object per non-blank line of the JSONL file at ``path``."""
    # Iterating the file splits on newlines only. ``str.splitlines`` would
    # also split on separators such as U+2028 that may legally appear inside
    # a JSON string, cutting a record in half.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _split_records(records):
    """Hold out one randomly chosen record per label; the rest is training data."""
    grouped = {label: [] for label in LABELS}
    for record in records:
        # A label outside LABELS raises KeyError here.
        grouped[record["label"]].append(record)

    missing = [label for label, group in grouped.items() if not group]
    if missing:
        raise ValueError(f"no training records for label(s): {', '.join(missing)}")

    # Shuffle order is kept identical to the original script so a given seed
    # produces the same split.
    for group in grouped.values():
        random.shuffle(group)
    validation = [group.pop() for group in grouped.values()]
    training = [record for group in grouped.values() for record in group]
    if not training:
        raise ValueError("every label has a single record; nothing left to train on")
    random.shuffle(training)
    return training, validation


def main():
    """Fine-tune the evidence router, save it locally and upload it to the Hub.

    Command-line options:
        --repo-id: Hub model repository to create (private) and upload to.
        --base-model: Checkpoint passed to ``from_pretrained``.
        --epochs: Number of passes over the training split.

    Reads ``data/router_training.jsonl`` under ``ROOT``, writes the model,
    tokenizer and a generated ``README.md`` to ``router_model`` under ``ROOT``,
    then uploads that folder.

    Raises:
        ValueError: If a label has no records or no training data remains
            after the validation hold-out.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo-id", default="build-small-hackathon/packetcourt-evidence-router")
    parser.add_argument("--base-model", default="google/bert_uncased_L-2_H-128_A-2")
    parser.add_argument("--epochs", type=int, default=30)
    args = parser.parse_args()

    # Fixed seeds make the split and the training run repeatable.
    random.seed(42)
    torch.manual_seed(42)
    records = _load_records(ROOT / "data/router_training.jsonl")
    training, validation = _split_records(records)

    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.base_model,
        num_labels=len(LABELS),
        id2label={index: label for index, label in enumerate(LABELS)},
        label2id=LABEL_TO_ID,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    train_loader = DataLoader(RouterDataset(training, tokenizer), batch_size=8, shuffle=True)
    validation_loader = DataLoader(RouterDataset(validation, tokenizer), batch_size=5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

    for epoch in range(args.epochs):
        # evaluate() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            labels = batch.pop("labels").to(device)
            loss = model(**{key: value.to(device) for key, value in batch.items()}, labels=labels).loss
            loss.backward()
            optimizer.step()
        print(f"epoch={epoch + 1} validation_accuracy={evaluate(model, validation_loader, device):.3f}")

    output = ROOT / "router_model"
    model.save_pretrained(output)
    tokenizer.save_pretrained(output)
    score = evaluate(model, validation_loader, device)
    parameter_count = sum(parameter.numel() for parameter in model.parameters())
    card = f"""---
license: apache-2.0
base_model: {args.base_model}
tags:
- text-classification
- build-small-hackathon
- packetcourt
- fine-tuned
---

# PacketCourt Evidence Router

A {parameter_count:,}-parameter fine-tuned classifier used by
PacketCourt's investigation agent to choose the next evidence tool for a packet claim.

Labels: `{", ".join(LABELS)}`.

Held-out validation accuracy: `{score:.3f}` on a small PacketCourt-specific routing set.
The router proposes an investigation tool; deterministic code remains responsible for final verdicts.
"""
    # Explicit encoding keeps the card identical across platforms.
    (output / "README.md").write_text(card, encoding="utf-8")

    api = HfApi()
    api.create_repo(args.repo_id, repo_type="model", private=True, exist_ok=True)
    api.upload_folder(
        repo_id=args.repo_id,
        repo_type="model",
        folder_path=output,
        commit_message="feat: publish PacketCourt fine-tuned evidence router",
    )
    print(f"published={args.repo_id} validation_accuracy={score:.3f}")


if __name__ == "__main__":
    main()