File size: 6,820 Bytes
469ef7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""Finalize the intent classifier from the existing best checkpoint.

We stopped training at end of epoch 3/5 with best eval_f1_macro=0.9402.
Rather than resuming training (the checkpoint has no optimizer state because
train_intent.py used save_only_model=True), we accept the epoch-3 model as
final, run TEST evaluation, and save:

  models/intent_classifier/                   (best model + tokenizer + labels.json)
  models/intent_classifier/eval_results.json  (test metrics + per-language breakdown)

Usage:
  python src/finalize_intent.py
"""

from __future__ import annotations

import os
# Configure the CUDA caching allocator before `torch` is imported further down.
# `setdefault` leaves any value already exported in the environment untouched.
# NOTE(review): presumably chosen to reduce fragmentation-related OOMs during
# evaluation -- confirm against the training script's settings.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import json
import shutil
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score,
    precision_score, recall_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Repository root: this file's directory is one level below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Processed intent dataset (loaded with `load_from_disk`) and its label map.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "intent")
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Final model directory, and the training checkpoint that is promoted into it.
OUT_DIR = PROJECT_ROOT.joinpath("models", "intent_classifier")
CHECKPOINT_DIR = OUT_DIR.joinpath("runs", "checkpoint-3336")

# Maximum number of tokens per example; longer inputs are truncated.
MAX_LENGTH = 128


def _compute_metrics(eval_pred) -> dict[str, float]:
    """Compute aggregate classification metrics for one prediction pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by ``Trainer``.
            When ``logits`` is a tuple, only its first element is used.

    Returns:
        Accuracy, weighted F1/precision/recall and macro F1.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "recall": recall_score(labels, preds, average="weighted", zero_division=0),
    }


def _per_language_metrics(languages, true_ids, pred_ids) -> dict[str, dict[str, float]]:
    """Break accuracy and F1 down by the language of each test example.

    Args:
        languages: One language tag per example, aligned with the id arrays.
        true_ids: NumPy array of gold label ids.
        pred_ids: NumPy array of predicted label ids.

    Returns:
        Mapping from language tag to ``n``, ``accuracy``, ``f1_weighted`` and
        ``f1_macro``, with languages in sorted order.
    """
    tags = list(languages)
    # Object dtype keeps plain Python equality semantics while still letting
    # the mask for each language be built with a single vectorized comparison.
    tag_array = np.array(tags, dtype=object)
    breakdown: dict[str, dict[str, float]] = {}
    for lang in sorted(set(tags)):
        mask = tag_array == lang
        lang_true = true_ids[mask]
        lang_pred = pred_ids[mask]
        breakdown[lang] = {
            "n": int(mask.sum()),
            "accuracy": float(accuracy_score(lang_true, lang_pred)),
            "f1_weighted": float(f1_score(lang_true, lang_pred, average="weighted", zero_division=0)),
            "f1_macro": float(f1_score(lang_true, lang_pred, average="macro", zero_division=0)),
        }
    return breakdown


def main() -> int:
    """Evaluate the chosen checkpoint on the test split and publish it as final.

    Loads the model and tokenizer from ``CHECKPOINT_DIR``, runs one prediction
    pass over the ``test`` split found under ``DATA_DIR`` (which must provide
    ``text`` and ``label`` columns), prints a classification report and, when a
    ``language`` column exists, a per-language breakdown. The model, tokenizer,
    ``labels.json`` and ``eval_results.json`` are then written to ``OUT_DIR``.

    Returns:
        ``0`` on success, ``2`` when the checkpoint directory does not exist.
    """
    if not CHECKPOINT_DIR.exists():
        print(f"ERROR: checkpoint not found at {CHECKPOINT_DIR}", file=sys.stderr)
        return 2

    print("=" * 72)
    print("Finalize intent classifier from checkpoint-3336")
    print("=" * 72)
    print(f"  Checkpoint: {CHECKPOINT_DIR}")
    print(f"  Out dir   : {OUT_DIR}")

    # Read as UTF-8 explicitly: the locale default is not UTF-8 everywhere and
    # label names in a multilingual dataset are not guaranteed to be ASCII.
    labels_payload = json.loads(LABELS_FILE.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = labels_payload["label_to_id"]
    id_to_label: dict[int, str] = {int(k): v for k, v in labels_payload["id_to_label"].items()}
    label_names = [id_to_label[i] for i in range(len(id_to_label))]
    num_labels = len(label_names)

    ds = load_from_disk(str(DATA_DIR))
    test_split = ds["test"]
    print(f"  Test rows : {len(test_split)}")

    tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT_DIR))
    model = AutoModelForSequenceClassification.from_pretrained(str(CHECKPOINT_DIR))

    def tokenize(batch: dict[str, list]) -> dict[str, Any]:
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    # Keep only the label column next to the tokenizer outputs.
    drop_cols = [c for c in test_split.column_names if c not in ("label",)]
    test_tok = test_split.map(tokenize, batched=True, remove_columns=drop_cols,
                              desc="Tokenizing test")

    tmp_eval_dir = OUT_DIR / "tmp_eval"
    try:
        eval_args = TrainingArguments(
            output_dir=str(tmp_eval_dir),
            per_device_eval_batch_size=16,
            fp16=torch.cuda.is_available(),
            report_to="none",
            dataloader_num_workers=0,
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=DataCollatorWithPadding(tokenizer),
            compute_metrics=_compute_metrics,
        )

        print("\nEvaluating on TEST split ...")
        # One pass is enough: predict() returns the logits and label ids and
        # also the "test_"-prefixed metrics, so a separate evaluate() call
        # would only repeat the same inference.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
        test_metrics = test_pred.metrics or {}
        test_logits = (test_pred.predictions[0]
                       if isinstance(test_pred.predictions, tuple)
                       else test_pred.predictions)
        pred_ids = np.argmax(test_logits, axis=-1)
        true_ids = test_pred.label_ids

        # Pass the full label list so classes absent from the test split still
        # appear in the report.
        all_label_ids = list(range(num_labels))
        report_dict = classification_report(
            true_ids, pred_ids,
            labels=all_label_ids,
            target_names=label_names,
            output_dict=True, zero_division=0,
        )
        report_text = classification_report(
            true_ids, pred_ids,
            labels=all_label_ids,
            target_names=label_names,
            zero_division=0,
        )
        print("\nClassification report on TEST:")
        print(report_text)

        per_lang: dict[str, dict[str, float]] = {}
        if "language" in test_split.column_names:
            per_lang = _per_language_metrics(test_split["language"], true_ids, pred_ids)
            print("\nPer-language metrics on TEST:")
            for lang, m in per_lang.items():
                print(f"  {lang}: n={m['n']}  acc={m['accuracy']:.4f}  "
                      f"f1_w={m['f1_weighted']:.4f}  f1_m={m['f1_macro']:.4f}")

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        trainer.save_model(str(OUT_DIR))
        tokenizer.save_pretrained(str(OUT_DIR))
        shutil.copy(LABELS_FILE, OUT_DIR / "labels.json")

        payload = {
            "model_name": "distilbert-base-multilingual-cased",
            "task": "intent",
            "num_labels": num_labels,
            "labels": label_to_id,
            "source_checkpoint": str(CHECKPOINT_DIR.relative_to(PROJECT_ROOT)),
            "test_metrics": {k: float(v) for k, v in test_metrics.items()
                             if isinstance(v, (int, float))},
            "classification_report": report_dict,
            "per_language": per_lang,
            "training": {
                "epochs_completed": 3,
                "epochs_planned": 5,
                "note": "training stopped at end of epoch 3; checkpoint accepted as final "
                        "(best eval_f1_macro=0.9402 at epoch 3, curve still improving but "
                        "optimizer state was not saved due to save_only_model=True).",
            },
        }
        # ensure_ascii=False emits raw non-ASCII characters, so the file has to
        # be written as UTF-8 regardless of the platform's default encoding.
        (OUT_DIR / "eval_results.json").write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"\n[OK] Saved final model to {OUT_DIR}")
        print(f"[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}")
    finally:
        # The scratch directory lives inside the final model directory; remove
        # it even when evaluation or saving fails part-way through.
        shutil.rmtree(tmp_eval_dir, ignore_errors=True)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())