from __future__ import annotations

import json
import sys
from pathlib import Path

import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score

# Make the repository root importable when this module is run as a script.
BASE_DIR = Path(__file__).resolve().parent.parent
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))


def load_labeled_rows(path: Path, label_field: str, label2id: dict[str, int]) -> list[dict]:
    """Read a JSONL file and map each record's label string to its integer id."""
    rows = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:  # tolerate blank lines, which would crash json.loads
                continue
            item = json.loads(line)
            rows.append({"text": item["text"], "label": label2id[item[label_field]]})
    return rows


def load_labeled_rows_from_paths(paths: list[Path], label_field: str, label2id: dict[str, int]) -> list[dict]:
    """Concatenate labeled rows from every path that exists; missing files are skipped."""
    rows = []
    for path in paths:
        if not path.exists():
            continue
        rows.extend(load_labeled_rows(path, label_field, label2id))
    return rows


def prepare_dataset(rows: list[dict], tokenizer, max_length: int) -> Dataset:
    """Tokenize rows into a torch-formatted `datasets.Dataset` ready for a Trainer."""
    dataset = Dataset.from_list(rows)

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)

    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.remove_columns(["text"])
    dataset.set_format("torch")
    return dataset


def build_balanced_class_weights(rows: list[dict], num_labels: int) -> torch.Tensor:
    """Per-class weights total / (n_active_classes * count), following sklearn's
    "balanced" heuristic; classes absent from `rows` keep weight 1.0."""
    counts = np.zeros(num_labels, dtype=np.float32)
    for row in rows:
        counts[row["label"]] += 1.0
    nonzero = counts > 0
    if not np.any(nonzero):
        # No labeled rows at all: fall back to uniform weights.
        return torch.ones(num_labels, dtype=torch.float32)
    total = float(counts.sum())
    active_labels = float(np.count_nonzero(nonzero))
    weights = np.ones(num_labels, dtype=np.float32)
    weights[nonzero] = total / (active_labels * counts[nonzero])
    return torch.tensor(weights, dtype=torch.float32)


def build_label_weight_tensor(labels: tuple[str, ...], weight_map: dict[str, float]) -> torch.Tensor:
    """Align a per-label weight mapping with `labels`; unmapped labels default to 1.0."""
    return torch.tensor(
        [float(weight_map.get(label, 1.0)) for label in labels],
        dtype=torch.float32,
    )


def compute_classification_metrics(eval_pred):
    """Accuracy and macro-F1, shaped for a transformers `Trainer` compute_metrics hook."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
    }


def write_json(path: Path, payload: dict) -> None:
    """Write `payload` as pretty-printed, key-sorted JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
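

# ---------------------------------------------------------------------------
# Minimal usage sketch (NOT part of the original module). It shows one way the
# helpers above could compose into a weighted-loss fine-tuning run. The label
# set, the "sentiment" field, the data paths, the checkpoint name, and the
# WeightedTrainer subclass are all illustrative assumptions, not names taken
# from this repository.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        Trainer,
        TrainingArguments,
    )

    labels = ("negative", "neutral", "positive")  # hypothetical label set
    label2id = {label: idx for idx, label in enumerate(labels)}

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=len(labels)
    )

    # Hypothetical JSONL locations; each record needs "text" and "sentiment".
    train_rows = load_labeled_rows_from_paths(
        [BASE_DIR / "data" / "train.jsonl"], "sentiment", label2id
    )
    eval_rows = load_labeled_rows_from_paths(
        [BASE_DIR / "data" / "dev.jsonl"], "sentiment", label2id
    )
    train_dataset = prepare_dataset(train_rows, tokenizer, max_length=128)
    eval_dataset = prepare_dataset(eval_rows, tokenizer, max_length=128)

    # The balanced class weights are typically consumed by a Trainer subclass
    # that feeds them into torch.nn.CrossEntropyLoss.
    class_weights = build_balanced_class_weights(train_rows, num_labels=len(labels))

    class WeightedTrainer(Trainer):
        # Sketch of a loss override; **kwargs absorbs extra arguments that
        # newer transformers versions pass (e.g. num_items_in_batch).
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels_t = inputs.pop("labels")
            outputs = model(**inputs)
            loss_fn = torch.nn.CrossEntropyLoss(
                weight=class_weights.to(outputs.logits.device)
            )
            loss = loss_fn(outputs.logits, labels_t)
            return (loss, outputs) if return_outputs else loss

    trainer = WeightedTrainer(
        model=model,
        args=TrainingArguments(output_dir="out", per_device_train_batch_size=16),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_classification_metrics,
    )
    trainer.train()
    write_json(BASE_DIR / "out" / "metrics.json", trainer.evaluate())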