"""
TAB dataset loader.

The Text Anonymization Benchmark is 1,268 ECHR court cases, manually
annotated with entity mentions tagged by type (PERSON, ORG, LOC, etc.) and
identifier role (DIRECT, QUASI, NO_MASK).

We pull it directly from the Hugging Face Hub. The first load downloads
~50 MB and caches it locally; subsequent loads are instant.
"""
from __future__ import annotations

from collections import Counter, defaultdict
from typing import Any, Dict

from datasets import DatasetDict, load_dataset

DEFAULT_HF_PATH = "ildpil/text-anonymization-benchmark"


def load_tab(hf_path: str = DEFAULT_HF_PATH) -> DatasetDict:
    """
    Fetch the TAB corpus through ``datasets.load_dataset``.

    Parameters
    ----------
    hf_path : dataset identifier handed unchanged to ``load_dataset``;
              defaults to ``DEFAULT_HF_PATH``.

    Returns
    -------
    A DatasetDict with train/validation/test splits.
    """
    tab_splits = load_dataset(hf_path)
    return tab_splits


def get_dataset_summary(dataset: DatasetDict) -> Dict[str, Any]:
    """
    Summarise every split of a DatasetDict: how many documents it holds and
    how its entity mentions break down by entity type and identifier type.

    Each document is read through its ``"entity_mentions"`` field, and each
    mention through its ``"entity_type"`` and ``"identifier_type"`` fields.

    Returns
    -------
    dict with keys:
      - splits          : {split_name: doc_count}
      - entity_types    : Counter of entity_type → count
      - identifier_types: Counter of identifier_type → count
      - entity_by_id    : {entity_type: Counter(identifier_type → count)}
      - per_doc_counts  : {split_name: [entity_count_per_doc, ...]}
    """
    type_tally: Counter = Counter()
    role_tally: Counter = Counter()
    roles_per_type: Dict[str, Counter] = defaultdict(Counter)
    mentions_per_doc: Dict[str, list] = {}
    split_sizes: Dict[str, int] = {}

    for split_name in dataset:
        documents = dataset[split_name]
        split_sizes[split_name] = len(documents)

        doc_totals: list = []
        mentions_per_doc[split_name] = doc_totals

        for document in documents:
            mentions = document["entity_mentions"]
            doc_totals.append(len(mentions))

            for mention in mentions:
                kind = mention["entity_type"]
                role = mention["identifier_type"]
                type_tally[kind] += 1
                role_tally[role] += 1
                roles_per_type[kind][role] += 1

    # Hand back a plain dict so lookups of unseen entity types raise KeyError
    # instead of silently creating empty Counters.
    summary: Dict[str, Any] = {}
    summary["splits"] = split_sizes
    summary["entity_types"] = type_tally
    summary["identifier_types"] = role_tally
    summary["entity_by_id"] = dict(roles_per_type)
    summary["per_doc_counts"] = mentions_per_doc
    return summary