# Willxo's picture
# Space deployment
# 5c05bce
"""
TAB dataset loader.
The Text Anonymization Benchmark is 1,268 ECHR court cases, manually
annotated with entity mentions tagged by type (PERSON, ORG, LOC, etc.) and
identifier role (DIRECT, QUASI, NO_MASK).
We pull it directly from the HuggingFace Hub. First load is ~50 MB and
caches locally; subsequent loads are instant.
"""
from __future__ import annotations
from collections import Counter, defaultdict
from typing import Any, Dict
from datasets import DatasetDict, load_dataset
# Dataset path used as the default `hf_path` argument of `load_tab`.
DEFAULT_HF_PATH = "ildpil/text-anonymization-benchmark"
def load_tab(hf_path: str = DEFAULT_HF_PATH) -> DatasetDict:
    """
    Fetch the TAB corpus through ``load_dataset``.

    Parameters
    ----------
    hf_path : dataset path handed unchanged to ``load_dataset``;
        defaults to ``DEFAULT_HF_PATH``.

    Returns
    -------
    The object produced by ``load_dataset(hf_path)`` -- expected to be a
    DatasetDict with train/validation/test splits.
    """
    tab_splits = load_dataset(hf_path)
    return tab_splits
def get_dataset_summary(dataset: DatasetDict) -> Dict[str, Any]:
    """
    Compute split sizes, entity-type counts, and identifier-type counts
    across an entire DatasetDict.

    Parameters
    ----------
    dataset : mapping of split name -> sized, iterable collection of
        documents. Each document is indexed with ``"entity_mentions"``;
        each mention is indexed with ``"entity_type"`` and
        ``"identifier_type"``.

    Returns
    -------
    dict with keys:
    - splits          : {split_name: doc_count}
    - entity_types    : Counter of entity_type -> count
    - identifier_types: Counter of identifier_type -> count
    - entity_by_id    : {entity_type: Counter(identifier_type -> count)}
    - per_doc_counts  : {split_name: [entity_count_per_doc, ...]}

    Raises
    ------
    KeyError
        If a document has no ``"entity_mentions"`` key, or a mention lacks
        ``"entity_type"`` / ``"identifier_type"``.
    """
    entity_types: Counter = Counter()
    identifier_types: Counter = Counter()
    entity_by_id: Dict[str, Counter] = defaultdict(Counter)
    per_doc_counts: Dict[str, list] = {}
    splits: Dict[str, int] = {}

    for split in dataset:
        docs = dataset[split]
        splits[split] = len(docs)
        # Register the split up front so empty splits still get an entry.
        counts = per_doc_counts[split] = []
        for doc in docs:
            mentions = doc["entity_mentions"]
            if mentions is None:
                # A null annotation list means "no mentions", not an error.
                mentions = ()
            counts.append(len(mentions))
            for em in mentions:
                entity_type = em["entity_type"]
                identifier_type = em["identifier_type"]
                entity_types[entity_type] += 1
                identifier_types[identifier_type] += 1
                entity_by_id[entity_type][identifier_type] += 1

    return {
        "splits": splits,
        "entity_types": entity_types,
        "identifier_types": identifier_types,
        # Plain dict so lookups of unseen entity types raise instead of
        # silently inserting empty Counters.
        "entity_by_id": dict(entity_by_id),
        "per_doc_counts": per_doc_counts,
    }