Spaces:
Build error
Build error
| """ | |
| TAB dataset loader. | |
| The Text Anonymization Benchmark is 1,268 ECHR court cases, manually | |
| annotated with entity mentions tagged by type (PERSON, ORG, LOC, etc.) and | |
| identifier role (DIRECT, QUASI, NO_MASK). | |
| We pull it directly from the HuggingFace Hub. First load is ~50 MB and | |
| caches locally; subsequent loads are instant. | |
| """ | |
| from __future__ import annotations | |
| from collections import Counter, defaultdict | |
| from typing import Any, Dict | |
| from datasets import DatasetDict, load_dataset | |
# Hub repository id used when the caller does not supply one.
DEFAULT_HF_PATH = "ildpil/text-anonymization-benchmark"


def load_tab(hf_path: str = DEFAULT_HF_PATH) -> DatasetDict:
    """
    Fetch the TAB corpus through ``datasets.load_dataset``.

    Parameters
    ----------
    hf_path : repository id (or path) handed straight to ``load_dataset``;
        defaults to ``DEFAULT_HF_PATH``.

    Returns
    -------
    The object produced by ``load_dataset(hf_path)``, annotated as a
    DatasetDict (documented upstream as holding train/validation/test
    splits).
    """
    tab_corpus = load_dataset(hf_path)
    return tab_corpus
def get_dataset_summary(dataset: DatasetDict) -> Dict[str, Any]:
    """
    Tally documents and entity mentions over every split of ``dataset``.

    Each document is read via ``doc["entity_mentions"]``, and each mention
    via its ``"entity_type"`` and ``"identifier_type"`` keys.

    Returns
    -------
    dict with keys:
    - splits          : {split_name: doc_count}
    - entity_types    : Counter of entity_type -> count
    - identifier_types: Counter of identifier_type -> count
    - entity_by_id    : {entity_type: Counter(identifier_type -> count)}
    - per_doc_counts  : {split_name: [entity_count_per_doc, ...]}
    """
    type_tally: Counter = Counter()
    role_tally: Counter = Counter()
    roles_per_type: Dict[str, Counter] = defaultdict(Counter)
    split_sizes: Dict[str, int] = {}
    # Pre-seed one list per split so the split appears even when it has no docs.
    mentions_per_doc: Dict[str, list] = {name: [] for name in dataset}

    for name in dataset:
        documents = dataset[name]
        split_sizes[name] = len(documents)
        doc_totals = mentions_per_doc[name]

        for record in documents:
            mentions = record["entity_mentions"]
            doc_totals.append(len(mentions))

            for mention in mentions:
                kind = mention["entity_type"]
                role = mention["identifier_type"]
                type_tally[kind] += 1
                role_tally[role] += 1
                roles_per_type[kind][role] += 1

    # Hand back a plain dict so lookups of unseen entity types raise KeyError
    # instead of silently creating empty Counters.
    return dict(
        splits=split_sizes,
        entity_types=type_tally,
        identifier_types=role_tally,
        entity_by_id=dict(roles_per_type),
        per_doc_counts=mentions_per_doc,
    )