File size: 2,399 Bytes
5c05bce
 
 
 
 
 
 
 
 
 
 
 
 
44361bf
5c05bce
44361bf
 
5c05bce
 
 
 
44361bf
5c05bce
44361bf
5c05bce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
TAB dataset loader.

The Text Anonymization Benchmark is 1,268 ECHR court cases, manually
annotated with entity mentions tagged by type (PERSON, ORG, LOC, etc.) and
identifier role (DIRECT, QUASI, NO_MASK).

We pull it directly from the HuggingFace Hub. First load is ~50 MB and
caches locally; subsequent loads are instant.
"""
from __future__ import annotations

from collections import Counter, defaultdict
from typing import TYPE_CHECKING, Any, Dict

# Imported for type checkers only, which keeps `import anonymisation` light:
# neither datasets nor pyarrow is needed just to import the package.
if TYPE_CHECKING:
    from datasets import DatasetDict

DEFAULT_HF_PATH = "ildpil/text-anonymization-benchmark"


def load_tab(hf_path: str = DEFAULT_HF_PATH) -> "DatasetDict":
    """
    Fetch the TAB corpus from the HuggingFace Hub.

    Parameters
    ----------
    hf_path : Hub dataset identifier handed to ``datasets.load_dataset``;
        defaults to ``DEFAULT_HF_PATH``.

    Returns
    -------
    The DatasetDict produced by ``datasets.load_dataset`` (train/validation/test
    splits).
    """
    # Deferred import: `datasets` is only required once data is actually
    # loaded, so importing this module does not pull the dependency in.
    import datasets

    tab = datasets.load_dataset(hf_path)
    return tab


def get_dataset_summary(dataset: DatasetDict) -> Dict[str, Any]:
    """
    Tally documents and entity mentions over every split of a DatasetDict.

    Each document is read through its "entity_mentions" field, and each
    mention through its "entity_type" and "identifier_type" fields.

    Returns
    -------
    dict with keys:
      - splits          : {split_name: doc_count}
      - entity_types    : Counter of entity_type -> count
      - identifier_types: Counter of identifier_type -> count
      - entity_by_id    : {entity_type: Counter(identifier_type -> count)}
      - per_doc_counts  : {split_name: [entity_count_per_doc, ...]}
    """
    doc_totals: Dict[str, int] = {}
    mentions_per_doc: Dict[str, list] = {}
    type_tally: Counter = Counter()
    role_tally: Counter = Counter()
    roles_by_type: Dict[str, Counter] = defaultdict(Counter)

    for split_name in dataset:
        documents = dataset[split_name]
        doc_totals[split_name] = len(documents)
        sizes = mentions_per_doc[split_name] = []

        for document in documents:
            mentions = document["entity_mentions"]
            sizes.append(len(mentions))

            for mention in mentions:
                kind = mention["entity_type"]
                role = mention["identifier_type"]
                type_tally[kind] += 1
                role_tally[role] += 1
                roles_by_type[kind][role] += 1

    summary: Dict[str, Any] = {}
    summary["splits"] = doc_totals
    summary["entity_types"] = type_tally
    summary["identifier_types"] = role_tally
    # Hand back a plain dict so lookups of unseen entity types raise KeyError
    # instead of silently inserting an empty Counter.
    summary["entity_by_id"] = dict(roles_by_type)
    summary["per_doc_counts"] = mentions_per_doc
    return summary