Spaces:

Willxo
/

Anonymiser

Build error

App Files Files Community

Anonymiser / anonymisation /data.py

Willxo

Space deployment

5c05bce 8 days ago

raw

history blame contribute delete

2.16 kB

	"""
	TAB dataset loader.

	The Text Anonymization Benchmark is 1,268 ECHR court cases, manually
	annotated with entity mentions tagged by type (PERSON, ORG, LOC, etc.) and
	identifier role (DIRECT, QUASI, NO_MASK).

	We pull it directly from the HuggingFace Hub. The first load downloads
	~50 MB and caches it locally; subsequent loads reuse that cache and are
	near-instant.
	"""
	from __future__ import annotations

	from collections import Counter, defaultdict
	from typing import Any, Dict

	from datasets import DatasetDict, load_dataset

	# Dataset path used by load_tab() when the caller does not supply one;
	# it is passed unchanged to datasets.load_dataset().
	DEFAULT_HF_PATH = "ildpil/text-anonymization-benchmark"


	def load_tab(hf_path: str = DEFAULT_HF_PATH) -> DatasetDict:
	    """
	    Fetch the TAB corpus through ``datasets.load_dataset``.

	    Parameters
	    ----------
	    hf_path : dataset path handed straight to ``load_dataset``; defaults to
	        ``DEFAULT_HF_PATH``.

	    Returns
	    -------
	    Whatever ``load_dataset(hf_path)`` yields -- for TAB, a DatasetDict with
	    train/validation/test splits.
	    """
	    tab_splits = load_dataset(hf_path)
	    return tab_splits


	def get_dataset_summary(dataset: DatasetDict) -> Dict[str, Any]:
	    """
	    Tally documents and entity mentions over every split of ``dataset``.

	    Each document is read through its ``"entity_mentions"`` entry, and each
	    mention through its ``"entity_type"`` and ``"identifier_type"`` entries.

	    Returns
	    -------
	    dict with keys:
	    - splits : {split_name: number of documents}
	    - entity_types : Counter mapping entity_type to mention count
	    - identifier_types: Counter mapping identifier_type to mention count
	    - entity_by_id : plain dict {entity_type: Counter(identifier_type -> count)}
	    - per_doc_counts : {split_name: [mention count of each document, ...]}
	    """
	    type_totals: Counter = Counter()
	    role_totals: Counter = Counter()
	    roles_per_type: Dict[str, Counter] = defaultdict(Counter)
	    doc_totals: Dict[str, int] = {}
	    mentions_per_doc: Dict[str, list] = {}

	    for split_name in dataset:
	        documents = dataset[split_name]
	        doc_totals[split_name] = len(documents)

	        # One entry per document, in iteration order, even when it has no mentions.
	        sizes = []
	        for document in documents:
	            mentions = document["entity_mentions"]
	            sizes.append(len(mentions))
	            for mention in mentions:
	                kind = mention["entity_type"]
	                type_totals[kind] += 1
	                role = mention["identifier_type"]
	                role_totals[role] += 1
	                roles_per_type[kind][role] += 1
	        mentions_per_doc[split_name] = sizes

	    summary: Dict[str, Any] = {}
	    summary["splits"] = doc_totals
	    summary["entity_types"] = type_totals
	    summary["identifier_types"] = role_totals
	    # Hand back a plain dict so lookups of unseen entity types raise KeyError
	    # instead of silently inserting an empty Counter.
	    summary["entity_by_id"] = dict(roles_per_type)
	    summary["per_doc_counts"] = mentions_per_doc
	    return summary