Spaces:
Sleeping
Sleeping
File size: 1,524 Bytes
d242fb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
from typing import Dict, List, Tuple
import hashlib
class Pseudonymizer:
    """Replace detected entities in text with consistent, hash-derived pseudonyms.

    A pseudonym has the form ``[<label>_<6 hex chars>]``. The instance
    remembers every pseudonym it has issued (per entity label), so the same
    surface form receives the same pseudonym across calls.
    """

    def __init__(self):
        # Stores mapping of original entity -> pseudonym per entity type
        self.entity_map: Dict[str, Dict[str, str]] = {}

    def _get_hash(self, text: str) -> str:
        """Return a deterministic 6-character hex digest prefix for *text*.

        NOTE: this is a truncated, unsalted MD5 digest. It only provides a
        stable short identifier; distinct inputs can collide on 6 hex
        characters, and it should not be relied on as a security boundary.
        """
        return hashlib.md5(text.encode()).hexdigest()[:6]

    def pseudonymize(self, text: str, entities: List[Dict]) -> Tuple[str, Dict[str, str]]:
        """Replace detected entities in *text* with consistent pseudonyms.

        Args:
            text: The original text the entity offsets refer to.
            entities: Detected entities. Each dict must provide ``"start"``
                and ``"end"`` (character offsets into *text*),
                ``"entity_group"`` (the label) and ``"word"`` (the surface
                form used as the mapping key).

        Returns:
            A tuple ``(pseudonymized_text, mapping)`` where *mapping* maps
            each replaced ``word`` to the pseudonym used for it in this call.

        Raises:
            KeyError: If an entity dict lacks one of the required keys.

        Entities are processed left to right. When spans overlap, only the
        earliest-starting one (the widest, if several start at the same
        offset) is replaced; the others are covered by that replacement and
        do not appear in the returned mapping.
        """
        pieces: List[str] = []
        local_mapping: Dict[str, str] = {}
        # Offset into the ORIGINAL text up to which output has been emitted.
        # Working from original offsets avoids running offset corrections.
        cursor = 0

        ordered = sorted(entities, key=lambda ent: (ent["start"], -ent["end"]))
        for ent in ordered:
            start, end = ent["start"], ent["end"]

            if start < cursor:
                # Overlaps a span that was already replaced. Splicing it in
                # would cut into the previous pseudonym, so skip it, but
                # swallow any tail that sticks out past the previous span so
                # no fragment of a detected entity leaks into the output.
                cursor = max(cursor, end)
                continue

            label = ent["entity_group"]
            word = ent["word"]

            known = self.entity_map.setdefault(label, {})
            replacement = known.get(word)
            if replacement is None:
                replacement = f"[{label}_{self._get_hash(word)}]"
                known[word] = replacement
            local_mapping[word] = replacement

            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end

        pieces.append(text[cursor:])
        return "".join(pieces), local_mapping
|