CaPE / pseudonymizer.py
amartyasaran's picture
Dev Ready
d242fb9
from typing import Dict, List, Tuple
import hashlib
class Pseudonymizer:
    """Replace detected entities in text with stable, hash-derived placeholders.

    A placeholder has the form ``[<label>_<hash>]`` where ``<hash>`` is derived
    from the entity's surface text, so the same (label, text) pair always maps
    to the same placeholder. Assigned placeholders are remembered per entity
    type in ``entity_map`` for the lifetime of the instance.
    """

    def __init__(self):
        # Stores mapping of original entity -> pseudonym per entity type
        self.entity_map: Dict[str, Dict[str, str]] = {}

    def _get_hash(self, text: str) -> str:
        """Return a deterministic short hash of ``text``.

        The result is the first 6 hex digits of the MD5 digest of the encoded
        text.

        NOTE(review): 6 hex digits is only 24 bits, so two different entities
        can end up with the same placeholder; the hash is used purely as a
        stable label, not as a security measure.
        """
        return hashlib.md5(text.encode()).hexdigest()[:6]

    def pseudonymize(self, text: str, entities: List[Dict]) -> Tuple[str, Dict[str, str]]:
        """Replace detected entities in ``text`` with consistent pseudonyms.

        Args:
            text: The original text the entity offsets refer to.
            entities: Detected entities. Each item is read via the keys
                ``"entity_group"`` (label), ``"word"`` (surface text),
                ``"start"`` and ``"end"`` (character offsets into ``text``,
                end-exclusive).

        Returns:
            A tuple ``(pseudonymized_text, mapping)`` where ``mapping`` maps
            each replaced entity's ``"word"`` to the placeholder used for it.

        Entities are processed left to right. An entity that starts inside a
        span that has already been replaced (overlapping or nested detection)
        is skipped, because splicing it in would land in the middle of the
        previous placeholder and corrupt the output. When several entities
        start at the same offset, the longest one wins. Skipped entities are
        neither added to ``entity_map`` nor to the returned mapping.
        """
        local_mapping: Dict[str, str] = {}
        pieces: List[str] = []
        # Index into the ORIGINAL text up to which output has been emitted.
        cursor = 0

        # Sort by start; for equal starts put the longest span first so it is
        # the one that survives the overlap check below.
        ordered = sorted(entities, key=lambda ent: (ent["start"], -ent["end"]))

        for ent in ordered:
            start = ent["start"]
            end = ent["end"]
            if start < cursor:
                # Overlaps a region that was already replaced; skip it.
                continue

            label = ent["entity_group"]
            word = ent["word"]

            # Reuse the placeholder previously assigned to this (label, word),
            # creating and remembering it on first sight.
            label_map = self.entity_map.setdefault(label, {})
            replacement = label_map.get(word)
            if replacement is None:
                replacement = f"[{label}_{self._get_hash(word)}]"
                label_map[word] = replacement
            local_mapping[word] = replacement

            # Emit the untouched text before the entity, then the placeholder.
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end

        # Remainder of the text after the last replaced entity.
        pieces.append(text[cursor:])
        return "".join(pieces), local_mapping