Spaces:
Sleeping
Sleeping
| from typing import Dict, List, Tuple | |
| import hashlib | |
class Pseudonymizer:
    """Replace detected entities in text with consistent, label-tagged pseudonyms.

    Pseudonyms are cached per instance in ``entity_map``, so the same
    ``(label, word)`` pair always yields the same replacement across calls.
    """

    def __init__(self):
        # Stores mapping of original entity -> pseudonym per entity type:
        # {label: {original word: pseudonym}}
        self.entity_map: Dict[str, Dict[str, str]] = {}

    def _get_hash(self, text: str) -> str:
        """Return a deterministic short hash of *text*.

        The result is the first 6 hex digits of the MD5 digest of the
        encoded text. Only 24 bits are kept, so distinct inputs can collide,
        and the hash is unsalted, so it is an identifier rather than a
        secret.
        """
        return hashlib.md5(text.encode()).hexdigest()[:6]

    def pseudonymize(self, text: str, entities: List[Dict]) -> Tuple[str, Dict[str, str]]:
        """Replace detected entities in *text* with consistent pseudonyms.

        Args:
            text: The original text.
            entities: Entity dicts, each providing ``"entity_group"`` (label),
                ``"word"`` (original surface form) and ``"start"`` / ``"end"``
                character offsets into *text*. They may be given in any order.

        Returns:
            A tuple ``(pseudonymized_text, mapping)`` where ``mapping`` maps
            each replaced ``word`` to the pseudonym used for it in this call.

        Overlapping spans are resolved in order of ``start``: a span fully
        covered by an earlier replacement is skipped, and a span that only
        partly overlaps is trimmed to its uncovered tail, so no original
        entity text is left behind and no pseudonym is overwritten.
        """
        pieces: List[str] = []
        local_mapping: Dict[str, str] = {}
        # Position in the ORIGINAL text up to which output has been emitted.
        cursor = 0

        for ent in sorted(entities, key=lambda e: e["start"]):
            label = ent["entity_group"]
            word = ent["word"]
            start = ent["start"]
            end = ent["end"]

            if start < cursor:
                if end <= cursor:
                    # Entirely inside a span that was already replaced.
                    continue
                # Partial overlap: only the uncovered tail still needs masking.
                start = cursor

            label_map = self.entity_map.setdefault(label, {})
            if word not in label_map:
                label_map[word] = f"[{label}_{self._get_hash(word)}]"
            replacement = label_map[word]
            local_mapping[word] = replacement

            # Copy the untouched text before the entity, then the pseudonym.
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end

        pieces.append(text[cursor:])
        return "".join(pieces), local_mapping