File size: 1,524 Bytes
d242fb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from typing import Dict, List, Tuple
import hashlib

class Pseudonymizer:
    """Replace detected entities in text with stable, label-tagged pseudonyms.

    The instance remembers every pseudonym it has handed out, so the same
    ``(label, word)`` pair maps to the same pseudonym across calls.
    """

    def __init__(self):
        # Stores mapping of original entity -> pseudonym per entity type:
        # {label: {word: pseudonym}}
        self.entity_map: Dict[str, Dict[str, str]] = {}

    def _get_hash(self, text: str) -> str:
        """Return the first 6 hex characters of the MD5 digest of *text*.

        The digest is used only to obtain a short deterministic token.
        NOTE(review): an unsalted, truncated hash can be reversed by guessing
        candidate inputs; confirm this is acceptable for the data handled.
        """
        return hashlib.md5(text.encode()).hexdigest()[:6]

    def pseudonymize(self, text: str, entities: List[Dict]) -> Tuple[str, Dict[str, str]]:
        """Replace detected entities in *text* with consistent pseudonyms.

        Args:
            text: The original text the entity offsets refer to.
            entities: Entity dicts, each providing the keys ``"entity_group"``
                (label), ``"word"`` (surface form used for the pseudonym),
                and ``"start"`` / ``"end"`` (character offsets into *text*,
                end exclusive).

        Returns:
            A tuple ``(pseudonymized_text, mapping)`` where *mapping* maps
            each replaced ``word`` to the pseudonym used in this call.

        Entities are applied left to right. When spans overlap, the span
        that starts first wins (the longest one on a tie) and any span that
        begins inside an already replaced region is skipped, so a pseudonym
        is never spliced into the middle of another pseudonym.
        """
        pieces: List[str] = []
        local_mapping: Dict[str, str] = {}
        cursor = 0  # position in the ORIGINAL text up to which output exists

        # Sort by start; for equal starts prefer the longest span so the
        # widest entity is the one that gets replaced.
        for ent in sorted(entities, key=lambda e: (e["start"], -e["end"])):
            start, end = ent["start"], ent["end"]

            # Span begins inside text that was already replaced: skip it
            # instead of corrupting the previous replacement.
            if start < cursor:
                continue

            label = ent["entity_group"]
            word = ent["word"]

            label_map = self.entity_map.setdefault(label, {})
            if word not in label_map:
                label_map[word] = f"[{label}_{self._get_hash(word)}]"

            replacement = label_map[word]
            local_mapping[word] = replacement

            # Copy the untouched text before the entity, then the pseudonym.
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end

        # Remaining tail after the last replaced entity.
        pieces.append(text[cursor:])
        return "".join(pieces), local_mapping