MosaHosseini commited on
Commit
378b61e
·
verified ·
1 Parent(s): 833a0d6

Upload 3 files

Browse files

app.py provides the Gradio interface and the PDF-to-text extraction; anonymize.py implements the text masker.

Files changed (3) hide show
  1. anonymize.py +128 -0
  2. app.py +33 -0
  3. requirements.txt +5 -0
anonymize.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import regex as re
2
+ from typing import List, Tuple, Dict, Set
3
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
4
+
5
+
6
class SwedishTextMasker:
    """Mask personally identifiable information (PII) in Swedish text.

    Combines regex-based masking (digits, phone numbers, organisation
    numbers, emails, street addresses) with NER-based masking of persons,
    organisations and titles using a Swedish token-classification model.
    """

    def __init__(self, model_name: str = "RecordedFuture/Swedish-NER", threshold: float = 0.85):
        """Load the tokenizer/model and build the NER pipeline.

        :param model_name: Hugging Face model id for Swedish NER.
        :param threshold: minimum entity score required for a word to be masked.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer, aggregation_strategy="simple"
        )
        self.threshold = threshold

    def _reconstruct_entities(self, tokens_with_labels: List[Tuple[str, str, float]]) -> List[Tuple[str, str]]:
        """Merge WordPiece sub-tokens ('##...') back into whole words.

        A merged word keeps the label of its first sub-token and the mean
        score of all of its sub-tokens, then the result is filtered through
        :meth:`_in_order_ent_list`.
        """
        words: List[Tuple[str, str, float]] = []
        current_word = ''
        current_label = ''
        scores: List[float] = []

        for token, label, score in tokens_with_labels:
            if token.startswith('##'):
                # Continuation sub-token: glue onto the word in progress.
                current_word += token[2:]
                scores.append(score)
            else:
                if current_word:
                    words.append((current_word, current_label, sum(scores) / len(scores)))
                current_word, current_label = token, label
                scores = [score]

        # Flush the last word in progress.
        if current_word:
            words.append((current_word, current_label, sum(scores) / len(scores)))

        # NOTE: removed leftover debug print of the intermediate result.
        return self._in_order_ent_list(words)

    def _in_order_ent_list(self, all_ents_list: List[Tuple[str, str, float]]) -> List[Tuple[str, str]]:
        """Keep entities (in original order) whose surface word scored at or
        above the threshold at least once and is >= 2 characters long."""
        threshold_words = {ent[0] for ent in all_ents_list if ent[2] >= self.threshold}
        return [
            (word, label)
            for word, label, _score in all_ents_list
            if word in threshold_words and len(word) >= 2
        ]

    def _get_chunks(self, text_list: List[str], chunk_size: int = 100) -> List[List[str]]:
        """Split a token list into consecutive chunks of at most ``chunk_size``."""
        return [text_list[i:i + chunk_size] for i in range(0, len(text_list), chunk_size)]

    def _retrieve_ner(self, text: str) -> List[Tuple[str, str, float]]:
        """Run the NER pipeline and keep ORG/PER/TIT entities as (word, group, score)."""
        results = self.ner_pipeline(text)
        return [
            (ent["word"], ent["entity_group"], ent["score"])
            for ent in results
            if ent["entity_group"] in {"ORG", "PER", "TIT"}
        ]

    def _get_entities(self, text: str) -> List[Tuple[str, str, float]]:
        """Extract entities, chunking texts longer than 100 whitespace tokens
        so each pipeline call stays short."""
        tokens = text.split()
        if len(tokens) <= 100:
            return self._retrieve_ner(text)

        all_ents: List[Tuple[str, str, float]] = []
        for chunk in self._get_chunks(tokens):
            all_ents.extend(self._retrieve_ner(" ".join(chunk)))
        return all_ents

    def _get_entity_dicts(self, entities: List[Tuple[str, str]]) -> Tuple[Dict[str, str], Dict[str, str], Set[str]]:
        """Map person/company names to anonymous labels ('Person A', ...).

        Note: receives the (word, label) 2-tuples produced by
        :meth:`_reconstruct_entities` (the original annotation wrongly said
        3-tuples; only ent[0]/ent[1] are read).
        """
        persons = [ent[0] for ent in entities if ent[1] == "PER"]
        companies = [ent[0] for ent in entities if ent[1] == "ORG"]
        titles = {ent[0] for ent in entities if ent[1] == "TIT"}

        # dict.fromkeys deduplicates while preserving first-seen order, so
        # label letters (A, B, C, ...) follow order of appearance.
        person_dict = {name: f"Person {chr(ord('A') + i)}" for i, name in enumerate(dict.fromkeys(persons))}
        company_dict = {name: f"ORG_COMPANY {chr(ord('A') + i)}" for i, name in enumerate(dict.fromkeys(companies))}

        return person_dict, company_dict, titles

    @staticmethod
    def mask_digits(text: str) -> str:
        """Replace every digit in ``text`` with 'x'."""
        return re.sub(r'\d', 'x', text)

    def mask_phone_numbers(self, text: str) -> str:
        """Mask the digits inside phone-number-like sequences (optionally
        prefixed with '+' or '00'); separators are kept."""
        phone_regex = re.compile(r'(?:\+|00)?\d[\d\s\-()]{5,}\d')
        return phone_regex.sub(lambda m: self.mask_digits(m.group()), text)

    def mask_org_numbers(self, text: str) -> str:
        """Mask the digits of Swedish organisation numbers (NNNNNN-NNNN)."""
        org_regex = re.compile(r'\b\d{6}-?\d{4}\b')
        return org_regex.sub(lambda m: self.mask_digits(m.group()), text)

    def mask_emails(self, text: str) -> str:
        """Mask alphanumerics in both the local part and the domain of an
        email address, keeping '@', dots and other separators."""
        email_regex = re.compile(r'\b([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b')

        def email_masker(match):
            local, domain = match.groups()
            return f"{re.sub(r'[a-zA-Z0-9]', 'x', local)}@{re.sub(r'[a-zA-Z0-9]', 'x', domain)}"

        return email_regex.sub(email_masker, text)

    def mask_addresses(self, text: str) -> str:
        """Replace street-address-like spans (words + house number + Swedish
        postcode + city) with the literal tag ``[ADDRESS]``."""
        address_regex = re.compile(
            r'\b(?:[A-ZÅÄÖa-zåäöéÉèÈçÇß0-9\-]+\s)+\d{1,4}\s*,?\s*\d{3}\s?\d{2}\s+[A-ZÅÄÖa-zåäö\-]+',
            re.UNICODE
        )
        return address_regex.sub('[ADDRESS]', text)

    def mask_entities(self, text: str, entity_dict: Dict[str, str], tag: str) -> str:
        """Replace each entity occurrence with its anonymous label in brackets.

        ``tag`` is currently unused but kept for interface compatibility.
        """
        for original, masked in entity_dict.items():
            text = re.sub(re.escape(original), f"[{masked}]", text)
        return text

    def mask_titles(self, text: str, titles: Set[str]) -> str:
        """Replace every detected title with the ``[Person_Title]`` tag."""
        for title in titles:
            text = re.sub(re.escape(title), "[Person_Title]", text)
        return text

    def mask_all(self, text: str) -> str:
        """Apply all masking steps and return the anonymized text.

        NER runs on the ORIGINAL text so that regex masking cannot hide
        entities from the model; entity replacement then runs on the
        regex-masked text.
        """
        original_text = text
        # NOTE: removed leftover debug print of the original text.
        text = self.mask_phone_numbers(text)
        text = self.mask_org_numbers(text)
        text = self.mask_emails(text)
        text = self.mask_addresses(text)

        ents_raw = self._get_entities(original_text)
        ents = self._reconstruct_entities(ents_raw)
        person_dict, company_dict, title_set = self._get_entity_dicts(ents)

        text = self.mask_entities(text, company_dict, "ORG")
        text = self.mask_entities(text, person_dict, "PER")
        text = self.mask_titles(text, title_set)

        return text
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import gradio as gr
3
+ from anonymize import SwedishTextMasker
4
+
5
# Instantiate the masker once at module import time so the (slow) model
# download/load happens on startup, not once per request.
text_anonymizer = SwedishTextMasker(threshold= 0.9)
7
+
8
def extract_text_from_pdf(pdf_file):
    """Extract the text of an uploaded PDF and return it anonymized.

    :param pdf_file: Gradio file object (has a ``.name`` path), or ``None``
        when nothing was uploaded.
    :return: the anonymized text, or an error message for missing input.
    """
    if pdf_file is None:
        # Fixed typo in the user-facing message ("uploa9999ded" -> "uploaded").
        return "No file uploaded."

    # Open via the file path (usually safer than passing a stream for
    # gradio uploads); join page texts in one pass instead of += in a loop.
    with fitz.open(pdf_file.name) as doc:
        raw_text = "".join(page.get_text() for page in doc).strip()

    anonymized_text = text_anonymizer.mask_all(raw_text)
    return anonymized_text
21
+
22
# Gradio interface: single row with a PDF upload on the left and the
# anonymized-text output on the right, plus a button that triggers the
# extraction + anonymization pipeline.
# NOTE(review): the "??" in the heading looks like a garbled emoji from a
# copy/paste or encoding loss — confirm the intended character.
with gr.Blocks(title="PDF -> Anonymized Text") as demo:
    gr.Markdown("### ?? PDF Anonymizer (text only, skips images)")
    with gr.Row():
        pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
        text_output = gr.Textbox(label="Anonymized Output", lines=20, interactive=False)

    extract_button = gr.Button("Anonymize Text")
    # Wire the button to the extraction function defined above.
    extract_button.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers==4.52.4
2
+ torch==2.7.0
3
+ regex==2024.11.6
4
+ gradio==5.32.1
5
+ PyMuPDF==1.26.3