Spaces:
Sleeping
Sleeping
Update utils/masker3.py
Browse files- utils/masker3.py +25 -3
utils/masker3.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import re
|
| 2 |
import spacy
|
| 3 |
from typing import Dict, Any, List
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# Load spaCy model
|
| 6 |
nlp = spacy.load("en_core_web_sm")
|
|
@@ -9,7 +11,14 @@ def mask_pii(text: str) -> Dict[str, Any]:
|
|
| 9 |
"""
|
| 10 |
Enhanced PII masking with JSON output format
|
| 11 |
"""
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
entities = []
|
| 14 |
|
| 15 |
def mask_and_record(pattern, label, group=0):
|
|
@@ -60,13 +69,26 @@ def mask_pii(text: str) -> Dict[str, Any]:
|
|
| 60 |
})
|
| 61 |
|
| 62 |
# Optional: Set category based on simple rule or ML model
|
| 63 |
-
category = "
|
|
|
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return {
|
| 66 |
-
"input_email_body":
|
| 67 |
"list_of_masked_entities": sorted(entities, key=lambda x: x["position"][0]),
|
| 68 |
"masked_email": masked_text,
|
| 69 |
"category_of_the_email": category
|
| 70 |
}
|
| 71 |
|
| 72 |
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import spacy
|
| 3 |
from typing import Dict, Any, List
|
| 4 |
+
from langdetect import detect
|
| 5 |
+
from deep_translator import GoogleTranslator
|
| 6 |
|
| 7 |
# Load spaCy model
|
| 8 |
nlp = spacy.load("en_core_web_sm")
|
|
|
|
| 11 |
"""
|
| 12 |
Enhanced PII masking with JSON output format
|
| 13 |
"""
|
| 14 |
+
lang = detect(text)
|
| 15 |
+
if lang == 'en':
|
| 16 |
+
#return text
|
| 17 |
+
masked_text = text
|
| 18 |
+
else:
|
| 19 |
+
# Translate to English
|
| 20 |
+
translated = GoogleTranslator(source=lang, target='en').translate(text)
|
| 21 |
+
masked_text = translated
|
| 22 |
entities = []
|
| 23 |
|
| 24 |
def mask_and_record(pattern, label, group=0):
|
|
|
|
| 69 |
})
|
| 70 |
|
| 71 |
# Optional: Set category based on simple rule or ML model
|
| 72 |
+
category = "Problem"
|
| 73 |
+
|
| 74 |
|
| 75 |
+
# if lang == 'en':
|
| 76 |
+
# masked_text = masked_text
|
| 77 |
+
# else:
|
| 78 |
+
# masked_text = GoogleTranslator(source='en', target=lang).translate(masked_text)
|
| 79 |
+
text2 = text
|
| 80 |
+
for ent in entities:
|
| 81 |
+
entity_value = ent['entity']
|
| 82 |
+
classification = ent['classification']
|
| 83 |
+
text = text.replace(entity_value, f"[{classification}]")
|
| 84 |
+
masked_text = text
|
| 85 |
return {
|
| 86 |
+
"input_email_body": text2,
|
| 87 |
"list_of_masked_entities": sorted(entities, key=lambda x: x["position"][0]),
|
| 88 |
"masked_email": masked_text,
|
| 89 |
"category_of_the_email": category
|
| 90 |
}
|
| 91 |
|
| 92 |
|
| 93 |
+
text = "Subject: Unvorhergesehener Absturz der Datenanalyse-Plattform\n\nDie Datenanalyse-Plattform brach unerwartet ab, da die Speicheroberfläche zu gering war My name is Sophia Rossi.. Ich habe versucht, Laravel 8 und meinen MacBook Pro neu zu starten, aber das Problem behält sich bei. Ich benötige Ihre Unterstützung, um diesen Fehler zu beheben. You can reach me at janesmith@company.com."
|
| 94 |
+
print(mask_pii(text))
|