disabling language detection
Browse files- classes/prompt_sanitizer.py +13 -10
classes/prompt_sanitizer.py
CHANGED
|
@@ -102,20 +102,23 @@ class PromptSanitizer:
|
|
| 102 |
return text
|
| 103 |
|
| 104 |
# 1. Automatic Language Detection
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
results = self.analyzer.analyze(
|
| 115 |
text=text, entities=self.target_entities, language=lang
|
| 116 |
)
|
| 117 |
|
| 118 |
-
#
|
| 119 |
anonymized_result = self.anonymizer.anonymize(
|
| 120 |
text=text,
|
| 121 |
analyzer_results=results, # pyright: ignore[reportArgumentType]
|
|
|
|
| 102 |
return text
|
| 103 |
|
| 104 |
# 1. Automatic Language Detection
|
| 105 |
+
# Disabled: this step adds a lot of overhead to every LLM call (almost 1s), so the language is hard-coded to "en" below.
|
| 106 |
+
# try:
|
| 107 |
+
# detected_lang = detect(text)
|
| 108 |
+
# # Presidio needs to know if we are using 'en' or 'fr'
|
| 109 |
+
# # If it detects something else, we default to 'en'
|
| 110 |
+
# lang = detected_lang if detected_lang in ["en", "fr"] else "en"
|
| 111 |
+
# except Exception:
|
| 112 |
+
# lang = "en"
|
| 113 |
+
|
| 114 |
+
lang = "en"
|
| 115 |
+
|
| 116 |
+
# 2. Detect PII
|
| 117 |
results = self.analyzer.analyze(
|
| 118 |
text=text, entities=self.target_entities, language=lang
|
| 119 |
)
|
| 120 |
|
| 121 |
+
# 3. Redact PII
|
| 122 |
anonymized_result = self.anonymizer.anonymize(
|
| 123 |
text=text,
|
| 124 |
analyzer_results=results, # pyright: ignore[reportArgumentType]
|