qyle committed on
Commit
3da1373
·
verified ·
1 Parent(s): 4d7fbb1

pii filter improved

Browse files
Dockerfile CHANGED
@@ -2,24 +2,12 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- libmagic1 \
7
- libxcb1 \
8
- libx11-6 \
9
- libxext6 \
10
- libxrender1 \
11
- libgl1 \
12
- libglib2.0-0 \
13
- libsm6 \
14
- && rm -rf /var/lib/apt/lists/*
15
-
16
  COPY requirements.txt .
17
  COPY pyproject.toml .
18
  RUN pip install uv
19
  RUN uv pip install --no-cache-dir -r requirements.txt --system
20
 
21
- RUN python -m spacy download en_core_web_lg
22
- RUN python -m spacy download fr_core_news_lg
23
 
24
  COPY . .
25
 
 
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies first: this layer changes rarely, so keeping it
# before the requirements COPY preserves the Docker build cache when Python
# dependencies change.  --no-install-recommends and removing the apt lists
# keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libmagic1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
COPY pyproject.toml .
RUN pip install uv
RUN uv pip install --no-cache-dir -r requirements.txt --system

COPY . .
README.md CHANGED
@@ -161,7 +161,7 @@ The test cases are defined in the folder `/tests/stress_tests/`:
161
  #### Chat session test scenario
162
  The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
163
  ```
164
- k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-bot.hf.space/chat
165
  ```
166
  The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
167
 
 
161
  #### Chat session test scenario
162
  The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
163
  ```
164
+ k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-chatbot.hf.space
165
  ```
166
  The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
167
 
classes/pii_filter.py CHANGED
@@ -1,14 +1,34 @@
1
  import logging
2
- from typing import List, Optional
3
- from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
4
- from presidio_analyzer.nlp_engine import NlpEngineProvider
5
- from presidio_anonymizer import AnonymizerEngine
6
- from presidio_anonymizer.entities import OperatorConfig
7
 
8
- # from lingua import Language, LanguageDetector
9
  logger = logging.getLogger("uvicorn")
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def clean_backslashes(txt: str) -> str:
13
  """Cleans backslashes from a string.
14
 
@@ -25,196 +45,103 @@ def clean_backslashes(txt: str) -> str:
25
  return txt.replace("\\'", "'")
26
 
27
 
28
- def create_ssn_pattern_recognizer():
29
- # matches 111-111-111, 111 111 111, and 111111111
30
- ssn_pattern = Pattern(
31
- name="ssn_pattern", regex=r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b", score=0.9
32
- )
33
- fuzzy_sin_pattern = Pattern(
34
- name="fuzzy_sin_pattern",
35
- regex=r"\b[\dlIOS]{3}[- ]?[\dlIOS]{3}[- ]?[\dlIOS]{3}\b",
36
- score=0.8,
37
- )
38
- return PatternRecognizer(
39
- supported_entity="SSN", patterns=[ssn_pattern, fuzzy_sin_pattern]
40
- )
41
-
42
-
43
- def create_zip_code_pattern_recognizer():
44
- zip_code_pattern = Pattern(
45
- name="zip_code_pattern",
46
- regex=r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b", # Matches A1A 1A1 and A1A1A1
47
- score=0.9,
48
- )
49
- fuzzy_zip_code_pattern = Pattern(
50
- name="fuzzy_zip_code_pattern",
51
- regex=r"\b[A-Z][\dlIOS][A-Z]\s?[\dlIOS][A-Z][\dlIOS]\b",
52
- score=0.8,
53
- )
54
- return PatternRecognizer(
55
- supported_entity="ZIP_CODE", patterns=[zip_code_pattern, fuzzy_zip_code_pattern]
56
- )
57
-
58
-
59
- def create_street_pattern_recognizer():
60
- bilingual_street_regex = (
61
- r"\d+\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)"
62
- r"\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+"
63
- r"(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
64
- r"|(?:\d+\s+)?[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
65
- r"\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)\b"
66
- )
67
-
68
- street_pattern = Pattern(
69
- name="street_pattern", regex=bilingual_street_regex, score=0.8
70
- )
71
- return PatternRecognizer(
72
- supported_entity="STREET_ADDRESS", patterns=[street_pattern]
73
- )
74
-
75
-
76
- # The default phone pattern recognizer does not catch some edge cases.
77
- def create_phone_pattern_recognizer():
78
  """
79
- Create a custom phone pattern recognizer to catch additional phone formats.
80
- Matches various North American phone formats:
81
- - 123-456-7890 (with dashes)
82
- - 123 456 7890 (with spaces)
83
- - (123) 456-7890 (with parentheses)
84
- - (123) 456 7890 (with parentheses and spaces)
85
- - +1-123-456-7890 (with country code and dashes)
86
- - +1 (123) 456-7890 (with country code, parentheses, and dashes)
87
- - +1 123 456 7890 (with country code and spaces)
88
  """
89
- phone_pattern = Pattern(
90
- name="phone_pattern",
91
- regex=r"(?:\+\d{1,3}[-\s]?)?\(?(?:\d{3})\)?[-\s]?\d{3}[-\s]?\d{4}",
92
- score=0.9,
93
- )
94
- fuzzy_phone_pattern = Pattern(
95
- name="fuzzy_phone_pattern",
96
- regex=r"(?:\+[\dlIOS]{1,3}[-\s]?)?\(?(?:[\dlIOS]{3})\)?[-\s]?[\dlIOS]{3}[-\s]?[\dlIOS]{4}",
97
- score=0.8,
98
- )
99
- return PatternRecognizer(
100
- supported_entity="PHONE_NUMBER", patterns=[phone_pattern, fuzzy_phone_pattern]
101
- )
 
 
 
 
 
 
102
 
103
 
104
  class PIIFilter:
105
  _instance: Optional["PIIFilter"] = None
106
- analyzer: AnalyzerEngine
107
- anonymizer: AnonymizerEngine
108
- operators: dict
109
- target_entities: List[str]
110
- white_list = [
111
- "salut",
112
- "bonjour",
113
- "comment",
114
- "fort", # Par exemple, "Il tousse fort".
115
- "Salut",
116
- "Bonjour",
117
- "Comment",
118
- "fievre",
119
- "fièvre",
120
- "Fievre",
121
- "Fièvre",
122
- "tu",
123
- "Tu",
124
- ]
125
 
126
  def __new__(cls):
127
  if cls._instance is None:
128
- logger.info("Loading the prompt sanitizer into memory...")
129
  cls._instance = super(PIIFilter, cls).__new__(cls)
130
 
131
- # Define which models to use for which language
132
- configuration = {
133
- "nlp_engine_name": "spacy",
134
- "models": [
135
- {"lang_code": "en", "model_name": "en_core_web_lg"},
136
- {"lang_code": "fr", "model_name": "fr_core_news_lg"},
137
- ],
138
- }
139
- provider = NlpEngineProvider(nlp_configuration=configuration)
140
- nlp_engine = provider.create_engine()
141
-
142
- cls._instance.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
143
-
144
- ssn_pattern_recognizer = create_ssn_pattern_recognizer()
145
- zip_code_pattern_recognizer = create_zip_code_pattern_recognizer()
146
- street_pattern_recognizer = create_street_pattern_recognizer()
147
- phone_pattern_recognizer = create_phone_pattern_recognizer()
148
-
149
- cls._instance.analyzer.registry.add_recognizer(ssn_pattern_recognizer)
150
- cls._instance.analyzer.registry.add_recognizer(zip_code_pattern_recognizer)
151
- cls._instance.analyzer.registry.add_recognizer(street_pattern_recognizer)
152
- cls._instance.analyzer.registry.add_recognizer(phone_pattern_recognizer)
153
-
154
- cls._instance.anonymizer = AnonymizerEngine()
155
-
156
- # Define standard masking rules
157
- cls._instance.operators = {
158
- "PERSON": OperatorConfig("replace", {"new_value": "a person"}),
159
- "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "an email"}),
160
- "PHONE_NUMBER": OperatorConfig(
161
- "replace", {"new_value": "a phone number"}
162
- ),
163
- "SSN": OperatorConfig(
164
- "replace", {"new_value": "a social security number"}
165
- ),
166
- "CREDIT_CARD": OperatorConfig(
167
- "replace", {"new_value": "a credit card number"}
168
- ),
169
- "LOCATION": OperatorConfig("replace", {"new_value": "a location"}),
170
- "STREET_ADDRESS": OperatorConfig(
171
- "replace", {"new_value": "a location"}
172
- ),
173
- "ZIP_CODE": OperatorConfig("replace", {"new_value": "a location"}),
174
- }
175
- cls._instance.target_entities = list(cls._instance.operators.keys())
176
 
177
  return cls._instance
178
 
179
  def sanitize(self, text: str) -> str:
180
- """Analyzes and redacts PII from the given text."""
181
  if not text:
182
  return text
183
 
184
  text = clean_backslashes(text)
185
-
186
- # Instead of detecting the language of the document,
187
- # we apply PII removal for both language.
188
- # This strategy is more effective and faster.
189
-
190
- # 2. Detect PII in English
191
- results_en = self.analyzer.analyze(
192
- text=text,
193
- entities=self.target_entities,
194
- language="en",
195
- allow_list=self.white_list,
196
- )
197
-
198
- # 3. Redact PII in English
199
- anonymized_result_en = self.anonymizer.anonymize(
200
- text=text,
201
- analyzer_results=results_en, # pyright: ignore[reportArgumentType]
202
- operators=self.operators,
203
- )
204
-
205
- # 4. Detect PII in French
206
- results_fr = self.analyzer.analyze(
207
- text=anonymized_result_en.text,
208
- entities=self.target_entities,
209
- language="fr",
210
- allow_list=self.white_list, # The French analyzer is also too aggressive against French words surprisingly.
211
- )
212
-
213
- # 5. Redact PII in French
214
- anonymized_result_fr = self.anonymizer.anonymize(
215
- text=anonymized_result_en.text,
216
- analyzer_results=results_fr, # pyright: ignore[reportArgumentType]
217
- operators=self.operators,
218
- )
219
-
220
- return anonymized_result_fr.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import re
from typing import Optional

from gliner import GLiNER

logger = logging.getLogger("uvicorn")


# Entity labels requested from the GLiNER model for NER-based PII detection.
# Phone numbers, SSNs and postal codes are handled by the regexes below instead.
LABELS = [
    "email",
    "date_of_birth",
    "last_name",
    "street_address",
]
# Placeholder text substituted for each detected entity label during redaction.
LABELS_PLACEHOLDERS = {
    "email": "an email",
    "phone_number": "a phone number",
    "date_of_birth": "a date of birth",
    "last_name": "a last name",
    "street_address": "a location",
    "ssn": "a ssn",
}

# Nine digits in 3-3-3 groups: 111-111-111, 111 111 111, or 111111111.
RE_SSN = r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b"
# Canadian-style postal code (A1A 1A1 / A1A1A1); the first character class
# excludes the letters that never start a Canadian postal code (D,F,I,O,Q,U,W,Z).
RE_ZIP = (
    r"\b[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ][ ]?\d[ABCEGHJKLMNPRSTVWXYZ]\d\b"
)
# North American phone number with optional country code and "-", ".", or space
# separators.  NOTE(review): no \b anchors — may match inside longer digit runs.
RE_PHONE = r"(?:\+?\d{1,3}[-\s.]?)?\(?\d{3}\)?[-\s.]?\d{3}[-\s.]?\d{4}"
30
+
31
+
32
  def clean_backslashes(txt: str) -> str:
33
  """Cleans backslashes from a string.
34
 
 
45
  return txt.replace("\\'", "'")
46
 
47
 
48
def chunk_text(text: str, max_chars: int = 1000) -> list[tuple[str, int]]:
    """Split *text* into chunks of at most ``max_chars`` characters.

    The text is sometimes too large for the model, so we pass it to the
    model one chunk at a time.  Where possible, a chunk boundary is pulled
    back to the last space inside the window so words are not cut in half.

    Args:
        text: The text to split.
        max_chars: Maximum size of each chunk (roughly 250-300 tokens for
            1000 characters).

    Returns:
        A list of ``(chunk, offset)`` tuples, where ``offset`` is the start
        index of the chunk in ``text``.  The chunks are contiguous and
        concatenate back to exactly ``text``.

    Raises:
        ValueError: If ``max_chars`` is less than 1 (the cursor could never
            advance, causing an infinite loop).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")

    chunks: list[tuple[str, int]] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + max_chars

        # Avoid cutting a word in half: back up to the last space inside the
        # window.  If there is none (or it sits at the window start), keep
        # the hard cut so the cursor still makes progress.
        if end < text_len:
            end = text.rfind(" ", start, end)
            if end <= start:
                end = start + max_chars

        chunks.append((text[start:end], start))
        # Advance the cursor (an overlap could be added here if needed).
        start = end

    return chunks
72
 
73
 
74
class PIIFilter:
    """Singleton PII filter combining the GLiNER NER model with regex rules.

    Detected entities (emails, names, addresses, phone numbers, SSNs, ...)
    are replaced with the generic placeholders from ``LABELS_PLACEHOLDERS``.
    """

    _instance: Optional["PIIFilter"] = None
    # The GLiNER model, loaded once when the singleton is first created.
    # (The previous annotation `model: None` declared the attribute as
    # NoneType, which was incorrect.)
    model: "GLiNER"

    def __new__(cls):
        if cls._instance is None:
            logger.info("Loading the PII filter into memory...")
            cls._instance = super(PIIFilter, cls).__new__(cls)

            # TODO: manual SSN detection
            cls._instance.model = GLiNER.from_pretrained("nvidia/gliner-PII")

        return cls._instance

    @staticmethod
    def _merge_overlapping(entities: list[dict]) -> list[dict]:
        """Merge overlapping spans into their widest envelope.

        ``entities`` must already be sorted by ``start``.  When two spans
        overlap, the resulting span keeps the first span's label.
        """
        if not entities:
            return []
        merged: list[dict] = []
        current = entities[0]
        for nxt in entities[1:]:
            if nxt["start"] < current["end"]:
                # Overlap found: extend the current span to cover both.
                current["end"] = max(current["end"], nxt["end"])
            else:
                merged.append(current)
                current = nxt
        merged.append(current)
        return merged

    def sanitize(self, text: str) -> str:
        """Detect and redact PII in ``text``, returning the redacted copy."""
        if not text:
            return text

        text = clean_backslashes(text)
        all_entities: list[dict] = []

        # 1. Chunk the text for GLiNER (max_chars=1000 keeps each chunk under
        # the model's 384-token limit).  NOTE(review): an entity straddling a
        # chunk boundary can be missed — confirm whether overlap is needed.
        for chunk, offset in chunk_text(text, max_chars=1000):
            for ent in self.model.predict_entities(chunk, LABELS, threshold=0.6):
                all_entities.append(
                    {
                        "start": ent["start"] + offset,
                        "end": ent["end"] + offset,
                        "label": ent["label"],
                    }
                )

        # 2. Add regex-based detections for formats the NER model may miss.
        regex_rules = [
            (RE_SSN, "ssn"),
            (RE_ZIP, "street_address"),
            (RE_PHONE, "phone_number"),
        ]
        for pattern, label in regex_rules:
            for match in re.finditer(pattern, text):
                all_entities.append(
                    {"start": match.start(), "end": match.end(), "label": label}
                )

        # 3. Resolve overlaps: keep the widest span when detections overlap.
        all_entities.sort(key=lambda e: e["start"])
        merged_entities = self._merge_overlapping(all_entities)

        # 4. Replace from the end of the string so earlier indices stay valid.
        redacted_text = text
        for entity in sorted(merged_entities, key=lambda e: e["start"], reverse=True):
            placeholder = LABELS_PLACEHOLDERS[entity["label"]]
            redacted_text = (
                redacted_text[: entity["start"]]
                + placeholder
                + redacted_text[entity["end"] :]
            )

        return redacted_text
helpers/llm_helper.py CHANGED
@@ -138,14 +138,19 @@ def _call_champ(
138
  lang: Literal["en", "fr"],
139
  conversation: List[ChatMessage],
140
  document_contents: List[str] | None,
141
- prompt_template: str | None= None,
142
- ) -> tuple[str, float, dict[str, Any], list[str]]:
143
  tracer = trace.get_tracer(__name__)
144
 
145
  vector_store = _get_vector_store(document_contents)
146
 
147
  with tracer.start_as_current_span("ChampService"):
148
- champ = ChampService(vector_store=vector_store, lang=lang, model_type="champ", prompt_template=prompt_template)
 
 
 
 
 
149
 
150
  with tracer.start_as_current_span("convert_messages_langchain"):
151
  msgs = convert_messages_langchain(conversation)
@@ -164,7 +169,7 @@ def _call_champ(
164
 
165
  return (
166
  reply,
167
- champ_impacts.usage.gwp.value,
168
  triage_meta,
169
  context,
170
  final_token_count,
@@ -192,7 +197,7 @@ def _call_qwen(
192
 
193
  return (
194
  reply,
195
- qwen_impacts.usage.gwp.value,
196
  triage_meta,
197
  context,
198
  n_tokens,
 
138
  lang: Literal["en", "fr"],
139
  conversation: List[ChatMessage],
140
  document_contents: List[str] | None,
141
+ prompt_template: str | None = None,
142
+ ) -> tuple[str, float, dict[str, Any], list[str], int]:
143
  tracer = trace.get_tracer(__name__)
144
 
145
  vector_store = _get_vector_store(document_contents)
146
 
147
  with tracer.start_as_current_span("ChampService"):
148
+ champ = ChampService(
149
+ vector_store=vector_store,
150
+ lang=lang,
151
+ model_type="champ",
152
+ prompt_template=prompt_template,
153
+ )
154
 
155
  with tracer.start_as_current_span("convert_messages_langchain"):
156
  msgs = convert_messages_langchain(conversation)
 
169
 
170
  return (
171
  reply,
172
+ champ_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
173
  triage_meta,
174
  context,
175
  final_token_count,
 
197
 
198
  return (
199
  reply,
200
+ qwen_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
201
  triage_meta,
202
  context,
203
  n_tokens,
main.py CHANGED
@@ -303,7 +303,7 @@ async def chat_endpoint(
303
 
304
  # Endpoint for specific replies/responses
305
  @app.post("/feedback")
306
- @limiter.limit("20/minute")
307
  def feedback_endpoint(
308
  payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
309
  ):
@@ -328,7 +328,7 @@ def feedback_endpoint(
328
 
329
  # Endpoint for specific generic comments
330
  @app.post("/comment")
331
- @limiter.limit("20/minute")
332
  def comment_endpoint(
333
  payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
334
  ):
 
303
 
304
  # Endpoint for specific replies/responses
305
  @app.post("/feedback")
306
+ @limiter.limit("450/minute")
307
  def feedback_endpoint(
308
  payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
309
  ):
 
328
 
329
  # Endpoint for specific generic comments
330
  @app.post("/comment")
331
+ @limiter.limit("450/minute")
332
  def comment_endpoint(
333
  payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
334
  ):
requirements.txt CHANGED
@@ -7,11 +7,9 @@ python-dotenv==1.2.2
7
  opentelemetry-sdk==1.40.0
8
  slowapi==0.1.9
9
  nh3==0.3.3
10
- presidio-analyzer==2.2.362
11
- presidio-anonymizer==2.2.362
12
  boto3==1.42.70
13
  pytz==2026.1.post1
14
- opencv-python==4.13.0.92
15
  PyMuPDF==1.27.2
16
  python-magic==0.4.27
17
  python-magic-bin==0.4.14; sys_platform=='win32'
@@ -28,4 +26,5 @@ opentelemetry-instrumentation==0.61b0
28
  opentelemetry-instrumentation-fastapi==0.61b0
29
  opentelemetry-instrumentation-httpx==0.61b0
30
  python-multipart==0.0.22
31
- tiktoken
 
 
7
  opentelemetry-sdk==1.40.0
8
  slowapi==0.1.9
9
  nh3==0.3.3
 
 
10
  boto3==1.42.70
11
  pytz==2026.1.post1
12
+ opencv-python-headless==4.13.0.92
13
  PyMuPDF==1.27.2
14
  python-magic==0.4.27
15
  python-magic-bin==0.4.14; sys_platform=='win32'
 
26
  opentelemetry-instrumentation-fastapi==0.61b0
27
  opentelemetry-instrumentation-httpx==0.61b0
28
  python-multipart==0.0.22
29
+ tiktoken
30
+ gliner==0.2.26
static/styles/components/consent.css CHANGED
@@ -15,13 +15,15 @@
15
  padding: 0.65rem 0.9rem;
16
  margin: 0.75rem 0 1rem;
17
  color: #791F1F;
18
- font-size: 13px;
19
- font-weight: 500;
20
  }
21
 
22
  .consent-emergency svg {
23
  flex-shrink: 0;
24
  color: #A32D2D;
 
 
25
  }
26
 
27
  .consent-data-note {
 
15
  padding: 0.65rem 0.9rem;
16
  margin: 0.75rem 0 1rem;
17
  color: #791F1F;
18
+ font-size: 16px;
19
+ font-weight: 700;
20
  }
21
 
22
  .consent-emergency svg {
23
  flex-shrink: 0;
24
  color: #A32D2D;
25
+ width: 18px;
26
+ height: 18px;
27
  }
28
 
29
  .consent-data-note {