qyle committed on
Commit
3da1373
·
verified ·
1 Parent(s): 4d7fbb1

pii filter improved

Browse files
Dockerfile CHANGED
@@ -2,24 +2,12 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- libmagic1 \
7
- libxcb1 \
8
- libx11-6 \
9
- libxext6 \
10
- libxrender1 \
11
- libgl1 \
12
- libglib2.0-0 \
13
- libsm6 \
14
- && rm -rf /var/lib/apt/lists/*
15
-
16
  COPY requirements.txt .
17
  COPY pyproject.toml .
18
  RUN pip install uv
19
  RUN uv pip install --no-cache-dir -r requirements.txt --system
20
 
21
- RUN python -m spacy download en_core_web_lg
22
- RUN python -m spacy download fr_core_news_lg
23
 
24
  COPY . .
25
 
 
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies first: this layer changes rarely, so keeping it
# before the requirements COPY preserves the Docker build cache when Python
# dependencies change.  --no-install-recommends and removing the apt lists
# keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libmagic1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
COPY pyproject.toml .
RUN pip install uv
RUN uv pip install --no-cache-dir -r requirements.txt --system

COPY . .
README.md CHANGED
@@ -161,7 +161,7 @@ The test cases are defined in the folder `/tests/stress_tests/`:
161
  #### Chat session test scenario
162
  The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
163
  ```
164
- k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-bot.hf.space/chat
165
  ```
166
  The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
167
 
 
161
  #### Chat session test scenario
162
  The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
163
  ```
164
+ k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-chatbot.hf.space
165
  ```
166
  The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
167
 
classes/pii_filter.py CHANGED
@@ -1,14 +1,34 @@
1
  import logging
2
- from typing import List, Optional
3
- from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
4
- from presidio_analyzer.nlp_engine import NlpEngineProvider
5
- from presidio_anonymizer import AnonymizerEngine
6
- from presidio_anonymizer.entities import OperatorConfig
7
 
8
- # from lingua import Language, LanguageDetector
9
  logger = logging.getLogger("uvicorn")
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def clean_backslashes(txt: str) -> str:
13
  """Cleans backslashes from a string.
14
 
@@ -25,196 +45,103 @@ def clean_backslashes(txt: str) -> str:
25
  return txt.replace("\\'", "'")
26
 
27
 
28
- def create_ssn_pattern_recognizer():
29
- # matches 111-111-111, 111 111 111, and 111111111
30
- ssn_pattern = Pattern(
31
- name="ssn_pattern", regex=r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b", score=0.9
32
- )
33
- fuzzy_sin_pattern = Pattern(
34
- name="fuzzy_sin_pattern",
35
- regex=r"\b[\dlIOS]{3}[- ]?[\dlIOS]{3}[- ]?[\dlIOS]{3}\b",
36
- score=0.8,
37
- )
38
- return PatternRecognizer(
39
- supported_entity="SSN", patterns=[ssn_pattern, fuzzy_sin_pattern]
40
- )
41
-
42
-
43
- def create_zip_code_pattern_recognizer():
44
- zip_code_pattern = Pattern(
45
- name="zip_code_pattern",
46
- regex=r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b", # Matches A1A 1A1 and A1A1A1
47
- score=0.9,
48
- )
49
- fuzzy_zip_code_pattern = Pattern(
50
- name="fuzzy_zip_code_pattern",
51
- regex=r"\b[A-Z][\dlIOS][A-Z]\s?[\dlIOS][A-Z][\dlIOS]\b",
52
- score=0.8,
53
- )
54
- return PatternRecognizer(
55
- supported_entity="ZIP_CODE", patterns=[zip_code_pattern, fuzzy_zip_code_pattern]
56
- )
57
-
58
-
59
- def create_street_pattern_recognizer():
60
- bilingual_street_regex = (
61
- r"\d+\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)"
62
- r"\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+"
63
- r"(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
64
- r"|(?:\d+\s+)?[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
65
- r"\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)\b"
66
- )
67
-
68
- street_pattern = Pattern(
69
- name="street_pattern", regex=bilingual_street_regex, score=0.8
70
- )
71
- return PatternRecognizer(
72
- supported_entity="STREET_ADDRESS", patterns=[street_pattern]
73
- )
74
-
75
-
76
- # The default phone pattern recognizer does not catch some edge cases.
77
- def create_phone_pattern_recognizer():
78
  """
79
- Create a custom phone pattern recognizer to catch additional phone formats.
80
- Matches various North American phone formats:
81
- - 123-456-7890 (with dashes)
82
- - 123 456 7890 (with spaces)
83
- - (123) 456-7890 (with parentheses)
84
- - (123) 456 7890 (with parentheses and spaces)
85
- - +1-123-456-7890 (with country code and dashes)
86
- - +1 (123) 456-7890 (with country code, parentheses, and dashes)
87
- - +1 123 456 7890 (with country code and spaces)
88
  """
89
- phone_pattern = Pattern(
90
- name="phone_pattern",
91
- regex=r"(?:\+\d{1,3}[-\s]?)?\(?(?:\d{3})\)?[-\s]?\d{3}[-\s]?\d{4}",
92
- score=0.9,
93
- )
94
- fuzzy_phone_pattern = Pattern(
95
- name="fuzzy_phone_pattern",
96
- regex=r"(?:\+[\dlIOS]{1,3}[-\s]?)?\(?(?:[\dlIOS]{3})\)?[-\s]?[\dlIOS]{3}[-\s]?[\dlIOS]{4}",
97
- score=0.8,
98
- )
99
- return PatternRecognizer(
100
- supported_entity="PHONE_NUMBER", patterns=[phone_pattern, fuzzy_phone_pattern]
101
- )
 
 
 
 
 
 
102
 
103
 
104
  class PIIFilter:
105
  _instance: Optional["PIIFilter"] = None
106
- analyzer: AnalyzerEngine
107
- anonymizer: AnonymizerEngine
108
- operators: dict
109
- target_entities: List[str]
110
- white_list = [
111
- "salut",
112
- "bonjour",
113
- "comment",
114
- "fort", # Par exemple, "Il tousse fort".
115
- "Salut",
116
- "Bonjour",
117
- "Comment",
118
- "fievre",
119
- "fièvre",
120
- "Fievre",
121
- "Fièvre",
122
- "tu",
123
- "Tu",
124
- ]
125
 
126
  def __new__(cls):
127
  if cls._instance is None:
128
- logger.info("Loading the prompt sanitizer into memory...")
129
  cls._instance = super(PIIFilter, cls).__new__(cls)
130
 
131
- # Define which models to use for which language
132
- configuration = {
133
- "nlp_engine_name": "spacy",
134
- "models": [
135
- {"lang_code": "en", "model_name": "en_core_web_lg"},
136
- {"lang_code": "fr", "model_name": "fr_core_news_lg"},
137
- ],
138
- }
139
- provider = NlpEngineProvider(nlp_configuration=configuration)
140
- nlp_engine = provider.create_engine()
141
-
142
- cls._instance.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
143
-
144
- ssn_pattern_recognizer = create_ssn_pattern_recognizer()
145
- zip_code_pattern_recognizer = create_zip_code_pattern_recognizer()
146
- street_pattern_recognizer = create_street_pattern_recognizer()
147
- phone_pattern_recognizer = create_phone_pattern_recognizer()
148
-
149
- cls._instance.analyzer.registry.add_recognizer(ssn_pattern_recognizer)
150
- cls._instance.analyzer.registry.add_recognizer(zip_code_pattern_recognizer)
151
- cls._instance.analyzer.registry.add_recognizer(street_pattern_recognizer)
152
- cls._instance.analyzer.registry.add_recognizer(phone_pattern_recognizer)
153
-
154
- cls._instance.anonymizer = AnonymizerEngine()
155
-
156
- # Define standard masking rules
157
- cls._instance.operators = {
158
- "PERSON": OperatorConfig("replace", {"new_value": "a person"}),
159
- "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "an email"}),
160
- "PHONE_NUMBER": OperatorConfig(
161
- "replace", {"new_value": "a phone number"}
162
- ),
163
- "SSN": OperatorConfig(
164
- "replace", {"new_value": "a social security number"}
165
- ),
166
- "CREDIT_CARD": OperatorConfig(
167
- "replace", {"new_value": "a credit card number"}
168
- ),
169
- "LOCATION": OperatorConfig("replace", {"new_value": "a location"}),
170
- "STREET_ADDRESS": OperatorConfig(
171
- "replace", {"new_value": "a location"}
172
- ),
173
- "ZIP_CODE": OperatorConfig("replace", {"new_value": "a location"}),
174
- }
175
- cls._instance.target_entities = list(cls._instance.operators.keys())
176
 
177
  return cls._instance
178
 
179
  def sanitize(self, text: str) -> str:
180
- """Analyzes and redacts PII from the given text."""
181
  if not text:
182
  return text
183
 
184
  text = clean_backslashes(text)
185
-
186
- # Instead of detecting the language of the document,
187
- # we apply PII removal for both language.
188
- # This strategy is more effective and faster.
189
-
190
- # 2. Detect PII in English
191
- results_en = self.analyzer.analyze(
192
- text=text,
193
- entities=self.target_entities,
194
- language="en",
195
- allow_list=self.white_list,
196
- )
197
-
198
- # 3. Redact PII in English
199
- anonymized_result_en = self.anonymizer.anonymize(
200
- text=text,
201
- analyzer_results=results_en, # pyright: ignore[reportArgumentType]
202
- operators=self.operators,
203
- )
204
-
205
- # 4. Detect PII in French
206
- results_fr = self.analyzer.analyze(
207
- text=anonymized_result_en.text,
208
- entities=self.target_entities,
209
- language="fr",
210
- allow_list=self.white_list, # The French analyzer is also too aggressive against French words surprisingly.
211
- )
212
-
213
- # 5. Redact PII in French
214
- anonymized_result_fr = self.anonymizer.anonymize(
215
- text=anonymized_result_en.text,
216
- analyzer_results=results_fr, # pyright: ignore[reportArgumentType]
217
- operators=self.operators,
218
- )
219
-
220
- return anonymized_result_fr.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import re
from typing import Optional

from gliner import GLiNER

logger = logging.getLogger("uvicorn")


# Entity labels requested from the GLiNER model for NER-based PII detection.
# Phone numbers, SSNs and postal codes are handled by the regexes below instead.
LABELS = [
    "email",
    "date_of_birth",
    "last_name",
    "street_address",
]
# Placeholder text substituted for each detected entity label during redaction.
LABELS_PLACEHOLDERS = {
    "email": "an email",
    "phone_number": "a phone number",
    "date_of_birth": "a date of birth",
    "last_name": "a last name",
    "street_address": "a location",
    "ssn": "a ssn",
}

# Nine digits in 3-3-3 groups: 111-111-111, 111 111 111, or 111111111.
RE_SSN = r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b"
# Canadian-style postal code (A1A 1A1 / A1A1A1); the first character class
# excludes the letters that never start a Canadian postal code (D,F,I,O,Q,U,W,Z).
RE_ZIP = (
    r"\b[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ][ ]?\d[ABCEGHJKLMNPRSTVWXYZ]\d\b"
)
# North American phone number with optional country code and "-", ".", or space
# separators.  NOTE(review): no \b anchors — may match inside longer digit runs.
RE_PHONE = r"(?:\+?\d{1,3}[-\s.]?)?\(?\d{3}\)?[-\s.]?\d{3}[-\s.]?\d{4}"
30
+
31
+
32
  def clean_backslashes(txt: str) -> str:
33
  """Cleans backslashes from a string.
34
 
 
45
  return txt.replace("\\'", "'")
46
 
47
 
48
def chunk_text(text: str, max_chars: int = 1000) -> list[tuple[str, int]]:
    """Split *text* into chunks of at most ``max_chars`` characters.

    The text is sometimes too large for the model, so we pass it to the
    model one chunk at a time.  Where possible, a chunk boundary is pulled
    back to the last space inside the window so words are not cut in half.

    Args:
        text: The text to split.
        max_chars: Maximum size of each chunk (roughly 250-300 tokens for
            1000 characters).

    Returns:
        A list of ``(chunk, offset)`` tuples, where ``offset`` is the start
        index of the chunk in ``text``.  The chunks are contiguous and
        concatenate back to exactly ``text``.

    Raises:
        ValueError: If ``max_chars`` is less than 1 (the cursor could never
            advance, causing an infinite loop).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")

    chunks: list[tuple[str, int]] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + max_chars

        # Avoid cutting a word in half: back up to the last space inside the
        # window.  If there is none (or it sits at the window start), keep
        # the hard cut so the cursor still makes progress.
        if end < text_len:
            end = text.rfind(" ", start, end)
            if end <= start:
                end = start + max_chars

        chunks.append((text[start:end], start))
        # Advance the cursor (an overlap could be added here if needed).
        start = end

    return chunks
72
 
73
 
74
class PIIFilter:
    """Singleton PII filter combining the GLiNER NER model with regex rules.

    Detected entities (emails, names, addresses, phone numbers, SSNs, ...)
    are replaced with the generic placeholders from ``LABELS_PLACEHOLDERS``.
    """

    _instance: Optional["PIIFilter"] = None
    # The GLiNER model, loaded once when the singleton is first created.
    # (The previous annotation `model: None` declared the attribute as
    # NoneType, which was incorrect.)
    model: "GLiNER"

    def __new__(cls):
        if cls._instance is None:
            logger.info("Loading the PII filter into memory...")
            cls._instance = super(PIIFilter, cls).__new__(cls)

            # TODO: manual SSN detection
            cls._instance.model = GLiNER.from_pretrained("nvidia/gliner-PII")

        return cls._instance

    @staticmethod
    def _merge_overlapping(entities: list[dict]) -> list[dict]:
        """Merge overlapping spans into their widest envelope.

        ``entities`` must already be sorted by ``start``.  When two spans
        overlap, the resulting span keeps the first span's label.
        """
        if not entities:
            return []
        merged: list[dict] = []
        current = entities[0]
        for nxt in entities[1:]:
            if nxt["start"] < current["end"]:
                # Overlap found: extend the current span to cover both.
                current["end"] = max(current["end"], nxt["end"])
            else:
                merged.append(current)
                current = nxt
        merged.append(current)
        return merged

    def sanitize(self, text: str) -> str:
        """Detect and redact PII in ``text``, returning the redacted copy."""
        if not text:
            return text

        text = clean_backslashes(text)
        all_entities: list[dict] = []

        # 1. Chunk the text for GLiNER (max_chars=1000 keeps each chunk under
        # the model's 384-token limit).  NOTE(review): an entity straddling a
        # chunk boundary can be missed — confirm whether overlap is needed.
        for chunk, offset in chunk_text(text, max_chars=1000):
            for ent in self.model.predict_entities(chunk, LABELS, threshold=0.6):
                all_entities.append(
                    {
                        "start": ent["start"] + offset,
                        "end": ent["end"] + offset,
                        "label": ent["label"],
                    }
                )

        # 2. Add regex-based detections for formats the NER model may miss.
        regex_rules = [
            (RE_SSN, "ssn"),
            (RE_ZIP, "street_address"),
            (RE_PHONE, "phone_number"),
        ]
        for pattern, label in regex_rules:
            for match in re.finditer(pattern, text):
                all_entities.append(
                    {"start": match.start(), "end": match.end(), "label": label}
                )

        # 3. Resolve overlaps: keep the widest span when detections overlap.
        all_entities.sort(key=lambda e: e["start"])
        merged_entities = self._merge_overlapping(all_entities)

        # 4. Replace from the end of the string so earlier indices stay valid.
        redacted_text = text
        for entity in sorted(merged_entities, key=lambda e: e["start"], reverse=True):
            placeholder = LABELS_PLACEHOLDERS[entity["label"]]
            redacted_text = (
                redacted_text[: entity["start"]]
                + placeholder
                + redacted_text[entity["end"] :]
            )

        return redacted_text
helpers/llm_helper.py CHANGED
@@ -138,14 +138,19 @@ def _call_champ(
138
  lang: Literal["en", "fr"],
139
  conversation: List[ChatMessage],
140
  document_contents: List[str] | None,
141
- prompt_template: str | None= None,
142
- ) -> tuple[str, float, dict[str, Any], list[str]]:
143
  tracer = trace.get_tracer(__name__)
144
 
145
  vector_store = _get_vector_store(document_contents)
146
 
147
  with tracer.start_as_current_span("ChampService"):
148
- champ = ChampService(vector_store=vector_store, lang=lang, model_type="champ", prompt_template=prompt_template)
 
 
 
 
 
149
 
150
  with tracer.start_as_current_span("convert_messages_langchain"):
151
  msgs = convert_messages_langchain(conversation)
@@ -164,7 +169,7 @@ def _call_champ(
164
 
165
  return (
166
  reply,
167
- champ_impacts.usage.gwp.value,
168
  triage_meta,
169
  context,
170
  final_token_count,
@@ -192,7 +197,7 @@ def _call_qwen(
192
 
193
  return (
194
  reply,
195
- qwen_impacts.usage.gwp.value,
196
  triage_meta,
197
  context,
198
  n_tokens,
 
138
  lang: Literal["en", "fr"],
139
  conversation: List[ChatMessage],
140
  document_contents: List[str] | None,
141
+ prompt_template: str | None = None,
142
+ ) -> tuple[str, float, dict[str, Any], list[str], int]:
143
  tracer = trace.get_tracer(__name__)
144
 
145
  vector_store = _get_vector_store(document_contents)
146
 
147
  with tracer.start_as_current_span("ChampService"):
148
+ champ = ChampService(
149
+ vector_store=vector_store,
150
+ lang=lang,
151
+ model_type="champ",
152
+ prompt_template=prompt_template,
153
+ )
154
 
155
  with tracer.start_as_current_span("convert_messages_langchain"):
156
  msgs = convert_messages_langchain(conversation)
 
169
 
170
  return (
171
  reply,
172
+ champ_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
173
  triage_meta,
174
  context,
175
  final_token_count,
 
197
 
198
  return (
199
  reply,
200
+ qwen_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
201
  triage_meta,
202
  context,
203
  n_tokens,
main.py CHANGED
@@ -303,7 +303,7 @@ async def chat_endpoint(
303
 
304
  # Endpoint for specific replies/responses
305
  @app.post("/feedback")
306
- @limiter.limit("20/minute")
307
  def feedback_endpoint(
308
  payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
309
  ):
@@ -328,7 +328,7 @@ def feedback_endpoint(
328
 
329
  # Endpoint for specific generic comments
330
  @app.post("/comment")
331
- @limiter.limit("20/minute")
332
  def comment_endpoint(
333
  payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
334
  ):
 
303
 
304
  # Endpoint for specific replies/responses
305
  @app.post("/feedback")
306
+ @limiter.limit("450/minute")
307
  def feedback_endpoint(
308
  payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
309
  ):
 
328
 
329
  # Endpoint for specific generic comments
330
  @app.post("/comment")
331
+ @limiter.limit("450/minute")
332
  def comment_endpoint(
333
  payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
334
  ):
requirements.txt CHANGED
@@ -7,11 +7,9 @@ python-dotenv==1.2.2
7
  opentelemetry-sdk==1.40.0
8
  slowapi==0.1.9
9
  nh3==0.3.3
10
- presidio-analyzer==2.2.362
11
- presidio-anonymizer==2.2.362
12
  boto3==1.42.70
13
  pytz==2026.1.post1
14
- opencv-python==4.13.0.92
15
  PyMuPDF==1.27.2
16
  python-magic==0.4.27
17
  python-magic-bin==0.4.14; sys_platform=='win32'
@@ -28,4 +26,5 @@ opentelemetry-instrumentation==0.61b0
28
  opentelemetry-instrumentation-fastapi==0.61b0
29
  opentelemetry-instrumentation-httpx==0.61b0
30
  python-multipart==0.0.22
31
- tiktoken
 
 
7
  opentelemetry-sdk==1.40.0
8
  slowapi==0.1.9
9
  nh3==0.3.3
 
 
10
  boto3==1.42.70
11
  pytz==2026.1.post1
12
+ opencv-python-headless==4.13.0.92
13
  PyMuPDF==1.27.2
14
  python-magic==0.4.27
15
  python-magic-bin==0.4.14; sys_platform=='win32'
 
26
  opentelemetry-instrumentation-fastapi==0.61b0
27
  opentelemetry-instrumentation-httpx==0.61b0
28
  python-multipart==0.0.22
29
+ tiktoken
30
+ gliner==0.2.26
static/styles/components/consent.css CHANGED
@@ -15,13 +15,15 @@
15
  padding: 0.65rem 0.9rem;
16
  margin: 0.75rem 0 1rem;
17
  color: #791F1F;
18
- font-size: 13px;
19
- font-weight: 500;
20
  }
21
 
22
  .consent-emergency svg {
23
  flex-shrink: 0;
24
  color: #A32D2D;
 
 
25
  }
26
 
27
  .consent-data-note {
 
15
  padding: 0.65rem 0.9rem;
16
  margin: 0.75rem 0 1rem;
17
  color: #791F1F;
18
+ font-size: 16px;
19
+ font-weight: 700;
20
  }
21
 
22
  .consent-emergency svg {
23
  flex-shrink: 0;
24
  color: #A32D2D;
25
+ width: 18px;
26
+ height: 18px;
27
  }
28
 
29
  .consent-data-note {