Spaces:
Paused
Paused
pii filter improved
Browse files- Dockerfile +1 -13
- README.md +1 -1
- classes/pii_filter.py +107 -180
- helpers/llm_helper.py +10 -5
- main.py +2 -2
- requirements.txt +3 -4
- static/styles/components/consent.css +4 -2
Dockerfile
CHANGED
|
@@ -2,24 +2,12 @@ FROM python:3.11-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
libmagic1 \
|
| 7 |
-
libxcb1 \
|
| 8 |
-
libx11-6 \
|
| 9 |
-
libxext6 \
|
| 10 |
-
libxrender1 \
|
| 11 |
-
libgl1 \
|
| 12 |
-
libglib2.0-0 \
|
| 13 |
-
libsm6 \
|
| 14 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
-
|
| 16 |
COPY requirements.txt .
|
| 17 |
COPY pyproject.toml .
|
| 18 |
RUN pip install uv
|
| 19 |
RUN uv pip install --no-cache-dir -r requirements.txt --system
|
| 20 |
|
| 21 |
-
RUN
|
| 22 |
-
RUN python -m spacy download fr_core_news_lg
|
| 23 |
|
| 24 |
COPY . .
|
| 25 |
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
COPY requirements.txt .
|
| 6 |
COPY pyproject.toml .
|
| 7 |
RUN pip install uv
|
| 8 |
RUN uv pip install --no-cache-dir -r requirements.txt --system
|
| 9 |
|
| 10 |
+
RUN apt-get update && apt-get install -y libmagic1
|
|
|
|
| 11 |
|
| 12 |
COPY . .
|
| 13 |
|
README.md
CHANGED
|
@@ -161,7 +161,7 @@ The test cases are defined in the folder `/tests/stress_tests/`:
|
|
| 161 |
#### Chat session test scenario
|
| 162 |
The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
|
| 163 |
```
|
| 164 |
-
k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-
|
| 165 |
```
|
| 166 |
The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
|
| 167 |
|
|
|
|
| 161 |
#### Chat session test scenario
|
| 162 |
The chat session scenario must be run by specifying the model type and the URL of the server. For example, the following command simulates 150 users making three requests at `https://<username>-champ-chatbot.hf.space` to the model `champ`:
|
| 163 |
```
|
| 164 |
+
k6 run chat_session.js -e MODEL_TYPE=champ -e URL=https://<username>-champ-chatbot.hf.space
|
| 165 |
```
|
| 166 |
The possible values for `MODEL_TYPE` are `champ`, `google`, and `openai`.
|
| 167 |
|
classes/pii_filter.py
CHANGED
|
@@ -1,14 +1,34 @@
|
|
| 1 |
import logging
|
| 2 |
-
|
| 3 |
-
from
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
from presidio_anonymizer.entities import OperatorConfig
|
| 7 |
|
| 8 |
-
# from lingua import Language, LanguageDetector
|
| 9 |
logger = logging.getLogger("uvicorn")
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def clean_backslashes(txt: str) -> str:
|
| 13 |
"""Cleans backslashes from a string.
|
| 14 |
|
|
@@ -25,196 +45,103 @@ def clean_backslashes(txt: str) -> str:
|
|
| 25 |
return txt.replace("\\'", "'")
|
| 26 |
|
| 27 |
|
| 28 |
-
def
|
| 29 |
-
# matches 111-111-111, 111 111 111, and 111111111
|
| 30 |
-
ssn_pattern = Pattern(
|
| 31 |
-
name="ssn_pattern", regex=r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b", score=0.9
|
| 32 |
-
)
|
| 33 |
-
fuzzy_sin_pattern = Pattern(
|
| 34 |
-
name="fuzzy_sin_pattern",
|
| 35 |
-
regex=r"\b[\dlIOS]{3}[- ]?[\dlIOS]{3}[- ]?[\dlIOS]{3}\b",
|
| 36 |
-
score=0.8,
|
| 37 |
-
)
|
| 38 |
-
return PatternRecognizer(
|
| 39 |
-
supported_entity="SSN", patterns=[ssn_pattern, fuzzy_sin_pattern]
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def create_zip_code_pattern_recognizer():
|
| 44 |
-
zip_code_pattern = Pattern(
|
| 45 |
-
name="zip_code_pattern",
|
| 46 |
-
regex=r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b", # Matches A1A 1A1 and A1A1A1
|
| 47 |
-
score=0.9,
|
| 48 |
-
)
|
| 49 |
-
fuzzy_zip_code_pattern = Pattern(
|
| 50 |
-
name="fuzzy_zip_code_pattern",
|
| 51 |
-
regex=r"\b[A-Z][\dlIOS][A-Z]\s?[\dlIOS][A-Z][\dlIOS]\b",
|
| 52 |
-
score=0.8,
|
| 53 |
-
)
|
| 54 |
-
return PatternRecognizer(
|
| 55 |
-
supported_entity="ZIP_CODE", patterns=[zip_code_pattern, fuzzy_zip_code_pattern]
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def create_street_pattern_recognizer():
|
| 60 |
-
bilingual_street_regex = (
|
| 61 |
-
r"\d+\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)"
|
| 62 |
-
r"\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+"
|
| 63 |
-
r"(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
|
| 64 |
-
r"|(?:\d+\s+)?[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
|
| 65 |
-
r"\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)\b"
|
| 66 |
-
)
|
| 67 |
-
|
| 68 |
-
street_pattern = Pattern(
|
| 69 |
-
name="street_pattern", regex=bilingual_street_regex, score=0.8
|
| 70 |
-
)
|
| 71 |
-
return PatternRecognizer(
|
| 72 |
-
supported_entity="STREET_ADDRESS", patterns=[street_pattern]
|
| 73 |
-
)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# The default phone pattern recognizer does not catch some edge cases.
|
| 77 |
-
def create_phone_pattern_recognizer():
|
| 78 |
"""
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
- 123-456-7890 (with dashes)
|
| 82 |
-
- 123 456 7890 (with spaces)
|
| 83 |
-
- (123) 456-7890 (with parentheses)
|
| 84 |
-
- (123) 456 7890 (with parentheses and spaces)
|
| 85 |
-
- +1-123-456-7890 (with country code and dashes)
|
| 86 |
-
- +1 (123) 456-7890 (with country code, parentheses, and dashes)
|
| 87 |
-
- +1 123 456 7890 (with country code and spaces)
|
| 88 |
"""
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
class PIIFilter:
|
| 105 |
_instance: Optional["PIIFilter"] = None
|
| 106 |
-
|
| 107 |
-
anonymizer: AnonymizerEngine
|
| 108 |
-
operators: dict
|
| 109 |
-
target_entities: List[str]
|
| 110 |
-
white_list = [
|
| 111 |
-
"salut",
|
| 112 |
-
"bonjour",
|
| 113 |
-
"comment",
|
| 114 |
-
"fort", # Par exemple, "Il tousse fort".
|
| 115 |
-
"Salut",
|
| 116 |
-
"Bonjour",
|
| 117 |
-
"Comment",
|
| 118 |
-
"fievre",
|
| 119 |
-
"fièvre",
|
| 120 |
-
"Fievre",
|
| 121 |
-
"Fièvre",
|
| 122 |
-
"tu",
|
| 123 |
-
"Tu",
|
| 124 |
-
]
|
| 125 |
|
| 126 |
def __new__(cls):
|
| 127 |
if cls._instance is None:
|
| 128 |
-
logger.info("Loading the
|
| 129 |
cls._instance = super(PIIFilter, cls).__new__(cls)
|
| 130 |
|
| 131 |
-
#
|
| 132 |
-
|
| 133 |
-
"nlp_engine_name": "spacy",
|
| 134 |
-
"models": [
|
| 135 |
-
{"lang_code": "en", "model_name": "en_core_web_lg"},
|
| 136 |
-
{"lang_code": "fr", "model_name": "fr_core_news_lg"},
|
| 137 |
-
],
|
| 138 |
-
}
|
| 139 |
-
provider = NlpEngineProvider(nlp_configuration=configuration)
|
| 140 |
-
nlp_engine = provider.create_engine()
|
| 141 |
-
|
| 142 |
-
cls._instance.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
|
| 143 |
-
|
| 144 |
-
ssn_pattern_recognizer = create_ssn_pattern_recognizer()
|
| 145 |
-
zip_code_pattern_recognizer = create_zip_code_pattern_recognizer()
|
| 146 |
-
street_pattern_recognizer = create_street_pattern_recognizer()
|
| 147 |
-
phone_pattern_recognizer = create_phone_pattern_recognizer()
|
| 148 |
-
|
| 149 |
-
cls._instance.analyzer.registry.add_recognizer(ssn_pattern_recognizer)
|
| 150 |
-
cls._instance.analyzer.registry.add_recognizer(zip_code_pattern_recognizer)
|
| 151 |
-
cls._instance.analyzer.registry.add_recognizer(street_pattern_recognizer)
|
| 152 |
-
cls._instance.analyzer.registry.add_recognizer(phone_pattern_recognizer)
|
| 153 |
-
|
| 154 |
-
cls._instance.anonymizer = AnonymizerEngine()
|
| 155 |
-
|
| 156 |
-
# Define standard masking rules
|
| 157 |
-
cls._instance.operators = {
|
| 158 |
-
"PERSON": OperatorConfig("replace", {"new_value": "a person"}),
|
| 159 |
-
"EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "an email"}),
|
| 160 |
-
"PHONE_NUMBER": OperatorConfig(
|
| 161 |
-
"replace", {"new_value": "a phone number"}
|
| 162 |
-
),
|
| 163 |
-
"SSN": OperatorConfig(
|
| 164 |
-
"replace", {"new_value": "a social security number"}
|
| 165 |
-
),
|
| 166 |
-
"CREDIT_CARD": OperatorConfig(
|
| 167 |
-
"replace", {"new_value": "a credit card number"}
|
| 168 |
-
),
|
| 169 |
-
"LOCATION": OperatorConfig("replace", {"new_value": "a location"}),
|
| 170 |
-
"STREET_ADDRESS": OperatorConfig(
|
| 171 |
-
"replace", {"new_value": "a location"}
|
| 172 |
-
),
|
| 173 |
-
"ZIP_CODE": OperatorConfig("replace", {"new_value": "a location"}),
|
| 174 |
-
}
|
| 175 |
-
cls._instance.target_entities = list(cls._instance.operators.keys())
|
| 176 |
|
| 177 |
return cls._instance
|
| 178 |
|
| 179 |
def sanitize(self, text: str) -> str:
|
| 180 |
-
"""Analyzes and redacts PII from the given text."""
|
| 181 |
if not text:
|
| 182 |
return text
|
| 183 |
|
| 184 |
text = clean_backslashes(text)
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
#
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import re
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from gliner import GLiNER
|
|
|
|
| 6 |
|
|
|
|
| 7 |
logger = logging.getLogger("uvicorn")
|
| 8 |
|
| 9 |
|
| 10 |
+
# Entity labels requested from the GLiNER model for every chunk of text.
LABELS = [
    "email",
    "date_of_birth",
    "last_name",
    "street_address",
]

# Replacement text inserted in place of a detected entity, keyed by label.
# Covers the model labels above plus the labels produced only by the regexes
# below ("phone_number", "ssn").
LABELS_PLACEHOLDERS = {
    "email": "an email",
    "phone_number": "a phone number",
    "date_of_birth": "a date of birth",
    "last_name": "a last name",
    "street_address": "a location",
    "ssn": "a ssn",
}

# Nine digits in groups of three: 111-111-111, 111 111 111 or 111111111.
RE_SSN = r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b"

# Postal code in the A1A 1A1 / A1A1A1 format (Canadian-style letter sets).
# The leading (?i) makes the match case-insensitive: users frequently type
# postal codes in lowercase, and an uppercase-only pattern lets them through.
RE_ZIP = (
    r"(?i)\b[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ][ ]?\d[ABCEGHJKLMNPRSTVWXYZ]\d\b"
)

# Ten-digit phone number with an optional country code, optional parentheses
# around the area code, and "-", "." or whitespace as separators.
RE_PHONE = r"(?:\+?\d{1,3}[-\s.]?)?\(?\d{3}\)?[-\s.]?\d{3}[-\s.]?\d{4}"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
def clean_backslashes(txt: str) -> str:
|
| 33 |
"""Cleans backslashes from a string.
|
| 34 |
|
|
|
|
| 45 |
return txt.replace("\\'", "'")
|
| 46 |
|
| 47 |
|
| 48 |
+
def chunk_text(text: str, max_chars: int = 1000) -> list[tuple[str, int]]:
    """Split ``text`` into consecutive chunks of at most ``max_chars`` characters.

    The text is sometimes too large for the model, so it is chunked here and
    each chunk can be passed to the model one by one. Chunks are contiguous
    and non-overlapping: concatenating them in order reproduces ``text``.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters (must be positive).

    Returns:
        A list of ``(chunk, start_offset)`` tuples, where ``start_offset`` is
        the index of the chunk's first character in ``text``. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is not positive (the cursor could never
            advance, so the loop would not terminate).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks: list[tuple[str, int]] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + max_chars, text_len)

        # To avoid cutting a word in half, back up to the last space inside
        # the window. If there is none (or it is the window's first
        # character), fall back to a hard cut at max_chars.
        if end < text_len:
            split_at = text.rfind(" ", start, end)
            if split_at > start:
                end = split_at

        chunks.append((text[start:end], start))
        # Advance the cursor (an overlap could be introduced here if needed).
        start = end

    return chunks
|
| 72 |
|
| 73 |
|
| 74 |
class PIIFilter:
    """Singleton that redacts personally identifiable information from text.

    Detection combines two sources: the GLiNER model (queried with ``LABELS``)
    and the module-level regular expressions (SSN, postal code, phone number).
    Every detected span is replaced by the placeholder registered for its
    label in ``LABELS_PLACEHOLDERS``.
    """

    _instance: Optional["PIIFilter"] = None
    # GLiNER model, loaded once when the singleton is first created.
    model: "GLiNER"

    def __new__(cls):
        """Return the shared instance, loading the model on first use."""
        if cls._instance is None:
            logger.info("Loading the PII filter into memory...")
            instance = super().__new__(cls)
            instance.model = GLiNER.from_pretrained("nvidia/gliner-PII")
            # Publish the singleton only once the model is loaded, so a failed
            # load is retried on the next call instead of leaving behind an
            # instance that has no ``model`` attribute.
            cls._instance = instance

        return cls._instance

    def sanitize(self, text: str) -> str:
        """Return ``text`` with every detected PII span replaced by a placeholder.

        Falsy input (``None`` or an empty string) is returned unchanged.
        """
        if not text:
            return text

        text = clean_backslashes(text)
        # Model detections come first so that, on equal start offsets, the
        # stable sort in _merge_overlaps keeps the model's label.
        entities = self._detect_with_model(text) + self._detect_with_regex(text)
        return self._redact(text, self._merge_overlaps(entities))

    def _detect_with_model(self, text: str) -> list[dict]:
        """Run the GLiNER model chunk by chunk and return spans in ``text`` coordinates."""
        entities = []
        # 1000 characters per chunk, intended to stay under the model's
        # 384-token limit.
        for chunk, offset in chunk_text(text, max_chars=1000):
            for ent in self.model.predict_entities(chunk, LABELS, threshold=0.6):
                entities.append(
                    {
                        # Shift chunk-relative offsets back to the full text.
                        "start": ent["start"] + offset,
                        "end": ent["end"] + offset,
                        "label": ent["label"],
                    }
                )
        return entities

    @staticmethod
    def _detect_with_regex(text: str) -> list[dict]:
        """Return the spans matched by the SSN, postal-code and phone patterns."""
        regex_rules = [
            (RE_SSN, "ssn"),
            (RE_ZIP, "street_address"),
            (RE_PHONE, "phone_number"),
        ]
        return [
            {"start": match.start(), "end": match.end(), "label": label}
            for pattern, label in regex_rules
            for match in re.finditer(pattern, text)
        ]

    @staticmethod
    def _merge_overlaps(entities: list[dict]) -> list[dict]:
        """Merge overlapping spans into their envelope, keeping the first span's label.

        The result is sorted by start offset and contains no overlapping spans.
        """
        merged: list[dict] = []
        current = None
        for ent in sorted(entities, key=lambda e: e["start"]):
            if current is not None and ent["start"] < current["end"]:
                # Overlap: extend the current span to cover both. The label of
                # the earlier span wins; a priority rule could be added here.
                current["end"] = max(current["end"], ent["end"])
                continue
            if current is not None:
                merged.append(current)
            current = ent
        if current is not None:
            merged.append(current)
        return merged

    @staticmethod
    def _redact(text: str, entities: list[dict]) -> str:
        """Replace each span with its placeholder.

        ``entities`` must be sorted by start offset and non-overlapping.
        """
        parts = []
        cursor = 0
        # Single left-to-right pass: copy the untouched text between spans and
        # substitute the placeholder for each span.
        for ent in entities:
            parts.append(text[cursor : ent["start"]])
            parts.append(LABELS_PLACEHOLDERS[ent["label"]])
            cursor = ent["end"]
        parts.append(text[cursor:])
        return "".join(parts)
|
helpers/llm_helper.py
CHANGED
|
@@ -138,14 +138,19 @@ def _call_champ(
|
|
| 138 |
lang: Literal["en", "fr"],
|
| 139 |
conversation: List[ChatMessage],
|
| 140 |
document_contents: List[str] | None,
|
| 141 |
-
prompt_template: str | None= None,
|
| 142 |
-
) -> tuple[str, float, dict[str, Any], list[str]]:
|
| 143 |
tracer = trace.get_tracer(__name__)
|
| 144 |
|
| 145 |
vector_store = _get_vector_store(document_contents)
|
| 146 |
|
| 147 |
with tracer.start_as_current_span("ChampService"):
|
| 148 |
-
champ = ChampService(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
with tracer.start_as_current_span("convert_messages_langchain"):
|
| 151 |
msgs = convert_messages_langchain(conversation)
|
|
@@ -164,7 +169,7 @@ def _call_champ(
|
|
| 164 |
|
| 165 |
return (
|
| 166 |
reply,
|
| 167 |
-
champ_impacts.usage.gwp.value,
|
| 168 |
triage_meta,
|
| 169 |
context,
|
| 170 |
final_token_count,
|
|
@@ -192,7 +197,7 @@ def _call_qwen(
|
|
| 192 |
|
| 193 |
return (
|
| 194 |
reply,
|
| 195 |
-
qwen_impacts.usage.gwp.value,
|
| 196 |
triage_meta,
|
| 197 |
context,
|
| 198 |
n_tokens,
|
|
|
|
| 138 |
lang: Literal["en", "fr"],
|
| 139 |
conversation: List[ChatMessage],
|
| 140 |
document_contents: List[str] | None,
|
| 141 |
+
prompt_template: str | None = None,
|
| 142 |
+
) -> tuple[str, float, dict[str, Any], list[str], int]:
|
| 143 |
tracer = trace.get_tracer(__name__)
|
| 144 |
|
| 145 |
vector_store = _get_vector_store(document_contents)
|
| 146 |
|
| 147 |
with tracer.start_as_current_span("ChampService"):
|
| 148 |
+
champ = ChampService(
|
| 149 |
+
vector_store=vector_store,
|
| 150 |
+
lang=lang,
|
| 151 |
+
model_type="champ",
|
| 152 |
+
prompt_template=prompt_template,
|
| 153 |
+
)
|
| 154 |
|
| 155 |
with tracer.start_as_current_span("convert_messages_langchain"):
|
| 156 |
msgs = convert_messages_langchain(conversation)
|
|
|
|
| 169 |
|
| 170 |
return (
|
| 171 |
reply,
|
| 172 |
+
champ_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
|
| 173 |
triage_meta,
|
| 174 |
context,
|
| 175 |
final_token_count,
|
|
|
|
| 197 |
|
| 198 |
return (
|
| 199 |
reply,
|
| 200 |
+
qwen_impacts.usage.gwp.value, # pyright: ignore[reportReturnType]
|
| 201 |
triage_meta,
|
| 202 |
context,
|
| 203 |
n_tokens,
|
main.py
CHANGED
|
@@ -303,7 +303,7 @@ async def chat_endpoint(
|
|
| 303 |
|
| 304 |
# Endpoint for specific replies/responses
|
| 305 |
@app.post("/feedback")
|
| 306 |
-
@limiter.limit("
|
| 307 |
def feedback_endpoint(
|
| 308 |
payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
|
| 309 |
):
|
|
@@ -328,7 +328,7 @@ def feedback_endpoint(
|
|
| 328 |
|
| 329 |
# Endpoint for specific generic comments
|
| 330 |
@app.post("/comment")
|
| 331 |
-
@limiter.limit("
|
| 332 |
def comment_endpoint(
|
| 333 |
payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
|
| 334 |
):
|
|
|
|
| 303 |
|
| 304 |
# Endpoint for specific replies/responses
|
| 305 |
@app.post("/feedback")
|
| 306 |
+
@limiter.limit("450/minute")
|
| 307 |
def feedback_endpoint(
|
| 308 |
payload: FeedbackRequest, background_tasks: BackgroundTasks, request: Request
|
| 309 |
):
|
|
|
|
| 328 |
|
| 329 |
# Endpoint for specific generic comments
|
| 330 |
@app.post("/comment")
|
| 331 |
+
@limiter.limit("450/minute")
|
| 332 |
def comment_endpoint(
|
| 333 |
payload: CommentRequest, background_tasks: BackgroundTasks, request: Request
|
| 334 |
):
|
requirements.txt
CHANGED
|
@@ -7,11 +7,9 @@ python-dotenv==1.2.2
|
|
| 7 |
opentelemetry-sdk==1.40.0
|
| 8 |
slowapi==0.1.9
|
| 9 |
nh3==0.3.3
|
| 10 |
-
presidio-analyzer==2.2.362
|
| 11 |
-
presidio-anonymizer==2.2.362
|
| 12 |
boto3==1.42.70
|
| 13 |
pytz==2026.1.post1
|
| 14 |
-
opencv-python==4.13.0.92
|
| 15 |
PyMuPDF==1.27.2
|
| 16 |
python-magic==0.4.27
|
| 17 |
python-magic-bin==0.4.14; sys_platform=='win32'
|
|
@@ -28,4 +26,5 @@ opentelemetry-instrumentation==0.61b0
|
|
| 28 |
opentelemetry-instrumentation-fastapi==0.61b0
|
| 29 |
opentelemetry-instrumentation-httpx==0.61b0
|
| 30 |
python-multipart==0.0.22
|
| 31 |
-
tiktoken
|
|
|
|
|
|
| 7 |
opentelemetry-sdk==1.40.0
|
| 8 |
slowapi==0.1.9
|
| 9 |
nh3==0.3.3
|
|
|
|
|
|
|
| 10 |
boto3==1.42.70
|
| 11 |
pytz==2026.1.post1
|
| 12 |
+
opencv-python-headless==4.13.0.92
|
| 13 |
PyMuPDF==1.27.2
|
| 14 |
python-magic==0.4.27
|
| 15 |
python-magic-bin==0.4.14; sys_platform=='win32'
|
|
|
|
| 26 |
opentelemetry-instrumentation-fastapi==0.61b0
|
| 27 |
opentelemetry-instrumentation-httpx==0.61b0
|
| 28 |
python-multipart==0.0.22
|
| 29 |
+
tiktoken
|
| 30 |
+
gliner==0.2.26
|
static/styles/components/consent.css
CHANGED
|
@@ -15,13 +15,15 @@
|
|
| 15 |
padding: 0.65rem 0.9rem;
|
| 16 |
margin: 0.75rem 0 1rem;
|
| 17 |
color: #791F1F;
|
| 18 |
-
font-size:
|
| 19 |
-
font-weight:
|
| 20 |
}
|
| 21 |
|
| 22 |
.consent-emergency svg {
|
| 23 |
flex-shrink: 0;
|
| 24 |
color: #A32D2D;
|
|
|
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
.consent-data-note {
|
|
|
|
| 15 |
padding: 0.65rem 0.9rem;
|
| 16 |
margin: 0.75rem 0 1rem;
|
| 17 |
color: #791F1F;
|
| 18 |
+
font-size: 16px;
|
| 19 |
+
font-weight: 700;
|
| 20 |
}
|
| 21 |
|
| 22 |
.consent-emergency svg {
|
| 23 |
flex-shrink: 0;
|
| 24 |
color: #A32D2D;
|
| 25 |
+
width: 18px;
|
| 26 |
+
height: 18px;
|
| 27 |
}
|
| 28 |
|
| 29 |
.consent-data-note {
|