qyle commited on
Commit
fc62e60
·
verified ·
1 Parent(s): 8fe7ab1

deployment

Browse files
champ/agent.py CHANGED
@@ -8,8 +8,6 @@ from langchain_community.vectorstores import FAISS as LCFAISS
8
 
9
  from opentelemetry import trace
10
 
11
- from classes.prompt_sanitizer import PromptSanitizer
12
-
13
  from .prompts import CHAMP_SYSTEM_PROMPT_V5
14
 
15
  tracer = trace.get_tracer(__name__)
@@ -33,6 +31,8 @@ def _build_retrieval_query(messages) -> str:
33
  def make_prompt_with_context(
34
  vector_store: LCFAISS, lang: Literal["en", "fr"], k: int = 4
35
  ):
 
 
36
  @dynamic_prompt
37
  def prompt_with_context(request: ModelRequest) -> str:
38
  with tracer.start_as_current_span("retrieving documents"):
@@ -58,23 +58,17 @@ def make_prompt_with_context(
58
  unique_docs.append(doc)
59
 
60
  docs_content = "\n\n".join(doc.page_content for doc in unique_docs)
61
-
62
- # No need to sanitize the docs_content as the documents are sanitized
63
- # when received at the file PUT endpoint.
64
- with tracer.start_as_current_span("PromptSanitizer"):
65
- sanitizer = PromptSanitizer()
66
- with tracer.start_as_current_span("sanitize retrieval_query"):
67
- sanitized_retrieval_query = sanitizer.sanitize(retrieval_query)
68
 
69
  language = "English" if lang == "en" else "French"
70
 
71
  return CHAMP_SYSTEM_PROMPT_V5.format(
72
- last_query=sanitized_retrieval_query,
73
  context=docs_content,
74
  language=language,
75
  )
76
 
77
- return prompt_with_context
78
 
79
 
80
  def build_champ_agent(
@@ -91,11 +85,11 @@ def build_champ_agent(
91
  # huggingfacehub_api_token=... (optional; see service.py)
92
  )
93
  model_chat = ChatHuggingFace(llm=hf_llm)
94
- prompt_middleware = make_prompt_with_context(vector_store, lang)
95
  return create_agent(
96
  model_chat,
97
  tools=[],
98
  middleware=[
99
  prompt_middleware,
100
  ],
101
- )
 
8
 
9
  from opentelemetry import trace
10
 
 
 
11
  from .prompts import CHAMP_SYSTEM_PROMPT_V5
12
 
13
  tracer = trace.get_tracer(__name__)
 
31
  def make_prompt_with_context(
32
  vector_store: LCFAISS, lang: Literal["en", "fr"], k: int = 4
33
  ):
34
+ context_store = {"last_retrieved_docs": []} # shared mutable container
35
+
36
  @dynamic_prompt
37
  def prompt_with_context(request: ModelRequest) -> str:
38
  with tracer.start_as_current_span("retrieving documents"):
 
58
  unique_docs.append(doc)
59
 
60
  docs_content = "\n\n".join(doc.page_content for doc in unique_docs)
61
+ context_store["last_retrieved_docs"] = [doc.page_content for doc in unique_docs]
 
 
 
 
 
 
62
 
63
  language = "English" if lang == "en" else "French"
64
 
65
  return CHAMP_SYSTEM_PROMPT_V5.format(
66
+ last_query=retrieval_query,
67
  context=docs_content,
68
  language=language,
69
  )
70
 
71
+ return prompt_with_context, context_store
72
 
73
 
74
  def build_champ_agent(
 
85
  # huggingfacehub_api_token=... (optional; see service.py)
86
  )
87
  model_chat = ChatHuggingFace(llm=hf_llm)
88
+ prompt_middleware, context_store = make_prompt_with_context(vector_store, lang)
89
  return create_agent(
90
  model_chat,
91
  tools=[],
92
  middleware=[
93
  prompt_middleware,
94
  ],
95
+ ), context_store
champ/service.py CHANGED
@@ -1,11 +1,10 @@
1
  # app/champ/service.py
2
 
3
- from typing import Literal, Optional, Sequence
4
 
5
  from langchain_community.vectorstores import FAISS as LCFAISS
6
  from langchain_core.messages import HumanMessage
7
 
8
-
9
  from .agent import build_champ_agent
10
  from .triage import safety_triage
11
 
@@ -14,12 +13,25 @@ class ChampService:
14
  vector_store: Optional[LCFAISS] = None
15
  agent = None
16
  lang = None
 
17
 
18
  def __init__(self, vector_store: LCFAISS, lang: Literal["en", "fr"]):
 
19
  self.vector_store = vector_store
20
- self.agent = build_champ_agent(self.vector_store, lang)
 
 
 
21
 
22
- def invoke(self, lc_messages: Sequence) -> str:
 
 
 
 
 
 
 
 
23
  if self.agent is None:
24
  raise RuntimeError("CHAMP is not initialized yet.")
25
  # --- Safety triage micro-layer (before LLM) ---
@@ -38,6 +50,16 @@ class ChampService:
38
  }
39
 
40
  result = self.agent.invoke({"messages": list(lc_messages)})
41
- return result["messages"][-1].text.strip(), {
42
- "triage_triggered": False,
43
- }
 
 
 
 
 
 
 
 
 
 
 
1
  # app/champ/service.py
2
 
3
+ from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
4
 
5
  from langchain_community.vectorstores import FAISS as LCFAISS
6
  from langchain_core.messages import HumanMessage
7
 
 
8
  from .agent import build_champ_agent
9
  from .triage import safety_triage
10
 
 
13
  vector_store: Optional[LCFAISS] = None
14
  agent = None
15
  lang = None
16
+ context_store = None
17
 
18
  def __init__(self, vector_store: LCFAISS, lang: Literal["en", "fr"]):
19
+
20
  self.vector_store = vector_store
21
+ self.agent, self.context_store = build_champ_agent(self.vector_store, lang)
22
+
23
+ def invoke(self, lc_messages: Sequence) -> Tuple[str, Dict[str, Any], List[str]]:
24
+ """Invokes the agent.
25
 
26
+ Args:
27
+ lc_messages (Sequence): Sequence of LangChain messages
28
+
29
+ Raises:
30
+ RuntimeError: Raised when the function is called before CHAMP is initialized
31
+
32
+ Returns:
33
+ Tuple[str, Dict[str, Any], List[str]]: The reply, the triage_triggered object and the retrieved passages
34
+ """
35
  if self.agent is None:
36
  raise RuntimeError("CHAMP is not initialized yet.")
37
  # --- Safety triage micro-layer (before LLM) ---
 
50
  }
51
 
52
  result = self.agent.invoke({"messages": list(lc_messages)})
53
+
54
+ retrieved_passages = (
55
+ self.context_store["last_retrieved_docs"]
56
+ if self.context_store is not None
57
+ else []
58
+ )
59
+ return (
60
+ result["messages"][-1].text.strip(),
61
+ {
62
+ "triage_triggered": False,
63
+ },
64
+ retrieved_passages,
65
+ )
classes/pii_filter.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
3
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
4
+ from presidio_anonymizer import AnonymizerEngine
5
+ from presidio_anonymizer.entities import OperatorConfig
6
+
7
+ from lingua import Language, LanguageDetector
8
+
9
+
10
def create_ssn_pattern_recognizer():
    """Build a Presidio recognizer for 9-digit SINs/SSNs.

    Accepts the three common layouts: 111-111-111, 111 111 111 and 111111111.
    """
    sin_pattern = Pattern(
        name="ssn_pattern",
        # Three groups of three digits, optionally separated by a dash or space.
        regex=r"\b\d{3}[- ]?\d{3}[- ]?\d{3}\b",
        score=0.8,
    )
    return PatternRecognizer(supported_entity="SSN", patterns=[sin_pattern])
16
+
17
+
18
def create_zip_code_pattern_recognizer():
    """Build a Presidio recognizer for Canadian postal codes.

    Matches the letter/digit alternation in both "A1A 1A1" and "A1A1A1" forms.
    """
    postal_code_pattern = Pattern(
        name="zip_code_pattern",
        regex=r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b",  # Matches A1A 1A1 and A1A1A1
        score=0.8,
    )
    return PatternRecognizer(
        supported_entity="ZIP_CODE", patterns=[postal_code_pattern]
    )
25
+
26
+
27
def create_street_pattern_recognizer():
    """Build a Presidio recognizer for English and French street addresses.

    Covers both word orders:
    - French style:  "<number> <street-type> <name...>" (e.g. "12 rue Principale")
    - English style: "[number] <name...> <street-type>" (e.g. "10 Main street")
    Accented characters are allowed in street names.
    """
    bilingual_street_regex = (
        r"\d+\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)"
        r"\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+"
        r"(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
        r"|(?:\d+\s+)?[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+(?:\s+[A-ZÁÀÂÄÇÉÈÊËÍÎÏÓÔÖÚÛÜa-z]+)*"
        r"\s+(?:rue|boul|boulevard|av|avenue|place|square|st|street|rd|road|ave|blvd|lane|dr|drive)\b"
    )

    address_pattern = Pattern(
        name="street_pattern",
        regex=bilingual_street_regex,
        score=0.8,
    )
    return PatternRecognizer(
        supported_entity="STREET_ADDRESS", patterns=[address_pattern]
    )
42
+
43
+
44
class PIIFilter:
    """Singleton wrapper around Microsoft Presidio for bilingual PII redaction.

    The first ``PIIFilter()`` call builds the spaCy NLP engine, the analyzer
    (with the custom SIN / postal-code / street recognizers registered) and
    the anonymizer; every later call returns the same initialized instance.
    """

    _instance: Optional["PIIFilter"] = None
    analyzer: AnalyzerEngine
    anonymizer: AnonymizerEngine
    operators: dict
    target_entities: List[str]

    def __new__(cls):
        if cls._instance is not None:
            return cls._instance

        print("Initializing Presidio Engines (this should happen only once)...")
        cls._instance = super().__new__(cls)
        instance = cls._instance

        # One spaCy model per supported language (English + French).
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [
                {"lang_code": "en", "model_name": "en_core_web_lg"},
                {"lang_code": "fr", "model_name": "fr_core_news_lg"},
            ],
        }
        nlp_engine = NlpEngineProvider(
            nlp_configuration=configuration
        ).create_engine()

        instance.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

        # Register the custom regex recognizers on top of Presidio's defaults.
        for recognizer in (
            create_ssn_pattern_recognizer(),
            create_zip_code_pattern_recognizer(),
            create_street_pattern_recognizer(),
        ):
            instance.analyzer.registry.add_recognizer(recognizer)

        instance.anonymizer = AnonymizerEngine()

        # Masking rules: each detected entity type is replaced by a generic tag.
        instance.operators = {
            "PERSON": OperatorConfig("replace", {"new_value": "[NAME]"}),
            "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[EMAIL]"}),
            "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[PHONE]"}),
            "SSN": OperatorConfig("replace", {"new_value": "[SSN]"}),
            "CREDIT_CARD": OperatorConfig(
                "replace", {"new_value": "[CREDIT_CARD]"}
            ),
            "LOCATION": OperatorConfig("replace", {"new_value": "[LOCATION]"}),
            "STREET_ADDRESS": OperatorConfig(
                "replace", {"new_value": "[LOCATION]"}
            ),
            "ZIP_CODE": OperatorConfig("replace", {"new_value": "[LOCATION]"}),
        }
        instance.target_entities = list(instance.operators.keys())

        return cls._instance

    def sanitize(self, text: str, language_detector: LanguageDetector) -> str:
        """Analyzes and redacts PII from the given text.

        The text is scanned twice — first with the English model, then the
        French model runs over the English-redacted result — instead of
        detecting a single language up front. ``language_detector`` is
        currently unused: per-language detection was dropped because running
        both passes proved more effective and faster.
        """
        if not text:
            return text

        # Pass 1: detect and redact English-language PII.
        results_en = self.analyzer.analyze(
            text=text,
            entities=self.target_entities,
            language="en",
        )
        redacted_en = self.anonymizer.anonymize(
            text=text,
            analyzer_results=results_en,  # pyright: ignore[reportArgumentType]
            operators=self.operators,
        )

        # Pass 2: detect and redact French-language PII in the redacted text.
        results_fr = self.analyzer.analyze(
            text=redacted_en.text,
            entities=self.target_entities,
            language="fr",
        )
        redacted_fr = self.anonymizer.anonymize(
            text=redacted_en.text,
            analyzer_results=results_fr,  # pyright: ignore[reportArgumentType]
            operators=self.operators,
        )

        return redacted_fr.text
classes/prompt_injection_filter.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
+ # Taken from https://cheatsheetseries.owasp.org/cheatsheets/LLM_Prompt_Injection_Prevention_Cheat_Sheet.html#primary-defenses
5
+ # Has to work with French and English
6
class PromptInjectionFilter:
    """Heuristic detector/sanitizer for prompt-injection attempts.

    Combines regex matching of known injection phrasings with a
    scrambled-middle ("typoglycemia") check on a small keyword list, so
    trivially misspelled attack words are still caught. Patterns are kept
    language-agnostic enough to work on English and French input.
    """

    def __init__(self):
        # Regexes for well-known injection phrasings (case-insensitive).
        self.dangerous_patterns = [
            r"ignore\s+(all\s+)?previous\s+instructions?",
            r"you\s+are\s+now\s+(in\s+)?developer\s+mode",
            r"system\s+override",
            r"reveal\s+prompt",
        ]

        # Keywords matched fuzzily to defend against typoglycemia attacks.
        self.fuzzy_patterns = [
            "ignore",
            "bypass",
            "override",
            "reveal",
            "delete",
            "system",
        ]

    def detect_injection(self, text: str) -> bool:
        """Return True when *text* looks like a prompt-injection attempt."""
        # Exact (regex) pattern matching first.
        for pattern in self.dangerous_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True

        # Then look for scrambled variants of the fuzzy keywords.
        tokens = re.findall(r"\b\w+\b", text.lower())
        return any(
            self._is_similar_word(token, keyword)
            for token in tokens
            for keyword in self.fuzzy_patterns
        )

    def _is_similar_word(self, word: str, target: str) -> bool:
        """Check if word is a typoglycemia variant of target."""
        # Must be the same length and long enough to have a "middle".
        if len(word) < 3 or len(word) != len(target):
            return False
        # Same first and last letter, with an anagram of the middle letters.
        ends_match = word[0] == target[0] and word[-1] == target[-1]
        return ends_match and sorted(word[1:-1]) == sorted(target[1:-1])

    def sanitize_input(self, text: str) -> str:
        """Normalize obfuscations, then mask dangerous phrasings."""
        text = re.sub(r"\s+", " ", text)  # Collapse whitespace
        text = re.sub(r"(.)\1{3,}", r"\1", text)  # Remove char repetition

        # Replace each known injection phrasing with a neutral marker.
        for pattern in self.dangerous_patterns:
            text = re.sub(pattern, "[FILTERED]", text, flags=re.IGNORECASE)
        return text
classes/session_conversation_store.py CHANGED
@@ -2,6 +2,13 @@ from typing import Dict, List, Literal
2
 
3
  from classes.base_models import ChatMessage
4
 
 
 
 
 
 
 
 
5
 
6
  class SessionConversationStore:
7
  def __init__(self) -> None:
 
2
 
3
  from classes.base_models import ChatMessage
4
 
5
+ """
6
+ This class should be removed after the demo and all call sites
7
+ migrated to the LangGraph checkpointer. We should use a persistent
8
+ checkpointer (e.g. PostgresSaver or RedisSaver) once the demo is completed.
9
+ For more details: https://docs.langchain.com/oss/python/langchain/short-term-memory
10
+ """
11
+
12
 
13
  class SessionConversationStore:
14
  def __init__(self) -> None:
main.py CHANGED
@@ -34,7 +34,8 @@ from classes.base_models import (
34
  )
35
 
36
  # from classes.guardrail_manager import GuardrailManager
37
- from classes.prompt_sanitizer import PromptSanitizer
 
38
  from classes.session_conversation_store import SessionConversationStore
39
  from classes.session_tracker import SessionTracker
40
  from constants import (
@@ -62,6 +63,8 @@ from google import genai
62
 
63
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
64
 
 
 
65
  from champ.prompts import (
66
  DEFAULT_SYSTEM_PROMPT_V2,
67
  DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT_V2,
@@ -173,33 +176,20 @@ def convert_and_sanitize_messages(
173
  )
174
  )
175
 
176
- sanitizer = PromptSanitizer()
177
-
178
  out = [{"role": "system", "content": system_prompt}]
179
  for m in messages:
180
  if m.role == "system":
181
  continue
182
- out.append(
183
- {
184
- "role": m.role,
185
- # We only sanitize human messages.
186
- "content": m.content
187
- if m.role == "assistant"
188
- else sanitizer.sanitize(m.content),
189
- }
190
- )
191
  return out
192
 
193
 
194
  def convert_and_sanitize_messages_langchain(messages: List[ChatMessage]):
195
  list_chatmessages = []
196
- sanitizer = PromptSanitizer()
197
 
198
  for m in messages[-MAX_HISTORY:]:
199
  if m.role == "user":
200
- list_chatmessages.append(
201
- HumanMessage(content=sanitizer.sanitize(m.content))
202
- )
203
  elif m.role == "assistant":
204
  list_chatmessages.append(AIMessage(content=m.content))
205
  elif m.role == "system":
@@ -241,7 +231,7 @@ def call_llm(
241
  model_type: str,
242
  lang: Literal["en", "fr"],
243
  conversation: List[ChatMessage],
244
- ) -> AsyncGenerator[str, None] | Tuple[str, Dict[str, Any]]:
245
  tracer = trace.get_tracer(__name__)
246
 
247
  if model_type == "champ":
@@ -262,9 +252,9 @@ def call_llm(
262
  msgs = convert_and_sanitize_messages_langchain(conversation)
263
 
264
  with tracer.start_as_current_span("invoke"):
265
- reply, triage_meta = champ.invoke(msgs)
266
 
267
- return reply, triage_meta
268
 
269
  if model_type not in MODEL_MAP:
270
  raise ValueError(f"Unknown model_type: {model_type}")
@@ -279,10 +269,10 @@ def call_llm(
279
  return _call_openai(model_id, msgs)
280
 
281
  if model_type == "google-conservative":
282
- return _call_gemini(model_id, msgs, temperature=0.2), {}
283
 
284
  if model_type == "google-creative":
285
- return _call_gemini(model_id, msgs, temperature=1.0), {}
286
 
287
  # If you later add HF models via hf_client, handle here.
288
  raise ValueError(f"Unhandled model_type: {model_type}")
@@ -297,9 +287,14 @@ async def lifespan(app: FastAPI):
297
  # We are loading the OCR Reader in advance, because loading the model takes time.
298
  app.state.ocr_reader = easyocr.Reader(["en", "fr"], gpu=torch.cuda.is_available())
299
 
 
 
 
 
 
300
  # Idem for the prompt sanitizer. No need to store it in the state since this
301
  # class follows the Singleton design pattern.
302
- PromptSanitizer()
303
 
304
  bg_task = asyncio.create_task(cleanup_loop())
305
  yield
@@ -350,8 +345,19 @@ async def chat_endpoint(
350
 
351
  session_tracker.update_session(session_id)
352
 
 
 
 
 
 
 
 
 
 
 
 
353
  session_conversation_store.add_human_message(
354
- session_id, payload.conversation_id, payload.human_message
355
  )
356
  conversation = session_conversation_store.get_conversation(
357
  session_id, conversation_id
@@ -359,6 +365,7 @@ async def chat_endpoint(
359
 
360
  reply = ""
361
  triage_meta = {}
 
362
 
363
  try:
364
  loop = asyncio.get_running_loop()
@@ -405,7 +412,7 @@ async def chat_endpoint(
405
 
406
  return StreamingResponse(logging_wrapper(), media_type="text/event-stream")
407
 
408
- reply, triage_meta = result
409
 
410
  except Exception as e:
411
  background_tasks.add_task(
@@ -426,6 +433,7 @@ async def chat_endpoint(
426
  },
427
  )
428
 
 
429
  background_tasks.add_task(
430
  log_event,
431
  user_id=payload.user_id,
@@ -435,6 +443,7 @@ async def chat_endpoint(
435
  "consent": payload.consent,
436
  "human_message": payload.human_message,
437
  "reply": reply,
 
438
  "age_group": payload.age_group,
439
  "gender": payload.gender,
440
  "roles": payload.roles,
@@ -554,11 +563,17 @@ async def upload_file(
554
  if file_text is None:
555
  return Response(status_code=STATUS_CODE_INTERNAL_SERVER_ERROR)
556
 
557
- sanitizer = PromptSanitizer()
558
- sanitized_file_text = sanitizer.sanitize(file_text)
 
 
 
 
 
 
559
 
560
  if session_document_store.create_document(
561
- session_id, sanitized_file_text, file_name, file_size
562
  ):
563
  session_tracker.update_session(session_id)
564
  else:
 
34
  )
35
 
36
  # from classes.guardrail_manager import GuardrailManager
37
+ from classes.pii_filter import PIIFilter
38
+ from classes.prompt_injection_filter import PromptInjectionFilter
39
  from classes.session_conversation_store import SessionConversationStore
40
  from classes.session_tracker import SessionTracker
41
  from constants import (
 
63
 
64
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
65
 
66
+ from lingua import Language, LanguageDetectorBuilder
67
+
68
  from champ.prompts import (
69
  DEFAULT_SYSTEM_PROMPT_V2,
70
  DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT_V2,
 
176
  )
177
  )
178
 
 
 
179
  out = [{"role": "system", "content": system_prompt}]
180
  for m in messages:
181
  if m.role == "system":
182
  continue
183
+ out.append({"role": m.role, "content": m.content})
 
 
 
 
 
 
 
 
184
  return out
185
 
186
 
187
  def convert_and_sanitize_messages_langchain(messages: List[ChatMessage]):
188
  list_chatmessages = []
 
189
 
190
  for m in messages[-MAX_HISTORY:]:
191
  if m.role == "user":
192
+ list_chatmessages.append(HumanMessage(content=m.content))
 
 
193
  elif m.role == "assistant":
194
  list_chatmessages.append(AIMessage(content=m.content))
195
  elif m.role == "system":
 
231
  model_type: str,
232
  lang: Literal["en", "fr"],
233
  conversation: List[ChatMessage],
234
+ ) -> AsyncGenerator[str, None] | Tuple[str, Dict[str, Any], List[str]]:
235
  tracer = trace.get_tracer(__name__)
236
 
237
  if model_type == "champ":
 
252
  msgs = convert_and_sanitize_messages_langchain(conversation)
253
 
254
  with tracer.start_as_current_span("invoke"):
255
+ reply, triage_meta, context = champ.invoke(msgs)
256
 
257
+ return reply, triage_meta, context
258
 
259
  if model_type not in MODEL_MAP:
260
  raise ValueError(f"Unknown model_type: {model_type}")
 
269
  return _call_openai(model_id, msgs)
270
 
271
  if model_type == "google-conservative":
272
+ return _call_gemini(model_id, msgs, temperature=0.2), {}, []
273
 
274
  if model_type == "google-creative":
275
+ return _call_gemini(model_id, msgs, temperature=1.0), {}, []
276
 
277
  # If you later add HF models via hf_client, handle here.
278
  raise ValueError(f"Unhandled model_type: {model_type}")
 
287
  # We are loading the OCR Reader in advance, because loading the model takes time.
288
  app.state.ocr_reader = easyocr.Reader(["en", "fr"], gpu=torch.cuda.is_available())
289
 
290
+ languages = [Language.ENGLISH, Language.FRENCH]
291
+ app.state.language_detector = LanguageDetectorBuilder.from_languages(
292
+ *languages
293
+ ).build()
294
+
295
  # Idem for the prompt sanitizer. No need to store it in the state since this
296
  # class follows the Singleton design pattern.
297
+ PIIFilter()
298
 
299
  bg_task = asyncio.create_task(cleanup_loop())
300
  yield
 
345
 
346
  session_tracker.update_session(session_id)
347
 
348
+ prompt_injection_filter = PromptInjectionFilter()
349
+ injection_filtered_msg = prompt_injection_filter.sanitize_input(
350
+ payload.human_message
351
+ )
352
+
353
+ pii_filter = PIIFilter()
354
+ with tracer.start_as_current_span("sanitize_document"):
355
+ pii_filtered_msg = pii_filter.sanitize(
356
+ injection_filtered_msg, app.state.language_detector
357
+ )
358
+
359
  session_conversation_store.add_human_message(
360
+ session_id, payload.conversation_id, pii_filtered_msg
361
  )
362
  conversation = session_conversation_store.get_conversation(
363
  session_id, conversation_id
 
365
 
366
  reply = ""
367
  triage_meta = {}
368
+ context = []
369
 
370
  try:
371
  loop = asyncio.get_running_loop()
 
412
 
413
  return StreamingResponse(logging_wrapper(), media_type="text/event-stream")
414
 
415
+ reply, triage_meta, context = result
416
 
417
  except Exception as e:
418
  background_tasks.add_task(
 
433
  },
434
  )
435
 
436
+ # Add the retrieved passages to the logged event
437
  background_tasks.add_task(
438
  log_event,
439
  user_id=payload.user_id,
 
443
  "consent": payload.consent,
444
  "human_message": payload.human_message,
445
  "reply": reply,
446
+ "context": context,
447
  "age_group": payload.age_group,
448
  "gender": payload.gender,
449
  "roles": payload.roles,
 
563
  if file_text is None:
564
  return Response(status_code=STATUS_CODE_INTERNAL_SERVER_ERROR)
565
 
566
+ prompt_injection_filter = PromptInjectionFilter()
567
+ injection_filtered_file_text = prompt_injection_filter.sanitize_input(file_text)
568
+
569
+ pii_filter = PIIFilter()
570
+ with tracer.start_as_current_span("sanitize_document"):
571
+ pii_filtered_file_text = pii_filter.sanitize(
572
+ injection_filtered_file_text, app.state.language_detector
573
+ )
574
 
575
  if session_document_store.create_document(
576
+ session_id, pii_filtered_file_text, file_name, file_size
577
  ):
578
  session_tracker.update_session(session_id)
579
  else:
requirements.txt CHANGED
@@ -141,4 +141,5 @@ opentelemetry-sdk==1.39.1
141
  opentelemetry-instrumentation-fastapi==0.60b1
142
  opentelemetry-instrumentation-httpx==0.60b1
143
  slowapi==0.1.9
144
- psutil==7.2.2
 
 
141
  opentelemetry-instrumentation-fastapi==0.60b1
142
  opentelemetry-instrumentation-httpx==0.60b1
143
  slowapi==0.1.9
144
+ psutil==7.2.2
145
+ # lingua-language-detector==2.1.1
telemetry.py CHANGED
@@ -18,6 +18,7 @@ class FilteredConsoleExporter(SpanExporter):
18
  "PromptSanitizer",
19
  "sanitize docs_content",
20
  "sanitize retrieval_query",
 
21
  }
22
 
23
  def export(self, spans):
 
18
  "PromptSanitizer",
19
  "sanitize docs_content",
20
  "sanitize retrieval_query",
21
+ "sanitize_document",
22
  }
23
 
24
  def export(self, spans):
templates/index.html CHANGED
@@ -56,7 +56,7 @@
56
  <div class="modal-content slide language-modal">
57
  <div class="content-top">
58
  <h2 data-i18n="choose_language_title"></h2>
59
- <p data-i18n="change_language_instructions"></p>
60
  </div>
61
 
62
  <div class="form-group">
@@ -196,7 +196,7 @@
196
  </div>
197
  <h3 data-i18n="file_add_title"></h3>
198
  <div id="file-drop-zone" class="file-drop-area">
199
- <p><span data-i18n="file_add_instructions_prefix"></span><a href="#" data-i18n="click"></a><span data-i18n="file_add_instructions_suffix"></span>
200
  <input
201
  type="file"
202
  id="file-input"
 
56
  <div class="modal-content slide language-modal">
57
  <div class="content-top">
58
  <h2 data-i18n="choose_language_title"></h2>
59
+ <p style="text-align: justify;" data-i18n="change_language_instructions"></p>
60
  </div>
61
 
62
  <div class="form-group">
 
196
  </div>
197
  <h3 data-i18n="file_add_title"></h3>
198
  <div id="file-drop-zone" class="file-drop-area">
199
+ <p><span data-i18n="file_add_instructions_prefix"></span><a href="#" data-i18n="click"></a><span data-i18n="file_add_instructions_suffix"></span></p>
200
  <input
201
  type="file"
202
  id="file-input"