Spaces:

Chatbot-TLU
/

M_chatbot

Sleeping

App Files Files Community

minh-4T committed on 14 days ago

Commit

cb01350

1 Parent(s): f57df7c

clean code

Browse files

Files changed (5) hide show

core/config.py +3 -4
rag/collection_router_retriever.py +17 -12
rag/collection_utils.py +0 -22
rag/qa_pipeline.py +0 -11
services/document_ingest_service.py +1 -2

core/config.py CHANGED Viewed

@@ -9,7 +9,7 @@ try:
 except Exception:
     pass
 def _is_hf_persistent_storage_available() -> bool:
     data_dir = Path('/data')
     return data_dir.exists() and os.access(data_dir, os.W_OK)
@@ -17,13 +17,13 @@ def _is_hf_persistent_storage_available() -> bool:
 _USE_HF_PERSISTENT_STORAGE = _is_hf_persistent_storage_available()
 def _default_documents_db_url() -> str:
     if _USE_HF_PERSISTENT_STORAGE:
         return 'sqlite:////data/rag_metadata.db'
     return 'sqlite:///./rag_metadata.db'
 def _bounded_int_from_env(name: str, default: int, minimum: int, maximum: int) -> int:
     raw_value = os.getenv(name, str(default))
     try:
@@ -48,7 +48,6 @@ CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', '150'))
 TOP_K_RESULTS = int(os.getenv('TOP_K_RESULTS', '10'))
 FINAL_TOP_K = int(os.getenv('FINAL_TOP_K', '5'))
-QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'rag_docs')
 DOCUMENTS_DATABASE_URL = os.getenv('DOCUMENTS_DATABASE_URL', _default_documents_db_url())
 # External service configs

 except Exception:
     pass
+#Kiểm tra xem có thư mục /data không và có quyền ghi hay không để quyết định sử dụng persistent storage của Hugging Face hay không
 def _is_hf_persistent_storage_available() -> bool:
     data_dir = Path('/data')
     return data_dir.exists() and os.access(data_dir, os.W_OK)
 _USE_HF_PERSISTENT_STORAGE = _is_hf_persistent_storage_available()
+# Nếu sử dụng persistent storage của Hugging Face, lưu trữ metadata vào /data/rag_metadata.db, ngược lại lưu vào thư mục hiện tại
 def _default_documents_db_url() -> str:
     if _USE_HF_PERSISTENT_STORAGE:
         return 'sqlite:////data/rag_metadata.db'
     return 'sqlite:///./rag_metadata.db'
+# Hàm tiện ích để lấy giá trị int từ biến môi trường với giới hạn min và max, trả về default nếu không hợp lệ
 def _bounded_int_from_env(name: str, default: int, minimum: int, maximum: int) -> int:
     raw_value = os.getenv(name, str(default))
     try:
 TOP_K_RESULTS = int(os.getenv('TOP_K_RESULTS', '10'))
 FINAL_TOP_K = int(os.getenv('FINAL_TOP_K', '5'))
 DOCUMENTS_DATABASE_URL = os.getenv('DOCUMENTS_DATABASE_URL', _default_documents_db_url())
 # External service configs

rag/collection_router_retriever.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import hashlib
 import logging
 from typing import List
 from langchain_core.documents import Document as LangChainDocument
 from rank_bm25 import BM25Okapi
@@ -37,7 +38,8 @@ class CollectionRouterRetriever:
         self.embeddings_model = embeddings_model
         self.top_n_collections = max(1, int(top_n_collections or 3))
         # Cache giờ đây lưu một dict: { 'bm25': obj, 'corpus_docs': list, 'count': int }
-        self._bm25_cache = {}
     @staticmethod
     def _doc_key(doc) -> str:
@@ -69,18 +71,19 @@ class CollectionRouterRetriever:
         normalized_cohort = (cohort_key or "").strip()
         if normalized_cohort:
-            return [
                 collection_name
                 for collection_name in active_collections
                 if collection_matches_cohort(collection_name, normalized_cohort)
             ]
         return active_collections[: self.top_n_collections]
     def _ensure_bm25_loaded(self, collection_name: str) -> tuple[BM25Okapi, List[LangChainDocument]] | None:
         """Lazy load and cache BM25 index and corpus for a collection (với cơ chế tự động làm mới Cache)"""
-        # 1. Lấy tổng số chunks hiện tại trong Qdrant (Rất nhanh, tốn < 10ms)
         try:
             collection_info = self.qdrant_client.get_collection(collection_name)
             current_count = collection_info.points_count
@@ -89,10 +92,11 @@ class CollectionRouterRetriever:
             return None
         # 2. Kiểm tra Cache: Nếu chưa có hoặc số lượng thay đổi -> Xóa cache build lại
-        cached_data = self._bm25_cache.get(collection_name)
-        if cached_data and cached_data.get('count') == current_count:
-            # Tái sử dụng (Phải trả về cả bm25 VÀ corpus_docs để map điểm)
-            return cached_data['bm25'], cached_data['corpus_docs']
         logger.info(f"Phát hiện dữ liệu mới hoặc chưa có cache cho {collection_name} (Count: {current_count}). Đang build lại BM25...")
@@ -147,11 +151,12 @@ class CollectionRouterRetriever:
             bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
             # 3. Lưu lại Cache kèm the con số count và corpus_docs để đối chiếu lần sau
-            self._bm25_cache[collection_name] = {
-                'bm25': bm25,
-                'corpus_docs': corpus_docs,
-                'count': current_count
-            }
             logger.info("BM25 index built and cached for collection=%s (docs=%d)", collection_name, len(corpus_docs))
             return bm25, corpus_docs

 import hashlib
 import logging
 from typing import List
+from threading import Lock
 from langchain_core.documents import Document as LangChainDocument
 from rank_bm25 import BM25Okapi
         self.embeddings_model = embeddings_model
         self.top_n_collections = max(1, int(top_n_collections or 3))
         # Cache giờ đây lưu một dict: { 'bm25': obj, 'corpus_docs': list, 'count': int }
+        self._bm25_cache = {}
+        self._bm25_lock = Lock()  # Thread-safe lock cho BM25 cache
     @staticmethod
     def _doc_key(doc) -> str:
         normalized_cohort = (cohort_key or "").strip()
         if normalized_cohort:
+            matches = [
                 collection_name
                 for collection_name in active_collections
                 if collection_matches_cohort(collection_name, normalized_cohort)
             ]
+            return matches[:1] if matches else []
         return active_collections[: self.top_n_collections]
     def _ensure_bm25_loaded(self, collection_name: str) -> tuple[BM25Okapi, List[LangChainDocument]] | None:
         """Lazy load and cache BM25 index and corpus for a collection (với cơ chế tự động làm mới Cache)"""
+        # 1. Lấy tổng số chunks hiện tại trong Qdrant
         try:
             collection_info = self.qdrant_client.get_collection(collection_name)
             current_count = collection_info.points_count
             return None
         # 2. Kiểm tra Cache: Nếu chưa có hoặc số lượng thay đổi -> Xóa cache build lại
+        with self._bm25_lock:
+            cached_data = self._bm25_cache.get(collection_name)
+            if cached_data and cached_data.get('count') == current_count:
+                # Tái sử dụng (Phải trả về cả bm25 VÀ corpus_docs để map điểm)
+                return cached_data['bm25'], cached_data['corpus_docs']
         logger.info(f"Phát hiện dữ liệu mới hoặc chưa có cache cho {collection_name} (Count: {current_count}). Đang build lại BM25...")
             bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
             # 3. Lưu lại Cache kèm the con số count và corpus_docs để đối chiếu lần sau
+            with self._bm25_lock:
+                self._bm25_cache[collection_name] = {
+                    'bm25': bm25,
+                    'corpus_docs': corpus_docs,
+                    'count': current_count
+                }
             logger.info("BM25 index built and cached for collection=%s (docs=%d)", collection_name, len(corpus_docs))
             return bm25, corpus_docs

rag/collection_utils.py CHANGED Viewed

@@ -2,8 +2,6 @@ import re
 from typing import Set
 _COLLECTION_SAFE_RE = re.compile(r"[^a-z0-9_]+")
-_YEAR_PATTERN = re.compile(r"(20\d{2})")
 def normalize_folder_key(folder_key: str) -> str:
     value = (folder_key or "").strip().lower()
@@ -20,10 +18,6 @@ def build_collection_name(folder_key: str, prefix: str = "rag") -> str:
     return base[:63]
-def extract_year_tokens(value: str) -> Set[str]:
-    return {token for token in _YEAR_PATTERN.findall(value or "")}
 def extract_folder_key_from_collection_name(collection_name: str, prefix: str = "rag") -> str | None:
     """
     Extract folder_key from collection name.
@@ -53,19 +47,3 @@ def collection_matches_cohort(collection_name: str, cohort_key: str, prefix: str
         return False
     return extracted.lower() == cohort_key.lower()
-def collection_matches_year(collection_name: str, year_scope: str) -> bool:
-    if not year_scope:
-        return False
-    collection_years = extract_year_tokens(collection_name)
-    target_years = extract_year_tokens(year_scope)
-    if not target_years:
-        return False
-    # For explicit ranges (e.g. 2022-2023), require all years to match.
-    if len(target_years) >= 2:
-        return target_years.issubset(collection_years)
-    return bool(collection_years.intersection(target_years))

 from typing import Set
 _COLLECTION_SAFE_RE = re.compile(r"[^a-z0-9_]+")
 def normalize_folder_key(folder_key: str) -> str:
     value = (folder_key or "").strip().lower()
     return base[:63]
 def extract_folder_key_from_collection_name(collection_name: str, prefix: str = "rag") -> str | None:
     """
     Extract folder_key from collection name.
         return False
     return extracted.lower() == cohort_key.lower()

rag/qa_pipeline.py CHANGED Viewed

@@ -17,17 +17,6 @@ MAX_CONTEXT_CHARS = 12000
 MAX_DOC_CHARS = 1800
 MAX_OUT_CHARS = 3000
-# Quản lý API Keys cho Groq và Gemini với xoay tua tự động khi gặp lỗi hoặc hết hạn
-def sanitize_for_prompt(text: str) -> str:
-    """Lọc bỏ prompt injection và PII """
-    text = re.sub(r"(?i)(ignore previous instructions|system prompt|developer message|jailbreak)", "[FILTERED_INJECTION]", text)
-    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[EMAIL]", text)
-    text = re.sub(r"\b(0\d{9}|\+84\d{9,10})\b", "[PHONE]", text)
-    text = re.sub(r"\b\d{8,12}\b", "[ID]", text)
-    return text.strip()
 def generate_standalone_query(message: str, history: List) -> str:
     """Tái tạo câu hỏi từ lịch sử """
     if not history:

 MAX_DOC_CHARS = 1800
 MAX_OUT_CHARS = 3000
 def generate_standalone_query(message: str, history: List) -> str:
     """Tái tạo câu hỏi từ lịch sử """
     if not history:

services/document_ingest_service.py CHANGED Viewed

@@ -223,8 +223,7 @@ def process_document_ingest(
         if not vectors or not vectors[0]:
             raise ValueError("Failed to create embeddings for chunks.")
-        target_collection = (collection_name or document.collection_name or QDRANT_COLLECTION or "").strip()
         if not target_collection:
             raise ValueError("Target collection is empty.")

         if not vectors or not vectors[0]:
             raise ValueError("Failed to create embeddings for chunks.")
+        target_collection = (collection_name or document.collection_name or "rag_docs" or "").strip()
         if not target_collection:
             raise ValueError("Target collection is empty.")