Spaces:

Chatbot-TLU
/

M_chatbot

Sleeping

App Files Files Community

minh-4T committed on Apr 12

Commit

628deed

1 Parent(s): 23f0b25

name khoa : supabase -> qdrant

Browse files

Files changed (5) hide show

core/collection_router_retriever.py +11 -11
core/collection_utils.py +31 -0
core/config.py +0 -7
core/qa_pipeline.py +9 -38
main.py +6 -17

core/collection_router_retriever.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import List
 from langchain_core.documents import Document as LangChainDocument
-from .collection_utils import collection_matches_year
 from .document_db import SessionLocal, list_active_collection_names
 logger = logging.getLogger(__name__)
@@ -45,18 +45,18 @@ class CollectionRouterRetriever:
         finally:
             db.close()
-    def _select_target_collections(self, year_scope: str | None) -> List[str]:
         fetch_limit = max(self.top_n_collections * 4, 12)
         active_collections = self._get_active_collections(limit=fetch_limit)
         if not active_collections:
             return []
-        normalized_year_scope = (year_scope or "").strip()
-        if normalized_year_scope:
             return [
                 collection_name
                 for collection_name in active_collections
-                if collection_matches_year(collection_name, normalized_year_scope)
             ]
         return active_collections[: self.top_n_collections]
@@ -111,15 +111,15 @@ class CollectionRouterRetriever:
         scored_docs.sort(key=lambda row: row[0], reverse=True)
         return [doc for _, doc in scored_docs]
-    def search(self, query: str, k: int = 10, alpha: float = 0.6, year_scope: str | None = None) -> List:
         if k <= 0:
             return []
         candidate_k = max(k * 4, k)
-        year_scoped = bool((year_scope or "").strip())
-        target_collections = self._select_target_collections(year_scope)
-        if year_scoped and not target_collections:
             return []
         routed_docs = self._search_target_collections(
@@ -128,7 +128,7 @@ class CollectionRouterRetriever:
             limit=candidate_k,
         )
-        if year_scoped:
             deduplicated = []
             seen = set()
             for doc in routed_docs:
@@ -148,7 +148,7 @@ class CollectionRouterRetriever:
                     query,
                     k=candidate_k,
                     alpha=alpha,
-                    year_scope=year_scope,
                 )
             except TypeError:
                 fallback_docs = self.base_retriever.search(

 from langchain_core.documents import Document as LangChainDocument
+from .collection_utils import collection_matches_cohort
 from .document_db import SessionLocal, list_active_collection_names
 logger = logging.getLogger(__name__)
         finally:
             db.close()
+    def _select_target_collections(self, cohort_key: str | None) -> List[str]:
         fetch_limit = max(self.top_n_collections * 4, 12)
         active_collections = self._get_active_collections(limit=fetch_limit)
         if not active_collections:
             return []
+        normalized_cohort = (cohort_key or "").strip()
+        if normalized_cohort:
             return [
                 collection_name
                 for collection_name in active_collections
+                if collection_matches_cohort(collection_name, normalized_cohort)
             ]
         return active_collections[: self.top_n_collections]
         scored_docs.sort(key=lambda row: row[0], reverse=True)
         return [doc for _, doc in scored_docs]
+    def search(self, query: str, k: int = 10, alpha: float = 0.6, cohort_key: str | None = None) -> List:
         if k <= 0:
             return []
         candidate_k = max(k * 4, k)
+        cohort_scoped = bool((cohort_key or "").strip())
+        target_collections = self._select_target_collections(cohort_key)
+        if cohort_scoped and not target_collections:
             return []
         routed_docs = self._search_target_collections(
             limit=candidate_k,
         )
+        if cohort_scoped:
             deduplicated = []
             seen = set()
             for doc in routed_docs:
                     query,
                     k=candidate_k,
                     alpha=alpha,
+                    cohort_key=cohort_key,
                 )
             except TypeError:
                 fallback_docs = self.base_retriever.search(

core/collection_utils.py CHANGED Viewed

@@ -24,6 +24,37 @@ def extract_year_tokens(value: str) -> Set[str]:
     return {token for token in _YEAR_PATTERN.findall(value or "")}
 def collection_matches_year(collection_name: str, year_scope: str) -> bool:
     if not year_scope:
         return False

     return {token for token in _YEAR_PATTERN.findall(value or "")}
+def extract_folder_key_from_collection_name(collection_name: str, prefix: str = "rag") -> str | None:
+    """
+    Extract folder_key from collection name.
+    E.g., 'rag_k63' -> 'k63', 'rag_2023_2024' -> '2023_2024'
+    Returns None if collection_name doesn't match the expected pattern.
+    """
+    if not collection_name:
+        return None
+    prefix_with_underscore = f"{prefix}_"
+    if collection_name.startswith(prefix_with_underscore):
+        return collection_name[len(prefix_with_underscore):]
+    return None
def collection_matches_cohort(collection_name: str, cohort_key: str, prefix: str = "rag") -> bool:
    """Tell whether a collection belongs to the given cohort.

    The folder key is read from ``collection_name`` with
    ``extract_folder_key_from_collection_name`` and compared with
    ``cohort_key``, ignoring letter case and any whitespace around
    ``cohort_key``. E.g. collection ``"rag_k63"`` with cohort ``"k63"``
    (or ``" K63 "``) -> ``True``.

    Args:
        collection_name: Name of the collection to test.
        cohort_key: Cohort identifier to look for.
        prefix: Collection-name prefix passed through to the extractor.

    Returns:
        ``True`` when the extracted folder key equals the cohort key;
        ``False`` when either is missing/blank or they differ.
    """
    # Normalise once so blank or padded keys behave predictably.
    wanted = (cohort_key or "").strip()
    if not wanted:
        return False
    folder_key = extract_folder_key_from_collection_name(collection_name, prefix)
    if not folder_key:
        return False
    # casefold() is the standard-library way to do caseless comparison.
    return folder_key.casefold() == wanted.casefold()
 def collection_matches_year(collection_name: str, year_scope: str) -> bool:
     if not year_scope:
         return False

core/config.py CHANGED Viewed

@@ -67,13 +67,6 @@ SUPABASE_SYNC_ALLOWED_IPS = [ip.strip() for ip in os.getenv('SUPABASE_SYNC_ALLOW
 SUPABASE_SYNC_ALLOW_PRIVATE_NETWORK = os.getenv('SUPABASE_SYNC_ALLOW_PRIVATE_NETWORK', 'true').strip().lower() in {'1', 'true', 'yes', 'on'}
 COLLECTION_ROUTER_TOP_N = _bounded_int_from_env('COLLECTION_ROUTER_TOP_N', 3, 1, 20)
-# Cohort to academic year mapping
-COHORT_TO_YEAR = {
-    'k65': '2023-2024',
-    'k64': '2022-2023',
-    'k63': '2021-2022',
-}
 # - Context and output limits
 MAX_CONTEXT_CHARS = int(os.getenv('MAX_CONTEXT_CHARS', '12000'))
 MAX_OUT_CHARS = int(os.getenv('MAX_OUT_CHARS', '3000'))

 SUPABASE_SYNC_ALLOW_PRIVATE_NETWORK = os.getenv('SUPABASE_SYNC_ALLOW_PRIVATE_NETWORK', 'true').strip().lower() in {'1', 'true', 'yes', 'on'}
 COLLECTION_ROUTER_TOP_N = _bounded_int_from_env('COLLECTION_ROUTER_TOP_N', 3, 1, 20)
 # - Context and output limits
 MAX_CONTEXT_CHARS = int(os.getenv('MAX_CONTEXT_CHARS', '12000'))
 MAX_OUT_CHARS = int(os.getenv('MAX_OUT_CHARS', '3000'))

core/qa_pipeline.py CHANGED Viewed

@@ -221,16 +221,16 @@ def generate_standalone_query(message: str, history: List) -> str:
     return message
-def ask_ai_improved(message: str, history: List, hybrid_retriever, year_scope: str | None = None) -> Generator[str, None, None]:
     full_response = ""
-    for delta in ask_ai_stream_delta(message, history, hybrid_retriever, year_scope=year_scope):
         full_response += delta
         if len(full_response) > MAX_OUT_CHARS:
             yield full_response[:MAX_OUT_CHARS] + "\n\n[Đã cắt bớt nội dung dài]"
             return
         yield full_response
-def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scope: str | None = None) -> Generator[str, None, None]:
     if not message.strip():
         yield " Bạn chưa nhập câu hỏi."
         return
@@ -241,12 +241,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
     logger.info(f" CÂU HỎI GỐC: {message}")
     question = generate_standalone_query(message, history)
-    # [YEAR-AWARE CHANGE] Xac dinh pham vi nam ma nguoi dung yeu cau.
-    requested_year_range, mentioned_years = detect_requested_year(f"{message}\n{question}")
-    if requested_year_range:
-        logger.info(f"Lọc theo năm học yêu cầu: {requested_year_range}")
-    elif mentioned_years:
-        logger.info(f"Lọc theo năm được nhắc tới: {sorted(mentioned_years)}")
     processed_data = analyze_and_expand_query(question)
@@ -261,12 +255,9 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
     all_docs: List = []
     seen = set()
-    # Prefer passed year_scope over detected year
-    if year_scope:
-        year_scope_hint = year_scope
-        logger.info(f"Sử dụng year_scope từ cohort: {year_scope_hint}")
-    else:
-        year_scope_hint = requested_year_range or (", ".join(sorted(mentioned_years)) if mentioned_years else None)
     for query in queries:
         #Giữ nguyên logic alpha ngành CNTT của Minh
         current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
@@ -274,7 +265,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
             query,
             k=TOP_K_RESULTS,
             alpha=current_alpha,
-            year_scope=year_scope_hint,
         )
         for doc in docs:
             content_hash = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
@@ -287,23 +278,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
         yield "Không tìm thấy thông tin liên quan trong tài liệu."
         return
-    # [YEAR-AWARE CHANGE] Lọc theo năm nhưng vẫn fallback nếu không có tài liệu đúng năm.
-    year_scope = None
-    year_filter_requested = bool(requested_year_range or mentioned_years)
-    year_filtered_docs = filter_docs_by_year(all_docs, requested_year_range, mentioned_years)
-    if year_filter_requested:
-        if year_filtered_docs:
-            if len(year_filtered_docs) != len(all_docs):
-                logger.info(f"Đã lọc theo năm: còn {len(year_filtered_docs)}/{len(all_docs)} documents")
-            all_docs = year_filtered_docs
-            if requested_year_range:
-                year_scope = requested_year_range
-            elif mentioned_years:
-                year_scope = ", ".join(sorted(mentioned_years))
-        else:
-            logger.warning("Không tìm thấy tài liệu đúng năm yêu cầu, fallback sang tập tài liệu tổng quát")
     final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
     context_parts = []
@@ -311,10 +285,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
     for doc in final_docs:
         page = doc.metadata.get('page_number', 'N/A')
         file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
-        # [YEAR-AWARE CHANGE] Gan nhan nam trong context de LLM bam dung nguon.
-        doc_year = infer_doc_academic_year(doc)
-        year_label = f"Năm {doc_year}" if doc_year != "ALL" else "Áp dụng nhiều năm"
-        source = f"[{year_label} | {os.path.basename(file_name)} | Trang {page}]" if file_name else f"[{year_label} | Trang {page}]"
         block = f"{source}\n{doc.page_content}"
         if total_chars + len(block) > MAX_CONTEXT_CHARS:
             break
@@ -324,7 +295,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, year_scop
     context = "\n\n---\n\n".join(context_parts)
     topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
-    prompt = create_advanced_prompt(question, context, question_type, topic_hint, year_scope=year_scope)
     logger.info("Đang tạo câu trả lời cuối cùng ...")

     return message
+def ask_ai_improved(message: str, history: List, hybrid_retriever, cohort_key: str | None = None) -> Generator[str, None, None]:
     full_response = ""
+    for delta in ask_ai_stream_delta(message, history, hybrid_retriever, cohort_key=cohort_key):
         full_response += delta
         if len(full_response) > MAX_OUT_CHARS:
             yield full_response[:MAX_OUT_CHARS] + "\n\n[Đã cắt bớt nội dung dài]"
             return
         yield full_response
+def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_key: str | None = None) -> Generator[str, None, None]:
     if not message.strip():
         yield " Bạn chưa nhập câu hỏi."
         return
     logger.info(f" CÂU HỎI GỐC: {message}")
     question = generate_standalone_query(message, history)
     processed_data = analyze_and_expand_query(question)
     all_docs: List = []
     seen = set()
+    if cohort_key:
+        logger.info(f"Sử dụng cohort_key: {cohort_key}")
     for query in queries:
         #Giữ nguyên logic alpha ngành CNTT của Minh
         current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
             query,
             k=TOP_K_RESULTS,
             alpha=current_alpha,
+            cohort_key=cohort_key,
         )
         for doc in docs:
             content_hash = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
         yield "Không tìm thấy thông tin liên quan trong tài liệu."
         return
     final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
     context_parts = []
     for doc in final_docs:
         page = doc.metadata.get('page_number', 'N/A')
         file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
+        source = f"[{os.path.basename(file_name)} | Trang {page}]" if file_name else f"[Trang {page}]"
         block = f"{source}\n{doc.page_content}"
         if total_chars + len(block) > MAX_CONTEXT_CHARS:
             break
     context = "\n\n---\n\n".join(context_parts)
     topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
+    prompt = create_advanced_prompt(question, context, question_type, topic_hint)
     logger.info("Đang tạo câu trả lời cuối cùng ...")

main.py CHANGED Viewed

@@ -14,7 +14,6 @@ from qdrant_client import QdrantClient
 #Import các model và các hàm cần thiết từ core
 from core.config import (
     COLLECTION_ROUTER_TOP_N,
-    COHORT_TO_YEAR,
     DATABASE_URL,
     QDRANT_API_KEY,
     QDRANT_URL,
@@ -344,20 +343,15 @@ async def chat_endpoint(payload: ChatRequest, request: Request):
     user_id = payload.user_id # Lấy user_id từ request
     cohort_key = payload.cohort_key  # Lấy cohort_key từ request
-    # Convert cohort_key to year_scope for collection routing
-    year_scope = None
-    if cohort_key and cohort_key in COHORT_TO_YEAR:
-        year_scope = COHORT_TO_YEAR[cohort_key]
-        logger.info(f"Sử dụng cohort: {cohort_key} -> năm học: {year_scope}")
-    elif cohort_key:
-        logger.warning(f"Cohort không hợp lệ: {cohort_key}")
     history = await get_history_async(db_pool, session_id)
     # Tập hợp toàn bộ response từ generator
     full_response = ""
     try:
-        async for chunk in iterate_in_threadpool(ask_ai_improved(user_msg, history, retriever, year_scope=year_scope)):
             full_response = chunk
     except Exception:
         logger.exception("Lỗi khi xử lý phản hồi từ AI:", exc_info=True)
@@ -381,13 +375,8 @@ async def chat_stream_endpoint(payload: ChatRequest, request: Request):
     user_id = payload.user_id # Lấy user_id từ request
     cohort_key = payload.cohort_key  # Lấy cohort_key từ request
-    # Convert cohort_key to year_scope for collection routing
-    year_scope = None
-    if cohort_key and cohort_key in COHORT_TO_YEAR:
-        year_scope = COHORT_TO_YEAR[cohort_key]
-        logger.info(f"Sử dụng cohort: {cohort_key} -> năm học: {year_scope}")
-    elif cohort_key:
-        logger.warning(f"Cohort không hợp lệ: {cohort_key}")
     history = await get_history_async(db_pool, session_id)
@@ -396,7 +385,7 @@ async def chat_stream_endpoint(payload: ChatRequest, request: Request):
         full_response = ""
         try:
             # ask_ai_stream_delta yield từng delta chunk (không cumulative)
-            async for delta_chunk in iterate_in_threadpool(ask_ai_stream_delta(user_msg, history, retriever, year_scope=year_scope)):
                 full_response += delta_chunk
                 # Gửi SSE event với delta chunk
                 sse_data = json.dumps({"delta": delta_chunk, "done": False}, ensure_ascii=False)

 #Import các model và các hàm cần thiết từ core
 from core.config import (
     COLLECTION_ROUTER_TOP_N,
     DATABASE_URL,
     QDRANT_API_KEY,
     QDRANT_URL,
     user_id = payload.user_id # Lấy user_id từ request
     cohort_key = payload.cohort_key  # Lấy cohort_key từ request
+    if cohort_key:
+        logger.info(f"Sử dụng cohort: {cohort_key}")
     history = await get_history_async(db_pool, session_id)
     # Tập hợp toàn bộ response từ generator
     full_response = ""
     try:
+        async for chunk in iterate_in_threadpool(ask_ai_improved(user_msg, history, retriever, cohort_key=cohort_key)):
             full_response = chunk
     except Exception:
         logger.exception("Lỗi khi xử lý phản hồi từ AI:", exc_info=True)
     user_id = payload.user_id # Lấy user_id từ request
     cohort_key = payload.cohort_key  # Lấy cohort_key từ request
+    if cohort_key:
+        logger.info(f"Sử dụng cohort: {cohort_key}")
     history = await get_history_async(db_pool, session_id)
         full_response = ""
         try:
             # ask_ai_stream_delta yield từng delta chunk (không cumulative)
+            async for delta_chunk in iterate_in_threadpool(ask_ai_stream_delta(user_msg, history, retriever, cohort_key=cohort_key)):
                 full_response += delta_chunk
                 # Gửi SSE event với delta chunk
                 sse_data = json.dumps({"delta": delta_chunk, "done": False}, ensure_ascii=False)