minh-4T commited on
Commit
89c8b6a
·
1 Parent(s): 31fbcc3

unroll table and project restructuring

Browse files
Files changed (34) hide show
  1. api/chat_api_routers.py +0 -0
  2. core/ai_provider.py +38 -0
  3. core/chunking.py +0 -63
  4. core/llm_utils.py +0 -31
  5. core/retriever.py +0 -73
  6. data/Sổ tay sinh viên 2023-2024/0. Mục lục Sổ tay sinh viên K65.docx +0 -3
  7. data/Sổ tay sinh viên 2023-2024/1. QĐ-1226-Quy che dao tao dai hoc-DHTL (ban hanh).docx +0 -3
  8. data/Sổ tay sinh viên 2023-2024/10. QĐ 1089 thi OLP môn học (Final 10-5-2023).docx +0 -3
  9. data/Sổ tay sinh viên 2023-2024/11. QĐ về Học phí final (25-10-2021).docx +0 -3
  10. data/Sổ tay sinh viên 2023-2024/12. QD ngoại trú.docx +0 -3
  11. data/Sổ tay sinh viên 2023-2024/2. QĐ về tiếng anh CTTT.300921.QD.1315.docx +0 -3
  12. data/Sổ tay sinh viên 2023-2024/3. QD1767.TA tăng cường ban hanh.docx +0 -3
  13. data/Sổ tay sinh viên 2023-2024/4. QD411_QD_DHTL-Chuan_Dau_Ra_CNTT.pdf +0 -3
  14. data/Sổ tay sinh viên 2023-2024/4.1. QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf +0 -3
  15. data/Sổ tay sinh viên 2023-2024/5. QD_1038_16.07.2021_GDTC.docx +0 -3
  16. data/Sổ tay sinh viên 2023-2024/6. Quy định về tổ chức thi trực tuyến.docx +0 -3
  17. data/Sổ tay sinh viên 2023-2024/7. QĐ đánh giá KQRL (Final 18-8-2016).docx +0 -3
  18. data/Sổ tay sinh viên 2023-2024/8. QĐ ve HBKKHT, HBCS (final 12-5-2021).docx +0 -3
  19. data/Sổ tay sinh viên 2023-2024/9. QĐ Khen thưởng - KL (Final 10-8-2016).docx +0 -3
  20. {core → database}/document_db.py +1 -1
  21. main.py +6 -6
  22. {core → rag}/analyze_and_expand.py +1 -2
  23. rag/chunking.py +38 -0
  24. {core → rag}/collection_router_retriever.py +120 -131
  25. {core → rag}/collection_utils.py +0 -0
  26. {core → rag}/models.py +1 -1
  27. {core → rag}/prompting.py +1 -14
  28. {core → rag}/qa_pipeline.py +9 -114
  29. {core → rag}/rerank.py +0 -0
  30. {core → rag}/vectorstore.py +57 -47
  31. requirements.txt +2 -1
  32. {core → services}/document_ingest_service.py +7 -11
  33. {core → services}/supabase_sync_service.py +3 -4
  34. {core → utils}/text_utils.py +4 -2
api/chat_api_routers.py DELETED
File without changes
core/ai_provider.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import threading
4
+ import groq
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class AIProviderManager:
9
+ def __init__(self):
10
+ # Lấy danh sách keys
11
+ self.groq_keys = [k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(",") if k.strip()]
12
+ self.gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(",") if k.strip()]
13
+ self.groq_idx = 0
14
+ self.gemini_idx = 0
15
+ self._lock = threading.Lock() # Đảm bảo Thread-Safe khi có nhiều Request cùng lúc
16
+
17
+ def get_groq_client(self):
18
+ if not self.groq_keys: return None
19
+ # Chỉ lấy key, không thay đổi state nên không cần lock
20
+ return groq.Groq(api_key=self.groq_keys[self.groq_idx])
21
+
22
+ def rotate_groq(self):
23
+ with self._lock: # Khóa luồng khi xoay tua để tránh xung đột
24
+ if len(self.groq_keys) > 1:
25
+ self.groq_idx = (self.groq_idx + 1) % len(self.groq_keys)
26
+ logger.info(f"Đã xoay sang Groq Key thứ {self.groq_idx + 1}")
27
+
28
+ def get_gemini_key(self):
29
+ if not self.gemini_keys: return None
30
+ return self.gemini_keys[self.gemini_idx]
31
+
32
+ def rotate_gemini(self):
33
+ with self._lock:
34
+ if len(self.gemini_keys) > 1:
35
+ self.gemini_idx = (self.gemini_idx + 1) % len(self.gemini_keys)
36
+ logger.info("Đã xoay sang Gemini Key dự phòng")
37
+
38
+ api_manager = AIProviderManager()
core/chunking.py DELETED
@@ -1,63 +0,0 @@
1
- import re
2
- from typing import List
3
- from langchain_text_splitters import RecursiveCharacterTextSplitter
4
- from .config import CHUNK_SIZE, CHUNK_OVERLAP
5
-
6
- def extract_and_protect_tables(text: str) -> tuple[str, dict]:
7
- """Tìm và bọc các bảng Markdown để bảo vệ chúng khỏi việc bị cắt gãy."""
8
- # Pattern tìm bảng Markdown (các dòng bắt đầu và chứa ký tự | liên tiếp)
9
- table_pattern = re.compile(r'(?:\|.*\|[\r\n]+)+')
10
- tables = {}
11
-
12
- def replace_table(match):
13
- table_id = f"<TABLE_{len(tables)}>"
14
- tables[table_id] = match.group(0)
15
- return f"\n{table_id}\n"
16
-
17
- protected_text = re.sub(table_pattern, replace_table, text)
18
- return protected_text, tables
19
-
20
- def smart_chunking(docs: List) -> List:
21
- print("Đang áp dụng Smart Chunking (Bảo toàn Bảng & Danh sách)...")
22
- legal_splitter = RecursiveCharacterTextSplitter(
23
- chunk_size=CHUNK_SIZE,
24
- chunk_overlap=CHUNK_OVERLAP,
25
- separators=[
26
- "\nĐiều ", "\nChương ", "\nMục ", "\nKhoản ",
27
- "\n\n", "\n", ". ", " ", ""
28
- ],
29
- length_function=len,
30
- is_separator_regex=False
31
- )
32
-
33
- chunks = []
34
- for doc in docs:
35
- # 1. Bảo vệ List đang có
36
- protected_text = doc.page_content.replace('\na.', '<LIST_a>') \
37
- .replace('\nb.', '<LIST_b>') \
38
- .replace('\nc.', '<LIST_c>')
39
-
40
- # 2. Bảo vệ Table
41
- protected_text, tables = extract_and_protect_tables(protected_text)
42
-
43
- # 3. Tiến hành cắt
44
- doc_chunks = legal_splitter.split_text(protected_text)
45
-
46
- # 4. Phục hồi dữ liệu
47
- for chunk_text in doc_chunks:
48
- restored = chunk_text.replace('<LIST_a>', '\na.') \
49
- .replace('<LIST_b>', '\nb.') \
50
- .replace('<LIST_c>', '\nc.')
51
-
52
- for table_id, table_content in tables.items():
53
- if table_id in restored:
54
- restored = restored.replace(table_id, table_content)
55
-
56
- new_doc = type(doc)(
57
- page_content=restored,
58
- metadata=doc.metadata.copy()
59
- )
60
- chunks.append(new_doc)
61
-
62
- print(f" Đã tạo {len(chunks)} chunks thông minh (giữ nguyên cấu trúc bảng)")
63
- return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/llm_utils.py DELETED
@@ -1,31 +0,0 @@
1
- import time
2
- import logging
3
- from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
4
-
5
- logger = logging.getLogger(__name__)
6
-
7
- def safe_stream(llm, prompt) :
8
- try:
9
- for chunk in llm.stream(prompt):
10
- text = getattr(chunk, "content", str(chunk))
11
- if text:
12
- yield text
13
- except Exception :
14
- logger.exception("Lỗi khi stream LLM:")
15
- yield "Lỗi khi stream LLM "
16
- def safe_invoke(llm ,prompt : str, timeout : int =30, retries: int =2):
17
- last_error = None
18
- for attempt in range(1, retries+1):
19
- try:
20
- with ThreadPoolExecutor(max_workers=1) as pool:
21
- fut = pool.submit(llm.invoke, prompt)
22
- return fut.result(timeout=timeout)
23
- except FuturesTimeoutError as e:
24
- last_error = e
25
- logger.warning(f" Lần {attempt}: LLM invoke timeout sau {timeout} giây. Đang thử lại...")
26
- except Exception as e:
27
- last_error = e
28
- logger.error(f"Lần {attempt}: Lỗi khi gọi LLM: {e}. Đang thử lại...")
29
- time.sleep(0.6*attempt)
30
-
31
- raise RuntimeError (f"LLM failed after {retries} attempts: {last_error}") # Thêm delay nhỏ trước khi thử lại
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/retriever.py DELETED
@@ -1,73 +0,0 @@
1
- from typing import List
2
- import hashlib
3
- from rank_bm25 import BM25Okapi
4
-
5
- class HybridRetriever:
6
- """Kết hợp BM25 và Vector Search."""
7
- def __init__(self, vectorstore, documents):
8
- self.vectorstore = vectorstore
9
- self.documents = documents
10
- print(" Đang khởi tạo BM25...")
11
- tokenized_docs = [doc.page_content.lower().split() for doc in documents]
12
- self.bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
13
- self.rrf_c = 60
14
- print(" BM25 sẵn sàng!")
15
-
16
- @staticmethod
17
- def _doc_key(doc) -> str:
18
- metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
19
- source = str(metadata.get("source_relpath") or metadata.get("source_file") or metadata.get("source") or "")
20
- page = str(metadata.get("page_number") or metadata.get("page") or "")
21
- content = (doc.page_content or "").strip()
22
- digest = hashlib.sha1(content.encode("utf-8")).hexdigest() if content else "empty"
23
- return f"{source}|{page}|{digest}"
24
-
25
- def search(self, query: str, k: int = 10, alpha: float = 0.6, year_scope: str | None = None) -> List:
26
- del year_scope
27
- if not self.documents or k <= 0:
28
- return []
29
-
30
- alpha = max(0.0, min(1.0, float(alpha)))
31
- bm25_weight = 1.0 - alpha
32
- vector_weight = alpha
33
-
34
- # Lấy top k từ BM25
35
- tokenized_query = query.lower().split()
36
- candidate_k = min(max(k * 4, k), len(self.documents))
37
- bm25_top_docs = self.bm25.get_top_n(tokenized_query, self.documents, n=candidate_k)
38
-
39
- bm25_ranked = {}
40
- all_retrieved = {}
41
- for rank, doc in enumerate(bm25_top_docs, 1):
42
- key = self._doc_key(doc)
43
- bm25_ranked[key] = rank
44
- all_retrieved[key] = doc
45
-
46
- # Lấy top k từ Vector
47
- try:
48
- vector_results = self.vectorstore.similarity_search(query, k=candidate_k)
49
- except Exception as e:
50
- print(f"Lỗi Vector Search: {e}")
51
- return [doc for doc in bm25_top_docs[:k]]
52
-
53
- vector_ranked = {}
54
- for rank, doc in enumerate(vector_results, 1):
55
- key = self._doc_key(doc)
56
- vector_ranked[key] = rank
57
- all_retrieved[key] = doc
58
-
59
- rrf_results = []
60
-
61
- for content, doc in all_retrieved.items():
62
- score = 0.0
63
- if content in bm25_ranked:
64
- score += bm25_weight / (self.rrf_c + bm25_ranked[content])
65
- if content in vector_ranked:
66
- score += vector_weight / (self.rrf_c + vector_ranked[content])
67
-
68
- if score > 0:
69
- rrf_results.append((score, doc))
70
-
71
- rrf_results.sort(key=lambda x: x[0], reverse=True)
72
- return [doc for score, doc in rrf_results[:k]]
73
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/Sổ tay sinh viên 2023-2024/0. Mục lục Sổ tay sinh viên K65.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:725638f2ebed983fb38354fa9d2073937a4df89418b4c4c6958c68f29c868113
3
- size 14530
 
 
 
 
data/Sổ tay sinh viên 2023-2024/1. QĐ-1226-Quy che dao tao dai hoc-DHTL (ban hanh).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aaf33dda956f69966bbe64864432618068eaded7d3016644c4acfe8889f4dd5a
3
- size 120225
 
 
 
 
data/Sổ tay sinh viên 2023-2024/10. QĐ 1089 thi OLP môn học (Final 10-5-2023).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8616d772baf6ff2225f2163e12a88418f2a6b46c1770fcd7d306da952ec3a94e
3
- size 52007
 
 
 
 
data/Sổ tay sinh viên 2023-2024/11. QĐ về Học phí final (25-10-2021).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:17a91329f9e31308df57a53e36a69665fe37f3aea37fc32bc69a7fe984fbf3c5
3
- size 56350
 
 
 
 
data/Sổ tay sinh viên 2023-2024/12. QD ngoại trú.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:06e7018cfa51e785f90009a66a2dc442a17f55ae52f5f28ddce208e61a6ba7d6
3
- size 26065
 
 
 
 
data/Sổ tay sinh viên 2023-2024/2. QĐ về tiếng anh CTTT.300921.QD.1315.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7b27a9545255d7d69158cc8e8d42d560f00c6865f665317619822159af7e90c
3
- size 40894
 
 
 
 
data/Sổ tay sinh viên 2023-2024/3. QD1767.TA tăng cường ban hanh.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dc5e1e6aef63069f673606c5083fb9dab484fa1b9cb10f955121bc4042c7131
3
- size 37167
 
 
 
 
data/Sổ tay sinh viên 2023-2024/4. QD411_QD_DHTL-Chuan_Dau_Ra_CNTT.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:544fe7b5b71f4fb407175fe0bcdc8853e0943eb410a4947b2ac8d891f359619b
3
- size 369669
 
 
 
 
data/Sổ tay sinh viên 2023-2024/4.1. QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:17ebf4761e56b3d3336cfeaeab86bee4de7614fd2c27c20d848a0fed7650abfa
3
- size 498348
 
 
 
 
data/Sổ tay sinh viên 2023-2024/5. QD_1038_16.07.2021_GDTC.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee1a9d836579a08c6cbde24f63f9c8eb64aa70f52cfeec4c5e1c6ed789993da3
3
- size 47518
 
 
 
 
data/Sổ tay sinh viên 2023-2024/6. Quy định về tổ chức thi trực tuyến.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:80c185d0fdb75571f7f9fafd974e4ac4a4006a890ca3ce975ca7ca70181848a3
3
- size 65814
 
 
 
 
data/Sổ tay sinh viên 2023-2024/7. QĐ đánh giá KQRL (Final 18-8-2016).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4905773937cd8d12a8d24224146dc1442418ed3a57b60e3348a0e1b8eeb5ea2b
3
- size 74575
 
 
 
 
data/Sổ tay sinh viên 2023-2024/8. QĐ ve HBKKHT, HBCS (final 12-5-2021).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:da1cd4f217a697635a5dae85285f46aa9a19f217f3fd03c981a9f47d79e31f2d
3
- size 30512
 
 
 
 
data/Sổ tay sinh viên 2023-2024/9. QĐ Khen thưởng - KL (Final 10-8-2016).docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46e6bdba5671d6aabfda753efa4010b8fb828d2c9180ab58882d7361c0200c67
3
- size 76052
 
 
 
 
{core → database}/document_db.py RENAMED
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
7
  from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text, create_engine, func, inspect, or_, text
8
  from sqlalchemy.orm import Session, declarative_base, relationship, sessionmaker
9
 
10
- from .config import DOCUMENTS_DATABASE_URL
11
 
12
  Base = declarative_base()
13
  logger = logging.getLogger(__name__)
 
7
  from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text, create_engine, func, inspect, or_, text
8
  from sqlalchemy.orm import Session, declarative_base, relationship, sessionmaker
9
 
10
+ from core.config import DOCUMENTS_DATABASE_URL
11
 
12
  Base = declarative_base()
13
  logger = logging.getLogger(__name__)
main.py CHANGED
@@ -26,12 +26,12 @@ from core.config import (
26
  SUPABASE_SYNC_SNAPSHOT_FILE,
27
  SUPABASE_URL,
28
  )
29
- from core.document_db import init_document_db
30
- from core.supabase_sync_service import SupabaseStorageSyncService, SupabaseSyncCoordinator
31
- from core.collection_router_retriever import CollectionRouterRetriever
32
- from core.vectorstore import build_vectorstore_improved, load_vectorstore_improved
33
- from core.models import embeddings
34
- from core.qa_pipeline import ask_ai_improved, ask_ai_stream_delta
35
  from api.admin_sync_router import router as admin_sync_router
36
 
37
  # Hàm log lỗi an toàn
 
26
  SUPABASE_SYNC_SNAPSHOT_FILE,
27
  SUPABASE_URL,
28
  )
29
+ from database.document_db import init_document_db
30
+ from services.supabase_sync_service import SupabaseStorageSyncService, SupabaseSyncCoordinator
31
+ from rag.collection_router_retriever import CollectionRouterRetriever
32
+ from rag.vectorstore import build_vectorstore_improved, load_vectorstore_improved
33
+ from rag.models import embeddings
34
+ from rag.qa_pipeline import ask_ai_improved, ask_ai_stream_delta
35
  from api.admin_sync_router import router as admin_sync_router
36
 
37
  # Hàm log lỗi an toàn
{core → rag}/analyze_and_expand.py RENAMED
@@ -21,8 +21,7 @@ def clean_json_string(text: str) -> str:
21
  def analyze_and_expand_query(question: str) -> Dict[str, Any]:
22
  print(" Phân tích & Mở rộng câu hỏi...")
23
 
24
- # Import cục bộ để tránh lỗi vòng lặp import (circular import) với qa_pipeline
25
- from .qa_pipeline import api_manager
26
 
27
  # Prompt được tối ưu để ép AI trả về JSON chuẩn
28
  prompt = f"""
 
21
  def analyze_and_expand_query(question: str) -> Dict[str, Any]:
22
  print(" Phân tích & Mở rộng câu hỏi...")
23
 
24
+ from core.ai_provider import api_manager
 
25
 
26
  # Prompt được tối ưu để ép AI trả về JSON chuẩn
27
  prompt = f"""
rag/chunking.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from core.config import CHUNK_SIZE, CHUNK_OVERLAP
4
+
5
+ def smart_chunking(docs: List) -> List:
6
+ print("Đang áp dụng Smart Chunking (Regex Lookahead)...")
7
+
8
+ # Cấu hình Regex bắt cấu trúc phân cấp hành chính (Chương -> Điều -> Khoản)
9
+ legal_splitter = RecursiveCharacterTextSplitter(
10
+ chunk_size=CHUNK_SIZE,
11
+ chunk_overlap=CHUNK_OVERLAP,
12
+ separators=[
13
+ "\nChương ",
14
+ "\nĐiều ",
15
+ "\nKhoản ",
16
+ "\n\n",
17
+ r"\n(?=\d+\.)",
18
+ r"\n(?=[a-z]\.)",
19
+ r"\n(?=-|\+)",
20
+ "\n", " ", ""
21
+ ],
22
+ length_function=len,
23
+ is_separator_regex=True
24
+ )
25
+
26
+ chunks = []
27
+ for doc in docs:
28
+ doc_chunks = legal_splitter.split_text(doc.page_content)
29
+
30
+ for chunk_text in doc_chunks:
31
+ new_doc = type(doc)(
32
+ page_content=chunk_text,
33
+ metadata=doc.metadata.copy()
34
+ )
35
+ chunks.append(new_doc)
36
+
37
+ print(f"Đã tạo {len(chunks)} chunks thông minh.")
38
+ return chunks
{core → rag}/collection_router_retriever.py RENAMED
@@ -5,25 +5,39 @@ from typing import List
5
  from langchain_core.documents import Document as LangChainDocument
6
  from rank_bm25 import BM25Okapi
7
 
 
 
 
 
 
 
8
  from .collection_utils import collection_matches_cohort
9
- from .document_db import SessionLocal, list_active_collection_names
10
 
11
  logger = logging.getLogger(__name__)
12
 
13
 
 
 
 
 
 
 
 
 
 
14
  class CollectionRouterRetriever:
15
  def __init__(
16
  self,
17
- base_retriever,
18
- qdrant_client,
19
  embeddings_model,
20
  top_n_collections: int = 3,
21
  ) -> None:
22
- self.base_retriever = base_retriever
23
  self.qdrant_client = qdrant_client
24
  self.embeddings_model = embeddings_model
25
  self.top_n_collections = max(1, int(top_n_collections or 3))
26
- self._bm25_cache = {} # {collection_name -> BM25Okapi instance}
 
27
 
28
  @staticmethod
29
  def _doc_key(doc) -> str:
@@ -63,63 +77,91 @@ class CollectionRouterRetriever:
63
 
64
  return active_collections[: self.top_n_collections]
65
 
66
- def _ensure_bm25_loaded(self, collection_name: str) -> BM25Okapi | None:
67
- """Lazy load and cache BM25 index for a collection.
68
-
69
- First time: fetch all docs from Qdrant, build BM25, cache it (~0.3s)
70
- Subsequent times: reuse from cache (~0.001s)
71
- """
72
- # Check if already cached
73
- if collection_name in self._bm25_cache:
74
- return self._bm25_cache[collection_name]
75
 
 
76
  try:
77
- # Fetch ALL documents from collection (no query vector, get full corpus)
78
- all_points = self.qdrant_client.scroll(
79
- collection_name=collection_name,
80
- limit=10000, # Batch size
81
- )
 
 
 
 
 
 
82
 
83
- points_list = all_points[0] if isinstance(all_points, tuple) else all_points
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  if not points_list:
86
  logger.warning("No documents found in collection=%s for BM25 indexing", collection_name)
87
  return None
88
 
89
- # Filter out None values
90
- points_list = [p for p in points_list if p is not None]
91
- if not points_list:
92
- logger.warning("No valid points found in collection=%s after filtering", collection_name)
93
- return None
94
-
95
- # Extract documents and tokenize for BM25
96
- docs_for_bm25 = []
97
  for point in points_list:
98
  payload = point.payload if isinstance(point.payload, dict) else {}
99
  content = str(payload.get("content") or "").strip()
100
  if content:
101
- docs_for_bm25.append(content)
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- if not docs_for_bm25:
104
  logger.warning("No valid content found in collection=%s for BM25 indexing", collection_name)
105
  return None
106
 
107
- # Build BM25 index
108
- tokenized_docs = [doc.lower().split() for doc in docs_for_bm25]
109
  bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
110
 
111
- # Cache it
112
- self._bm25_cache[collection_name] = bm25
113
- logger.info("BM25 index built and cached for collection=%s (docs=%d)", collection_name, len(docs_for_bm25))
 
 
 
 
114
 
115
- return bm25
116
 
117
  except Exception:
118
  logger.exception("Failed to build BM25 index for collection=%s", collection_name)
119
  return None
120
 
121
  def _search_target_collections(self, query: str, collections: List[str], limit: int, alpha: float = 0.6) -> List:
122
- """Hybrid search: BM25 + Vector + RRF (Option 2 with cached BM25)"""
123
  if not collections:
124
  return []
125
 
@@ -129,9 +171,9 @@ class CollectionRouterRetriever:
129
  logger.exception("Failed to embed query for collection routing")
130
  return []
131
 
132
- # Step 1: Vector search (từ Qdrant)
133
- all_docs_dict = {} # {doc_key -> LangChainDocument}
134
- vector_ranked = {} # {doc_key -> rank}
135
 
136
  vector_rank = 0
137
  for collection_name in collections:
@@ -171,60 +213,45 @@ class CollectionRouterRetriever:
171
  vector_rank += 1
172
  vector_ranked[doc_key] = vector_rank
173
 
174
- # Step 2: BM25 search (lexical) - using CACHED index
175
- bm25_ranked = {} # {doc_key -> rank}
176
- if all_docs_dict:
177
- try:
178
- # Validate query is not empty
179
- if not query.strip():
180
- logger.warning("Query is empty, skipping BM25 search")
181
- else:
182
- tokenized_query = query.lower().split()
183
-
184
- # For each collection, use cached BM25 index
185
  for collection_name in collections:
186
- # Load cached BM25 (or build if first time)
187
- bm25 = self._ensure_bm25_loaded(collection_name)
188
- if bm25 is None:
189
  continue
190
-
191
- # Get BM25 scores for vector results
192
- docs_from_collection = [
193
- doc for doc in all_docs_dict.values()
194
- if doc.metadata.get("collection_name") == collection_name
195
- ]
196
-
197
- if not docs_from_collection:
198
- continue
199
-
200
- # Extract content strings for BM25 scoring
201
- content_for_bm25 = [doc.page_content for doc in docs_from_collection]
202
-
203
- # Build BM25 index for this subset and score
204
- if content_for_bm25:
205
- tokenized_subset = [content.lower().split() for content in content_for_bm25]
206
- bm25_subset = BM25Okapi(tokenized_subset, k1=1.5, b=0.5)
207
- bm25_results = bm25_subset.get_top_n(tokenized_query, content_for_bm25, n=len(content_for_bm25))
208
 
209
- bm25_rank = 0
210
- for content in bm25_results: # bm25_results contains strings
211
- # Find matching doc by content (handles duplicates)
212
- matched_doc = None
213
- for doc in docs_from_collection:
214
- if doc.page_content == content:
215
- matched_doc = doc
216
- break
217
 
218
- if matched_doc:
219
- doc_key = self._doc_key(matched_doc)
220
- if doc_key not in bm25_ranked:
221
- bm25_rank += 1
222
- bm25_ranked[doc_key] = bm25_rank
223
 
224
- except Exception:
225
- logger.exception("BM25 search failed, falling back to vector-only")
 
 
 
226
 
227
- # Step 3: RRF combination (Reciprocal Rank Fusion)
228
  alpha = max(0.0, min(1.0, float(alpha)))
229
  bm25_weight = 1.0 - alpha
230
  vector_weight = alpha
@@ -234,18 +261,15 @@ class CollectionRouterRetriever:
234
  for doc_key, doc in all_docs_dict.items():
235
  score = 0.0
236
 
237
- # Vector score
238
  if doc_key in vector_ranked:
239
  score += vector_weight / (rrf_c + vector_ranked[doc_key])
240
 
241
- # BM25 score
242
  if doc_key in bm25_ranked:
243
  score += bm25_weight / (rrf_c + bm25_ranked[doc_key])
244
 
245
  if score > 0:
246
  rrf_scores[doc_key] = score
247
 
248
- # Sort by RRF score
249
  sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
250
  return [all_docs_dict[doc_key] for doc_key, _ in sorted_results[:limit]]
251
 
@@ -267,54 +291,19 @@ class CollectionRouterRetriever:
267
  alpha=alpha,
268
  )
269
 
270
- # Log warning if no documents found
271
  if not routed_docs:
272
  logger.warning("No documents found for query=%s, cohort=%s", query[:50], cohort_key)
273
-
274
- if cohort_scoped:
275
- deduplicated = []
276
- seen = set()
277
- for doc in routed_docs:
278
- key = self._doc_key(doc)
279
- if key in seen:
280
- continue
281
- seen.add(key)
282
- deduplicated.append(doc)
283
- if len(deduplicated) >= candidate_k:
284
- break
285
- return deduplicated[:k]
286
-
287
- fallback_docs = []
288
- if self.base_retriever is not None:
289
- try:
290
- fallback_docs = self.base_retriever.search(
291
- query,
292
- k=candidate_k,
293
- alpha=alpha,
294
- cohort_key=cohort_key,
295
- )
296
- except TypeError:
297
- fallback_docs = self.base_retriever.search(
298
- query,
299
- k=candidate_k,
300
- alpha=alpha,
301
- )
302
- except Exception:
303
- logger.exception("Base retriever fallback failed")
304
 
305
  deduplicated = []
306
  seen = set()
307
-
308
- # Safe handling of fallback_docs which might be None
309
- fallback_docs_list = list(fallback_docs) if fallback_docs else []
310
-
311
- for doc in routed_docs + fallback_docs_list:
312
  key = self._doc_key(doc)
313
  if key in seen:
314
  continue
315
  seen.add(key)
316
  deduplicated.append(doc)
317
- if len(deduplicated) >= candidate_k:
318
  break
319
-
320
- return deduplicated[:k]
 
5
  from langchain_core.documents import Document as LangChainDocument
6
  from rank_bm25 import BM25Okapi
7
 
8
+
9
+ try:
10
+ from pyvi import ViTokenizer
11
+ except Exception:
12
+ ViTokenizer = None
13
+
14
  from .collection_utils import collection_matches_cohort
15
+ from database.document_db import SessionLocal, list_active_collection_names
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
 
20
+ def _vi_tokenize(text: str) -> List[str]:
21
+ normalized = (text or "").lower().strip()
22
+ if not normalized:
23
+ return []
24
+ if ViTokenizer is None:
25
+ return normalized.split()
26
+ return ViTokenizer.tokenize(normalized).split()
27
+
28
+
29
  class CollectionRouterRetriever:
30
  def __init__(
31
  self,
32
+ qdrant_client,
 
33
  embeddings_model,
34
  top_n_collections: int = 3,
35
  ) -> None:
 
36
  self.qdrant_client = qdrant_client
37
  self.embeddings_model = embeddings_model
38
  self.top_n_collections = max(1, int(top_n_collections or 3))
39
+ # Cache giờ đây lưu một dict: { 'bm25': obj, 'corpus_docs': list, 'count': int }
40
+ self._bm25_cache = {}
41
 
42
  @staticmethod
43
  def _doc_key(doc) -> str:
 
77
 
78
  return active_collections[: self.top_n_collections]
79
 
80
+ def _ensure_bm25_loaded(self, collection_name: str) -> tuple[BM25Okapi, List[LangChainDocument]] | None:
81
+ """Lazy load and cache BM25 index and corpus for a collection (với cơ chế tự động làm mới Cache)"""
 
 
 
 
 
 
 
82
 
83
+ # 1. Lấy tổng số chunks hiện tại trong Qdrant (Rất nhanh, tốn < 10ms)
84
  try:
85
+ collection_info = self.qdrant_client.get_collection(collection_name)
86
+ current_count = collection_info.points_count
87
+ except Exception:
88
+ logger.exception("Failed to get collection info for %s", collection_name)
89
+ return None
90
+
91
+ # 2. Kiểm tra Cache: Nếu chưa có hoặc số lượng thay đổi -> Xóa cache build lại
92
+ cached_data = self._bm25_cache.get(collection_name)
93
+ if cached_data and cached_data.get('count') == current_count:
94
+ # Tái sử dụng (Phải trả về cả bm25 VÀ corpus_docs để map điểm)
95
+ return cached_data['bm25'], cached_data['corpus_docs']
96
 
97
+ logger.info(f"Phát hiện dữ liệu mới hoặc chưa có cache cho {collection_name} (Count: {current_count}). Đang build lại BM25...")
98
+
99
+ try:
100
+ points_list = []
101
+ offset = None
102
+ # Phân trang để lấy TOÀN BỘ documents từ collection
103
+ while True:
104
+ response = self.qdrant_client.scroll(
105
+ collection_name=collection_name,
106
+ limit=10000,
107
+ offset=offset,
108
+ with_payload=True,
109
+ with_vectors=False
110
+ )
111
+ batch_points, next_offset = response
112
+ points_list.extend([p for p in batch_points if p is not None])
113
+
114
+ offset = next_offset
115
+ if offset is None:
116
+ break
117
 
118
  if not points_list:
119
  logger.warning("No documents found in collection=%s for BM25 indexing", collection_name)
120
  return None
121
 
122
+ # Trích xuất content và build documents
123
+ corpus_docs = []
 
 
 
 
 
 
124
  for point in points_list:
125
  payload = point.payload if isinstance(point.payload, dict) else {}
126
  content = str(payload.get("content") or "").strip()
127
  if content:
128
+ metadata = {
129
+ "source": payload.get("path") or payload.get("object_path") or payload.get("stored_name") or "",
130
+ "source_file": payload.get("filename") or payload.get("stored_name") or "",
131
+ "source_relpath": payload.get("object_path") or payload.get("path") or "",
132
+ "object_path": payload.get("object_path") or "",
133
+ "folder_key": payload.get("folder_key") or "",
134
+ "collection_name": collection_name,
135
+ "academic_year": payload.get("academic_year") or "",
136
+ "chunk_index": payload.get("chunk_index"),
137
+ "page_number": payload.get("page_number"),
138
+ }
139
+ doc = LangChainDocument(page_content=content, metadata=metadata)
140
+ corpus_docs.append(doc)
141
 
142
+ if not corpus_docs:
143
  logger.warning("No valid content found in collection=%s for BM25 indexing", collection_name)
144
  return None
145
 
146
+ tokenized_docs = [_vi_tokenize(doc.page_content) for doc in corpus_docs]
 
147
  bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
148
 
149
+ # 3. Lưu lại Cache kèm the con số count và corpus_docs để đối chiếu lần sau
150
+ self._bm25_cache[collection_name] = {
151
+ 'bm25': bm25,
152
+ 'corpus_docs': corpus_docs,
153
+ 'count': current_count
154
+ }
155
+ logger.info("BM25 index built and cached for collection=%s (docs=%d)", collection_name, len(corpus_docs))
156
 
157
+ return bm25, corpus_docs
158
 
159
  except Exception:
160
  logger.exception("Failed to build BM25 index for collection=%s", collection_name)
161
  return None
162
 
163
  def _search_target_collections(self, query: str, collections: List[str], limit: int, alpha: float = 0.6) -> List:
164
+ """Hybrid search: BM25 + Vector + RRF"""
165
  if not collections:
166
  return []
167
 
 
171
  logger.exception("Failed to embed query for collection routing")
172
  return []
173
 
174
+ # Step 1: Vector search
175
+ all_docs_dict = {}
176
+ vector_ranked = {}
177
 
178
  vector_rank = 0
179
  for collection_name in collections:
 
213
  vector_rank += 1
214
  vector_ranked[doc_key] = vector_rank
215
 
216
+ # Step 2: BM25 search
217
+ bm25_ranked = {}
218
+ try:
219
+ tokenized_query = _vi_tokenize(query)
220
+
221
+ if not tokenized_query:
222
+ logger.warning("Query is empty after tokenization, skipping BM25 search")
223
+ else:
 
 
 
224
  for collection_name in collections:
225
+ bm25_data = self._ensure_bm25_loaded(collection_name)
226
+ if bm25_data is None:
 
227
  continue
228
+
229
+ bm25, corpus_docs = bm25_data
230
+
231
+ scores = bm25.get_scores(tokenized_query)
232
+ scored_docs = sorted(zip(corpus_docs, scores), key=lambda x: x[1], reverse=True)
233
+
234
+ bm25_rank = 0
235
+ for doc, score in scored_docs:
236
+ if score <= 0:
237
+ break
238
+
239
+ doc_key = self._doc_key(doc)
 
 
 
 
 
 
240
 
241
+ if doc_key not in all_docs_dict:
242
+ all_docs_dict[doc_key] = doc
 
 
 
 
 
 
243
 
244
+ if doc_key not in bm25_ranked:
245
+ bm25_rank += 1
246
+ bm25_ranked[doc_key] = bm25_rank
 
 
247
 
248
+ if bm25_rank >= limit:
249
+ break
250
+
251
+ except Exception:
252
+ logger.exception("BM25 search failed, falling back to vector-only")
253
 
254
+ # Step 3: RRF combination
255
  alpha = max(0.0, min(1.0, float(alpha)))
256
  bm25_weight = 1.0 - alpha
257
  vector_weight = alpha
 
261
  for doc_key, doc in all_docs_dict.items():
262
  score = 0.0
263
 
 
264
  if doc_key in vector_ranked:
265
  score += vector_weight / (rrf_c + vector_ranked[doc_key])
266
 
 
267
  if doc_key in bm25_ranked:
268
  score += bm25_weight / (rrf_c + bm25_ranked[doc_key])
269
 
270
  if score > 0:
271
  rrf_scores[doc_key] = score
272
 
 
273
  sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
274
  return [all_docs_dict[doc_key] for doc_key, _ in sorted_results[:limit]]
275
 
 
291
  alpha=alpha,
292
  )
293
 
 
294
  if not routed_docs:
295
  logger.warning("No documents found for query=%s, cohort=%s", query[:50], cohort_key)
296
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
  deduplicated = []
299
  seen = set()
300
+ for doc in routed_docs:
 
 
 
 
301
  key = self._doc_key(doc)
302
  if key in seen:
303
  continue
304
  seen.add(key)
305
  deduplicated.append(doc)
306
+ if len(deduplicated) >= k:
307
  break
308
+
309
+ return deduplicated
{core → rag}/collection_utils.py RENAMED
File without changes
{core → rag}/models.py RENAMED
@@ -1,6 +1,6 @@
1
  from langchain_huggingface import HuggingFaceEmbeddings
2
  from sentence_transformers import CrossEncoder
3
- from .config import EMBED_MODEL, CROSS_ENCODER_MODEL
4
 
5
  # Khởi tạo Embedding model - Chạy trên CPU của Hugging Face
6
  embeddings = HuggingFaceEmbeddings(
 
1
  from langchain_huggingface import HuggingFaceEmbeddings
2
  from sentence_transformers import CrossEncoder
3
+ from core.config import EMBED_MODEL, CROSS_ENCODER_MODEL
4
 
5
  # Khởi tạo Embedding model - Chạy trên CPU của Hugging Face
6
  embeddings = HuggingFaceEmbeddings(
{core → rag}/prompting.py RENAMED
@@ -1,4 +1,4 @@
1
- def create_advanced_prompt(question: str, context: str, question_type: str, topic: str = None, year_scope: str = None) -> str:
2
  # Base system - Định nghĩa tư duy cho AI
3
  base_system = """Bạn là Trợ lý AI chuyên gia về Pháp chế và Quy định Đại học. Nhiệm vụ của bạn là hỗ trợ tra cứu thông tin chính xác từ các văn bản quy phạm nội bộ (Quyết định, Thông tư, Quy định...).
4
 
@@ -97,25 +97,12 @@ Về vấn đề [Chủ đề], theo **Điều [Số]**, các trường hợp ng
97
  else:
98
  topic_instr = ""
99
 
100
- # [YEAR-AWARE CHANGE] Rang buoc cau tra loi theo nam hoc duoc hoi.
101
- if year_scope:
102
- year_instr = (
103
- f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
104
- f"- Người dùng đang hỏi trong phạm vi năm: **{year_scope}**.\n"
105
- f"- Ưu tiên các đoạn có nhãn nguồn cùng năm trong context (ví dụ: [Năm 2022-2023 | ...]).\n"
106
- f"- Nếu chưa đủ bằng chứng đúng năm, được phép dùng đoạn có nhãn 'Áp dụng nhiều năm' hoặc quy định gần nhất và phải ghi chú rõ phạm vi áp dụng.\n"
107
- f"- Không kết luận 'không có dữ liệu' chỉ vì thiếu đúng nhãn năm nếu vẫn có quy định bao quát liên quan.\n"
108
- )
109
- else:
110
- year_instr = ""
111
-
112
  # 4. Gộp Prompt
113
  full_prompt = f"""{base_system}
114
  ----------------
115
  {example}
116
  ----------------
117
  {topic_instr}
118
- {year_instr}
119
 
120
  **TÀI LIỆU THAM KHẢO (CONTEXT):**
121
  {context}
 
1
+ def create_advanced_prompt(question: str, context: str, question_type: str, topic: str = None) -> str:
2
  # Base system - Định nghĩa tư duy cho AI
3
  base_system = """Bạn là Trợ lý AI chuyên gia về Pháp chế và Quy định Đại học. Nhiệm vụ của bạn là hỗ trợ tra cứu thông tin chính xác từ các văn bản quy phạm nội bộ (Quyết định, Thông tư, Quy định...).
4
 
 
97
  else:
98
  topic_instr = ""
99
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # 4. Gộp Prompt
101
  full_prompt = f"""{base_system}
102
  ----------------
103
  {example}
104
  ----------------
105
  {topic_instr}
 
106
 
107
  **TÀI LIỆU THAM KHẢO (CONTEXT):**
108
  {context}
{core → rag}/qa_pipeline.py RENAMED
@@ -1,128 +1,24 @@
1
  from typing import List, Generator
2
  import os, re, hashlib
3
  import logging
4
- import groq
5
  import google.generativeai as genai
6
  import json
7
  from concurrent.futures import ThreadPoolExecutor
8
  from threading import Lock
9
-
10
- from .models import llm
11
- from .config import TOP_K_RESULTS, FINAL_TOP_K
12
  from .rerank import advanced_rerank
13
  from .prompting import create_advanced_prompt
14
- from .retriever import HybridRetriever
15
  from .analyze_and_expand import analyze_and_expand_query
16
- from .llm_utils import safe_invoke, safe_stream
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- # Giữ nguyên các hằng số
21
  MAX_CONTEXT_CHARS = 12000
22
  MAX_DOC_CHARS = 1800
23
  MAX_OUT_CHARS = 3000
24
- # [YEAR-AWARE CHANGE] Pattern nhan dien nam hoc trong cau hoi.
25
- ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
26
- SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
27
 
28
  # Quản lý API Keys cho Groq và Gemini với xoay tua tự động khi gặp lỗi hoặc hết hạn
29
- class AIProviderManager:
30
- def __init__(self):
31
- # Lấy danh sách keys
32
- self.groq_keys = [k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(",") if k.strip()]
33
- self.gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(",") if k.strip()]
34
- self.groq_idx = 0
35
- self.gemini_idx = 0
36
-
37
- def get_groq_client(self):
38
- if not self.groq_keys: return None
39
- return groq.Groq(api_key=self.groq_keys[self.groq_idx])
40
-
41
- def rotate_groq(self):
42
- if len(self.groq_keys) > 1:
43
- self.groq_idx = (self.groq_idx + 1) % len(self.groq_keys)
44
- logger.info(f" Đã xoay sang Groq Key thứ {self.groq_idx + 1}")
45
-
46
- def get_gemini_key(self):
47
- if not self.gemini_keys: return None
48
- return self.gemini_keys[self.gemini_idx]
49
-
50
- def rotate_gemini(self):
51
- if len(self.gemini_keys) > 1:
52
- self.gemini_idx = (self.gemini_idx + 1) % len(self.gemini_keys)
53
- logger.info(f"Đã xoay sang Gemini Key dự phòng")
54
-
55
- api_manager = AIProviderManager()
56
-
57
-
58
- def normalize_academic_year(start_year: str, end_year: str) -> str:
59
- return f"{int(start_year):04d}-{int(end_year):04d}"
60
-
61
-
62
- # [YEAR-AWARE CHANGE] Trich xuat nam yeu cau tu cau hoi.
63
- def detect_requested_year(text: str) -> tuple[str, set]:
64
- """Phat hien nam hoc duoc nhac den trong cau hoi."""
65
- requested_range = ""
66
- mentioned_years = set()
67
-
68
- for start_year, end_year in ACADEMIC_YEAR_PATTERN.findall(text or ""):
69
- requested_range = normalize_academic_year(start_year, end_year)
70
- mentioned_years.add(start_year)
71
- mentioned_years.add(end_year)
72
-
73
- for year in SINGLE_YEAR_PATTERN.findall(text or ""):
74
- mentioned_years.add(year)
75
-
76
- return requested_range, mentioned_years
77
-
78
-
79
- def infer_doc_academic_year(doc) -> str:
80
- metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
81
- existing_year = metadata.get("academic_year")
82
- if existing_year:
83
- return existing_year
84
 
85
- source_text = " ".join(
86
- str(x) for x in [
87
- metadata.get("source_relpath"),
88
- metadata.get("source"),
89
- metadata.get("source_file"),
90
- ]
91
- if x
92
- )
93
- match = ACADEMIC_YEAR_PATTERN.search(source_text)
94
- if match:
95
- year = normalize_academic_year(match.group(1), match.group(2))
96
- metadata["academic_year"] = year
97
- doc.metadata = metadata
98
- return year
99
-
100
- metadata["academic_year"] = "ALL"
101
- doc.metadata = metadata
102
- return "ALL"
103
-
104
-
105
- # [YEAR-AWARE CHANGE] Loc tai lieu theo metadata nam hoc.
106
- def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
107
- if not requested_range and not mentioned_years:
108
- return docs
109
-
110
- filtered_docs = []
111
- for doc in docs:
112
- doc_year = infer_doc_academic_year(doc)
113
- if doc_year == "ALL":
114
- filtered_docs.append(doc)
115
- continue
116
-
117
- if requested_range and doc_year == requested_range:
118
- filtered_docs.append(doc)
119
- continue
120
-
121
- doc_year_tokens = set(SINGLE_YEAR_PATTERN.findall(doc_year))
122
- if doc_year_tokens.intersection(mentioned_years):
123
- filtered_docs.append(doc)
124
-
125
- return filtered_docs
126
 
127
  def sanitize_for_prompt(text: str) -> str:
128
  """Lọc bỏ prompt injection và PII """
@@ -261,7 +157,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
261
  if cohort_key:
262
  logger.info(f"Sử dụng cohort_key: {cohort_key}")
263
 
264
- # Gửi song song các truy vấn đến Qdrant
265
  def search_query(query: str):
266
  current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
267
  return hybrid_retriever.search(
@@ -293,18 +188,21 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
293
  final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
294
 
295
  context_parts = []
296
- context_docs = [] # Lưu metadata để trích dẫn ở cuối
297
  total_chars = 0
 
298
  for doc in final_docs:
299
  page = doc.metadata.get('page_number', 'N/A')
300
  file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
301
  source = f"[{os.path.basename(file_name)} | Trang {page}]" if file_name else f"[Trang {page}]"
302
  block = f"{source}\n{doc.page_content}"
 
 
303
  if total_chars + len(block) > MAX_CONTEXT_CHARS:
304
- break
 
305
  total_chars += len(block)
306
  context_parts.append(block)
307
- # Lưu metadata cho phần tài liệu tham khảo ở cuối
308
  context_docs.append({
309
  'source': file_name or "Không rõ",
310
  'page': page
@@ -318,7 +216,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
318
  logger.info("Đang tạo câu trả lời cuối cùng ...")
319
 
320
  success = False
321
- # Ưu tiên Groq (tiết kiệm token)
322
  for _ in range(len(api_manager.groq_keys) if api_manager.groq_keys else 1):
323
  try:
324
  client = api_manager.get_groq_client()
@@ -336,13 +233,12 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
336
  success = True
337
  break
338
  except Exception as e:
339
- if "429" in str(e): # Rate Limit
340
  api_manager.rotate_groq()
341
  continue
342
  logger.error(f"Lỗi Groq: {e}")
343
  break
344
 
345
- # Fallback sang Gemini nếu Groq lỗi
346
  if not success:
347
  logger.warning("Chuyển sang Gemini ...")
348
  for _ in range(len(api_manager.gemini_keys) if api_manager.gemini_keys else 1):
@@ -363,7 +259,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
363
  yield "Đã xảy ra lỗi hệ thống hoặc quá tải. Vui lòng thử lại sau giây lát!"
364
  return
365
 
366
- # Thêm phần Tài liệu tham khảo ở cuối
367
  if context_docs:
368
  yield "\n\n---\n\n"
369
  yield "## 📚 Tài liệu tham khảo\n\n"
 
1
  from typing import List, Generator
2
  import os, re, hashlib
3
  import logging
 
4
  import google.generativeai as genai
5
  import json
6
  from concurrent.futures import ThreadPoolExecutor
7
  from threading import Lock
8
+ from core.ai_provider import api_manager
9
+ from core.config import TOP_K_RESULTS, FINAL_TOP_K
 
10
  from .rerank import advanced_rerank
11
  from .prompting import create_advanced_prompt
 
12
  from .analyze_and_expand import analyze_and_expand_query
 
13
 
14
  logger = logging.getLogger(__name__)
15
 
 
16
  MAX_CONTEXT_CHARS = 12000
17
  MAX_DOC_CHARS = 1800
18
  MAX_OUT_CHARS = 3000
 
 
 
19
 
20
  # Quản lý API Keys cho Groq và Gemini với xoay tua tự động khi gặp lỗi hoặc hết hạn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def sanitize_for_prompt(text: str) -> str:
24
  """Lọc bỏ prompt injection và PII """
 
157
  if cohort_key:
158
  logger.info(f"Sử dụng cohort_key: {cohort_key}")
159
 
 
160
  def search_query(query: str):
161
  current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
162
  return hybrid_retriever.search(
 
188
  final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
189
 
190
  context_parts = []
191
+ context_docs = []
192
  total_chars = 0
193
+
194
  for doc in final_docs:
195
  page = doc.metadata.get('page_number', 'N/A')
196
  file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
197
  source = f"[{os.path.basename(file_name)} | Trang {page}]" if file_name else f"[Trang {page}]"
198
  block = f"{source}\n{doc.page_content}"
199
+
200
+ # Dùng continue để nhét tối đa các chunk ngắn thay vì break làm đứt gánh
201
  if total_chars + len(block) > MAX_CONTEXT_CHARS:
202
+ continue
203
+
204
  total_chars += len(block)
205
  context_parts.append(block)
 
206
  context_docs.append({
207
  'source': file_name or "Không rõ",
208
  'page': page
 
216
  logger.info("Đang tạo câu trả lời cuối cùng ...")
217
 
218
  success = False
 
219
  for _ in range(len(api_manager.groq_keys) if api_manager.groq_keys else 1):
220
  try:
221
  client = api_manager.get_groq_client()
 
233
  success = True
234
  break
235
  except Exception as e:
236
+ if "429" in str(e):
237
  api_manager.rotate_groq()
238
  continue
239
  logger.error(f"Lỗi Groq: {e}")
240
  break
241
 
 
242
  if not success:
243
  logger.warning("Chuyển sang Gemini ...")
244
  for _ in range(len(api_manager.gemini_keys) if api_manager.gemini_keys else 1):
 
259
  yield "Đã xảy ra lỗi hệ thống hoặc quá tải. Vui lòng thử lại sau giây lát!"
260
  return
261
 
 
262
  if context_docs:
263
  yield "\n\n---\n\n"
264
  yield "## 📚 Tài liệu tham khảo\n\n"
{core → rag}/rerank.py RENAMED
File without changes
{core → rag}/vectorstore.py RENAMED
@@ -1,7 +1,6 @@
1
  import asyncio
2
  import logging
3
  import os
4
- import re
5
  from typing import Any, Dict, List
6
 
7
  import pdfplumber
@@ -13,43 +12,60 @@ from docx.table import Table, _Cell
13
  from docx.text.paragraph import Paragraph
14
  from langchain_core.documents import Document as LangChainDocument
15
 
16
- from .text_utils import clean_text
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- ACADEMIC_YEAR_PATTERN = re.compile(r"(20\d{2})\s*[-_]\s*(20\d{2})")
21
-
22
-
23
- def normalize_academic_year(start_year: str, end_year: str) -> str:
24
- return f"{int(start_year):04d}-{int(end_year):04d}"
25
-
26
-
27
- def extract_academic_year(text: str) -> str:
28
- if not text:
29
- return ""
30
- match = ACADEMIC_YEAR_PATTERN.search(text)
31
- if not match:
32
- return ""
33
- return normalize_academic_year(match.group(1), match.group(2))
34
-
35
-
36
- def table_to_markdown(data: List[List[str]]) -> str:
37
  if not data or len(data) < 2:
38
  return ""
39
 
40
- header = [str(cell).replace("\n", " ").strip() if cell else "" for cell in data[0]]
41
- separator = ["---"] * len(header)
42
- markdown_lines = [
43
- "| " + " | ".join(header) + " |",
44
- "| " + " | ".join(separator) + " |",
45
- ]
46
-
47
- for row in data[1:]:
48
- clean_row = [str(cell).replace("\n", "<br>").strip() if cell else "" for cell in row]
49
- markdown_lines.append("| " + " | ".join(clean_row) + " |")
50
-
51
- return "\n".join(markdown_lines) + "\n\n"
52
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
55
  docs: List[LangChainDocument] = []
@@ -61,9 +77,10 @@ def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
61
  table_texts: List[str] = []
62
  if tables:
63
  for table in tables:
64
- md_table = table_to_markdown(table)
65
- if md_table:
66
- table_texts.append(md_table)
 
67
 
68
  full_content = text + "\n\n[BANG DU LIEU TRICH XUAT]:\n" + "\n".join(table_texts)
69
  if full_content.strip():
@@ -74,11 +91,10 @@ def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
74
  )
75
  )
76
  except Exception as error:
77
- logger.error("Loi doc PDF (pdfplumber) %s: %s", os.path.basename(filepath), error)
78
 
79
  return docs
80
 
81
-
82
  def iter_block_items(parent):
83
  if isinstance(parent, _Document):
84
  parent_elm = parent.element.body
@@ -92,7 +108,6 @@ def iter_block_items(parent):
92
  elif isinstance(child, CT_Tbl):
93
  yield Table(child, parent)
94
 
95
-
96
  def read_docx_with_tables(filepath: str) -> str:
97
  doc = Document(filepath)
98
  full_text: List[str] = []
@@ -108,13 +123,13 @@ def read_docx_with_tables(filepath: str) -> str:
108
  row_data.append(clean_text(cell.text))
109
  table_data.append(row_data)
110
 
111
- md_table = table_to_markdown(table_data)
112
- if md_table:
113
- full_text.append(f"\n{md_table}\n")
 
114
 
115
  return "\n".join(full_text)
116
 
117
-
118
  def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocument]:
119
  docs: List[LangChainDocument] = []
120
  lower_name = filename.lower()
@@ -140,12 +155,10 @@ def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocu
140
  logger.error("Loi doc %s: %s", filename, str(error)[:120])
141
  return []
142
 
143
-
144
  async def build_vectorstore_improved(
145
  sync_coordinator: Any,
146
  startup_wait_seconds: int = 5,
147
  ) -> Dict[str, Any]:
148
- """Supabase build step: trigger one initial sync and optionally wait for completion."""
149
  if sync_coordinator is None:
150
  raise ValueError("sync_coordinator is required")
151
 
@@ -180,12 +193,9 @@ async def build_vectorstore_improved(
180
  "timed_out": True,
181
  }
182
 
183
-
184
  def load_vectorstore_improved(sync_coordinator: Any) -> Dict[str, Any]:
185
- """Supabase load step: return current coordinator health snapshot."""
186
  if sync_coordinator is None:
187
  return {}
188
-
189
  try:
190
  state = sync_coordinator.get_health_snapshot()
191
  return state if isinstance(state, dict) else {}
 
1
  import asyncio
2
  import logging
3
  import os
 
4
  from typing import Any, Dict, List
5
 
6
  import pdfplumber
 
12
  from docx.text.paragraph import Paragraph
13
  from langchain_core.documents import Document as LangChainDocument
14
 
15
+ from utils.text_utils import clean_text
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
+ def table_to_unrolled_text(data: List[List[str]], is_docx: bool = False) -> str:
20
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  if not data or len(data) < 2:
22
  return ""
23
 
24
+ # Làm sạch dữ liệu ban đầu chuyển None thành chuỗi rỗng
25
+ cleaned_data = []
26
+ for row in data:
27
+ cleaned_row = [str(cell).strip() if cell else "" for cell in row]
28
+ cleaned_data.append(cleaned_row)
29
+
30
+ num_cols = len(cleaned_data[0])
31
+ header_row = cleaned_data[0]
32
+
33
+ # CHỈ CHẠY FORWARD FILL NẾU KHÔNG PHẢI FILE WORD
34
+ if not is_docx:
35
+ # 2. Kỹ thuật Forward-Fill cho khu vực Header (Xử gộp cột - Colspan)
36
+ # Giả định hàng đầu tiên chắc chắn là Header
37
+ for i in range(1, num_cols):
38
+ if not header_row[i] and header_row[i-1]:
39
+ header_row[i] = header_row[i-1] # Kéo giá trị từ trái sang phải
40
+
41
+ # 3. Kỹ thuật Forward-Fill cho khu vực Dữ liệu (Xử lý gộp hàng - Rowspan)
42
+ for r in range(1, len(cleaned_data)):
43
+ for c in range(num_cols):
44
+ # Nếu ô hiện tại rỗng, kéo giá trị từ ô ngay bên trên xuống
45
+ if not cleaned_data[r][c] and cleaned_data[r-1][c]:
46
+ cleaned_data[r][c] = cleaned_data[r-1][c]
47
+
48
+ # 4. Trải phẳng bảng (Unrolling)
49
+ headers = cleaned_data[0]
50
+ unrolled_rows = []
51
+
52
+ for r in range(1, len(cleaned_data)):
53
+ row_values = cleaned_data[r]
54
+ row_text_parts = []
55
+
56
+ # Chỉ ghép những ô có dữ liệu thực sự (khác Header)
57
+ for c in range(min(len(headers), len(row_values))):
58
+ header_val = headers[c]
59
+ cell_val = row_values[c]
60
+
61
+ # Tránh lặp lại nếu dữ liệu vô tình giống hệt Header
62
+ if cell_val and cell_val != header_val:
63
+ row_text_parts.append(f"{header_val}: {cell_val}")
64
+
65
+ if row_text_parts:
66
+ unrolled_rows.append("- " + " | ".join(row_text_parts))
67
+
68
+ return "\n" + "\n".join(unrolled_rows) + "\n\n"
69
 
70
  def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
71
  docs: List[LangChainDocument] = []
 
77
  table_texts: List[str] = []
78
  if tables:
79
  for table in tables:
80
+ # Vẫn chạy Forward-Fill bình thường cho PDF
81
+ unrolled_table = table_to_unrolled_text(table, is_docx=False)
82
+ if unrolled_table:
83
+ table_texts.append(unrolled_table)
84
 
85
  full_content = text + "\n\n[BANG DU LIEU TRICH XUAT]:\n" + "\n".join(table_texts)
86
  if full_content.strip():
 
91
  )
92
  )
93
  except Exception as error:
94
+ logger.error("Lỗi đọc PDF %s: %s", os.path.basename(filepath), error)
95
 
96
  return docs
97
 
 
98
  def iter_block_items(parent):
99
  if isinstance(parent, _Document):
100
  parent_elm = parent.element.body
 
108
  elif isinstance(child, CT_Tbl):
109
  yield Table(child, parent)
110
 
 
111
  def read_docx_with_tables(filepath: str) -> str:
112
  doc = Document(filepath)
113
  full_text: List[str] = []
 
123
  row_data.append(clean_text(cell.text))
124
  table_data.append(row_data)
125
 
126
+ # CẮT FORWARD-FILL TẠI ĐÂY BẰNG is_docx=True
127
+ unrolled_table = table_to_unrolled_text(table_data, is_docx=True)
128
+ if unrolled_table:
129
+ full_text.append(f"\n{unrolled_table}\n")
130
 
131
  return "\n".join(full_text)
132
 
 
133
  def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocument]:
134
  docs: List[LangChainDocument] = []
135
  lower_name = filename.lower()
 
155
  logger.error("Loi doc %s: %s", filename, str(error)[:120])
156
  return []
157
 
 
158
  async def build_vectorstore_improved(
159
  sync_coordinator: Any,
160
  startup_wait_seconds: int = 5,
161
  ) -> Dict[str, Any]:
 
162
  if sync_coordinator is None:
163
  raise ValueError("sync_coordinator is required")
164
 
 
193
  "timed_out": True,
194
  }
195
 
 
196
  def load_vectorstore_improved(sync_coordinator: Any) -> Dict[str, Any]:
 
197
  if sync_coordinator is None:
198
  return {}
 
199
  try:
200
  state = sync_coordinator.get_health_snapshot()
201
  return state if isinstance(state, dict) else {}
requirements.txt CHANGED
@@ -30,4 +30,5 @@ langchain-huggingface>=0.0.3,<0.1.0
30
  #File Loaders
31
  python-docx
32
  pdfplumber
33
- pypdf
 
 
30
  #File Loaders
31
  python-docx
32
  pdfplumber
33
+ pypdf
34
+ pyvi
{core → services}/document_ingest_service.py RENAMED
@@ -3,7 +3,6 @@ import os
3
  import uuid
4
  from datetime import datetime, timezone
5
  from typing import List, Optional
6
-
7
  from langchain_core.documents import Document as LangChainDocument
8
  from qdrant_client import QdrantClient
9
  from qdrant_client.http.exceptions import UnexpectedResponse
@@ -17,12 +16,12 @@ from qdrant_client.models import (
17
  VectorParams,
18
  )
19
 
20
- from .chunking import smart_chunking
21
- from .config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL
22
- from .document_db import Document, DocumentChunk, SessionLocal
23
- from .models import embeddings
24
- from .text_utils import clean_text
25
- from .vectorstore import extract_academic_year, load_documents_from_file
26
 
27
  logger = logging.getLogger(__name__)
28
 
@@ -74,11 +73,9 @@ def chunk_documents_for_ingest(
74
  if not cleaned_docs:
75
  return []
76
 
77
- academic_year = extract_academic_year(source_relpath) or "ALL"
78
  for doc in cleaned_docs:
79
  metadata = doc.metadata.copy() if isinstance(doc.metadata, dict) else {}
80
  metadata["source_relpath"] = source_relpath
81
- metadata["academic_year"] = academic_year
82
  doc.metadata = metadata
83
 
84
  return [doc for doc in smart_chunking(cleaned_docs) if (doc.page_content or "").strip()]
@@ -252,7 +249,6 @@ def process_document_ingest(
252
  "collection_name": target_collection,
253
  "source_file": metadata.get("source_file") or source_name,
254
  "source_relpath": metadata.get("source_relpath") or source_relpath,
255
- "academic_year": metadata.get("academic_year") or "ALL",
256
  "page_number": metadata.get("page_number"),
257
  "source_updated_at": source_updated_at,
258
  "source_etag": source_etag,
@@ -365,4 +361,4 @@ def delete_vectors_for_object_path(collection_name: str, object_path: str) -> bo
365
  wait=True,
366
  )
367
 
368
- return True
 
3
  import uuid
4
  from datetime import datetime, timezone
5
  from typing import List, Optional
 
6
  from langchain_core.documents import Document as LangChainDocument
7
  from qdrant_client import QdrantClient
8
  from qdrant_client.http.exceptions import UnexpectedResponse
 
16
  VectorParams,
17
  )
18
 
19
+ from rag.chunking import smart_chunking
20
+ from core.config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL
21
+ from database.document_db import Document, DocumentChunk, SessionLocal
22
+ from rag.models import embeddings
23
+ from utils.text_utils import clean_text
24
+ from rag.vectorstore import load_documents_from_file
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
73
  if not cleaned_docs:
74
  return []
75
 
 
76
  for doc in cleaned_docs:
77
  metadata = doc.metadata.copy() if isinstance(doc.metadata, dict) else {}
78
  metadata["source_relpath"] = source_relpath
 
79
  doc.metadata = metadata
80
 
81
  return [doc for doc in smart_chunking(cleaned_docs) if (doc.page_content or "").strip()]
 
249
  "collection_name": target_collection,
250
  "source_file": metadata.get("source_file") or source_name,
251
  "source_relpath": metadata.get("source_relpath") or source_relpath,
 
252
  "page_number": metadata.get("page_number"),
253
  "source_updated_at": source_updated_at,
254
  "source_etag": source_etag,
 
361
  wait=True,
362
  )
363
 
364
+ return True
{core → services}/supabase_sync_service.py RENAMED
@@ -2,15 +2,14 @@ import asyncio
2
  import json
3
  import logging
4
  import os
5
- import re
6
  import tempfile
7
  import time
8
  from datetime import datetime, timezone
9
  from typing import Any, Dict, List, Optional
10
  from urllib import error, parse, request
11
 
12
- from .collection_utils import build_collection_name
13
- from .document_db import (
14
  Document,
15
  DocumentChunk,
16
  SessionLocal,
@@ -20,7 +19,7 @@ from .document_db import (
20
  mark_document_sync_error_resolved,
21
  utcnow,
22
  )
23
- from .document_ingest_service import delete_vectors_for_object_path, process_document_ingest
24
 
25
  logger = logging.getLogger(__name__)
26
 
 
2
  import json
3
  import logging
4
  import os
 
5
  import tempfile
6
  import time
7
  from datetime import datetime, timezone
8
  from typing import Any, Dict, List, Optional
9
  from urllib import error, parse, request
10
 
11
+ from rag.collection_utils import build_collection_name
12
+ from database.document_db import (
13
  Document,
14
  DocumentChunk,
15
  SessionLocal,
 
19
  mark_document_sync_error_resolved,
20
  utcnow,
21
  )
22
+ from services.document_ingest_service import delete_vectors_for_object_path, process_document_ingest
23
 
24
  logger = logging.getLogger(__name__)
25
 
{core → utils}/text_utils.py RENAMED
@@ -7,8 +7,10 @@ def clean_text(text: str) -> str:
7
  # Nối các từ bị gãy ngang do xuống dòng
8
  text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
9
 
10
- # \| < > vào để bảo vệ khung Bảng Markdown các Placeholder
11
- text = re.sub(r'[^\w\s\.,;:!?\-$$\"\'\À-\n\|<>]', ' ', text)
 
 
12
 
13
  # Chuẩn hóa khoảng trắng
14
  text = re.sub(r'[ \t]+', ' ', text)
 
7
  # Nối các từ bị gãy ngang do xuống dòng
8
  text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
9
 
10
+ # Loại bỏ các tự điều khiển không mong muốn, nhưng giữ lại các dấu câu thông thường
11
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
12
+ # Xóa các ký tự không nhìn thấy và các ký tự đặc biệt như zero-width space và BOM
13
+ text = text.replace('\u200b', '').replace('\ufeff', '')
14
 
15
  # Chuẩn hóa khoảng trắng
16
  text = re.sub(r'[ \t]+', ' ', text)