Spaces:
Sleeping
Sleeping
unroll table and project restructuring
Browse files- api/chat_api_routers.py +0 -0
- core/ai_provider.py +38 -0
- core/chunking.py +0 -63
- core/llm_utils.py +0 -31
- core/retriever.py +0 -73
- data/Sổ tay sinh viên 2023-2024/0. Mục lục Sổ tay sinh viên K65.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/1. QĐ-1226-Quy che dao tao dai hoc-DHTL (ban hanh).docx +0 -3
- data/Sổ tay sinh viên 2023-2024/10. QĐ 1089 thi OLP môn học (Final 10-5-2023).docx +0 -3
- data/Sổ tay sinh viên 2023-2024/11. QĐ về Học phí final (25-10-2021).docx +0 -3
- data/Sổ tay sinh viên 2023-2024/12. QD ngoại trú.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/2. QĐ về tiếng anh CTTT.300921.QD.1315.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/3. QD1767.TA tăng cường ban hanh.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/4. QD411_QD_DHTL-Chuan_Dau_Ra_CNTT.pdf +0 -3
- data/Sổ tay sinh viên 2023-2024/4.1. QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf +0 -3
- data/Sổ tay sinh viên 2023-2024/5. QD_1038_16.07.2021_GDTC.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/6. Quy định về tổ chức thi trực tuyến.docx +0 -3
- data/Sổ tay sinh viên 2023-2024/7. QĐ đánh giá KQRL (Final 18-8-2016).docx +0 -3
- data/Sổ tay sinh viên 2023-2024/8. QĐ ve HBKKHT, HBCS (final 12-5-2021).docx +0 -3
- data/Sổ tay sinh viên 2023-2024/9. QĐ Khen thưởng - KL (Final 10-8-2016).docx +0 -3
- {core → database}/document_db.py +1 -1
- main.py +6 -6
- {core → rag}/analyze_and_expand.py +1 -2
- rag/chunking.py +38 -0
- {core → rag}/collection_router_retriever.py +120 -131
- {core → rag}/collection_utils.py +0 -0
- {core → rag}/models.py +1 -1
- {core → rag}/prompting.py +1 -14
- {core → rag}/qa_pipeline.py +9 -114
- {core → rag}/rerank.py +0 -0
- {core → rag}/vectorstore.py +57 -47
- requirements.txt +2 -1
- {core → services}/document_ingest_service.py +7 -11
- {core → services}/supabase_sync_service.py +3 -4
- {core → utils}/text_utils.py +4 -2
api/chat_api_routers.py
DELETED
|
File without changes
|
core/ai_provider.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import threading
|
| 4 |
+
import groq
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
class AIProviderManager:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
# Lấy danh sách keys
|
| 11 |
+
self.groq_keys = [k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(",") if k.strip()]
|
| 12 |
+
self.gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(",") if k.strip()]
|
| 13 |
+
self.groq_idx = 0
|
| 14 |
+
self.gemini_idx = 0
|
| 15 |
+
self._lock = threading.Lock() # Đảm bảo Thread-Safe khi có nhiều Request cùng lúc
|
| 16 |
+
|
| 17 |
+
def get_groq_client(self):
|
| 18 |
+
if not self.groq_keys: return None
|
| 19 |
+
# Chỉ lấy key, không thay đổi state nên không cần lock
|
| 20 |
+
return groq.Groq(api_key=self.groq_keys[self.groq_idx])
|
| 21 |
+
|
| 22 |
+
def rotate_groq(self):
|
| 23 |
+
with self._lock: # Khóa luồng khi xoay tua để tránh xung đột
|
| 24 |
+
if len(self.groq_keys) > 1:
|
| 25 |
+
self.groq_idx = (self.groq_idx + 1) % len(self.groq_keys)
|
| 26 |
+
logger.info(f"Đã xoay sang Groq Key thứ {self.groq_idx + 1}")
|
| 27 |
+
|
| 28 |
+
def get_gemini_key(self):
|
| 29 |
+
if not self.gemini_keys: return None
|
| 30 |
+
return self.gemini_keys[self.gemini_idx]
|
| 31 |
+
|
| 32 |
+
def rotate_gemini(self):
|
| 33 |
+
with self._lock:
|
| 34 |
+
if len(self.gemini_keys) > 1:
|
| 35 |
+
self.gemini_idx = (self.gemini_idx + 1) % len(self.gemini_keys)
|
| 36 |
+
logger.info("Đã xoay sang Gemini Key dự phòng")
|
| 37 |
+
|
| 38 |
+
api_manager = AIProviderManager()
|
core/chunking.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
from typing import List
|
| 3 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
-
from .config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 5 |
-
|
| 6 |
-
def extract_and_protect_tables(text: str) -> tuple[str, dict]:
|
| 7 |
-
"""Tìm và bọc các bảng Markdown để bảo vệ chúng khỏi việc bị cắt gãy."""
|
| 8 |
-
# Pattern tìm bảng Markdown (các dòng bắt đầu và chứa ký tự | liên tiếp)
|
| 9 |
-
table_pattern = re.compile(r'(?:\|.*\|[\r\n]+)+')
|
| 10 |
-
tables = {}
|
| 11 |
-
|
| 12 |
-
def replace_table(match):
|
| 13 |
-
table_id = f"<TABLE_{len(tables)}>"
|
| 14 |
-
tables[table_id] = match.group(0)
|
| 15 |
-
return f"\n{table_id}\n"
|
| 16 |
-
|
| 17 |
-
protected_text = re.sub(table_pattern, replace_table, text)
|
| 18 |
-
return protected_text, tables
|
| 19 |
-
|
| 20 |
-
def smart_chunking(docs: List) -> List:
|
| 21 |
-
print("Đang áp dụng Smart Chunking (Bảo toàn Bảng & Danh sách)...")
|
| 22 |
-
legal_splitter = RecursiveCharacterTextSplitter(
|
| 23 |
-
chunk_size=CHUNK_SIZE,
|
| 24 |
-
chunk_overlap=CHUNK_OVERLAP,
|
| 25 |
-
separators=[
|
| 26 |
-
"\nĐiều ", "\nChương ", "\nMục ", "\nKhoản ",
|
| 27 |
-
"\n\n", "\n", ". ", " ", ""
|
| 28 |
-
],
|
| 29 |
-
length_function=len,
|
| 30 |
-
is_separator_regex=False
|
| 31 |
-
)
|
| 32 |
-
|
| 33 |
-
chunks = []
|
| 34 |
-
for doc in docs:
|
| 35 |
-
# 1. Bảo vệ List đang có
|
| 36 |
-
protected_text = doc.page_content.replace('\na.', '<LIST_a>') \
|
| 37 |
-
.replace('\nb.', '<LIST_b>') \
|
| 38 |
-
.replace('\nc.', '<LIST_c>')
|
| 39 |
-
|
| 40 |
-
# 2. Bảo vệ Table
|
| 41 |
-
protected_text, tables = extract_and_protect_tables(protected_text)
|
| 42 |
-
|
| 43 |
-
# 3. Tiến hành cắt
|
| 44 |
-
doc_chunks = legal_splitter.split_text(protected_text)
|
| 45 |
-
|
| 46 |
-
# 4. Phục hồi dữ liệu
|
| 47 |
-
for chunk_text in doc_chunks:
|
| 48 |
-
restored = chunk_text.replace('<LIST_a>', '\na.') \
|
| 49 |
-
.replace('<LIST_b>', '\nb.') \
|
| 50 |
-
.replace('<LIST_c>', '\nc.')
|
| 51 |
-
|
| 52 |
-
for table_id, table_content in tables.items():
|
| 53 |
-
if table_id in restored:
|
| 54 |
-
restored = restored.replace(table_id, table_content)
|
| 55 |
-
|
| 56 |
-
new_doc = type(doc)(
|
| 57 |
-
page_content=restored,
|
| 58 |
-
metadata=doc.metadata.copy()
|
| 59 |
-
)
|
| 60 |
-
chunks.append(new_doc)
|
| 61 |
-
|
| 62 |
-
print(f" Đã tạo {len(chunks)} chunks thông minh (giữ nguyên cấu trúc bảng)")
|
| 63 |
-
return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/llm_utils.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
import logging
|
| 3 |
-
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
| 4 |
-
|
| 5 |
-
logger = logging.getLogger(__name__)
|
| 6 |
-
|
| 7 |
-
def safe_stream(llm, prompt) :
|
| 8 |
-
try:
|
| 9 |
-
for chunk in llm.stream(prompt):
|
| 10 |
-
text = getattr(chunk, "content", str(chunk))
|
| 11 |
-
if text:
|
| 12 |
-
yield text
|
| 13 |
-
except Exception :
|
| 14 |
-
logger.exception("Lỗi khi stream LLM:")
|
| 15 |
-
yield "Lỗi khi stream LLM "
|
| 16 |
-
def safe_invoke(llm ,prompt : str, timeout : int =30, retries: int =2):
|
| 17 |
-
last_error = None
|
| 18 |
-
for attempt in range(1, retries+1):
|
| 19 |
-
try:
|
| 20 |
-
with ThreadPoolExecutor(max_workers=1) as pool:
|
| 21 |
-
fut = pool.submit(llm.invoke, prompt)
|
| 22 |
-
return fut.result(timeout=timeout)
|
| 23 |
-
except FuturesTimeoutError as e:
|
| 24 |
-
last_error = e
|
| 25 |
-
logger.warning(f" Lần {attempt}: LLM invoke timeout sau {timeout} giây. Đang thử lại...")
|
| 26 |
-
except Exception as e:
|
| 27 |
-
last_error = e
|
| 28 |
-
logger.error(f"Lần {attempt}: Lỗi khi gọi LLM: {e}. Đang thử lại...")
|
| 29 |
-
time.sleep(0.6*attempt)
|
| 30 |
-
|
| 31 |
-
raise RuntimeError (f"LLM failed after {retries} attempts: {last_error}") # Thêm delay nhỏ trước khi thử lại
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/retriever.py
DELETED
|
@@ -1,73 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
import hashlib
|
| 3 |
-
from rank_bm25 import BM25Okapi
|
| 4 |
-
|
| 5 |
-
class HybridRetriever:
|
| 6 |
-
"""Kết hợp BM25 và Vector Search."""
|
| 7 |
-
def __init__(self, vectorstore, documents):
|
| 8 |
-
self.vectorstore = vectorstore
|
| 9 |
-
self.documents = documents
|
| 10 |
-
print(" Đang khởi tạo BM25...")
|
| 11 |
-
tokenized_docs = [doc.page_content.lower().split() for doc in documents]
|
| 12 |
-
self.bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
|
| 13 |
-
self.rrf_c = 60
|
| 14 |
-
print(" BM25 sẵn sàng!")
|
| 15 |
-
|
| 16 |
-
@staticmethod
|
| 17 |
-
def _doc_key(doc) -> str:
|
| 18 |
-
metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
|
| 19 |
-
source = str(metadata.get("source_relpath") or metadata.get("source_file") or metadata.get("source") or "")
|
| 20 |
-
page = str(metadata.get("page_number") or metadata.get("page") or "")
|
| 21 |
-
content = (doc.page_content or "").strip()
|
| 22 |
-
digest = hashlib.sha1(content.encode("utf-8")).hexdigest() if content else "empty"
|
| 23 |
-
return f"{source}|{page}|{digest}"
|
| 24 |
-
|
| 25 |
-
def search(self, query: str, k: int = 10, alpha: float = 0.6, year_scope: str | None = None) -> List:
|
| 26 |
-
del year_scope
|
| 27 |
-
if not self.documents or k <= 0:
|
| 28 |
-
return []
|
| 29 |
-
|
| 30 |
-
alpha = max(0.0, min(1.0, float(alpha)))
|
| 31 |
-
bm25_weight = 1.0 - alpha
|
| 32 |
-
vector_weight = alpha
|
| 33 |
-
|
| 34 |
-
# Lấy top k từ BM25
|
| 35 |
-
tokenized_query = query.lower().split()
|
| 36 |
-
candidate_k = min(max(k * 4, k), len(self.documents))
|
| 37 |
-
bm25_top_docs = self.bm25.get_top_n(tokenized_query, self.documents, n=candidate_k)
|
| 38 |
-
|
| 39 |
-
bm25_ranked = {}
|
| 40 |
-
all_retrieved = {}
|
| 41 |
-
for rank, doc in enumerate(bm25_top_docs, 1):
|
| 42 |
-
key = self._doc_key(doc)
|
| 43 |
-
bm25_ranked[key] = rank
|
| 44 |
-
all_retrieved[key] = doc
|
| 45 |
-
|
| 46 |
-
# Lấy top k từ Vector
|
| 47 |
-
try:
|
| 48 |
-
vector_results = self.vectorstore.similarity_search(query, k=candidate_k)
|
| 49 |
-
except Exception as e:
|
| 50 |
-
print(f"Lỗi Vector Search: {e}")
|
| 51 |
-
return [doc for doc in bm25_top_docs[:k]]
|
| 52 |
-
|
| 53 |
-
vector_ranked = {}
|
| 54 |
-
for rank, doc in enumerate(vector_results, 1):
|
| 55 |
-
key = self._doc_key(doc)
|
| 56 |
-
vector_ranked[key] = rank
|
| 57 |
-
all_retrieved[key] = doc
|
| 58 |
-
|
| 59 |
-
rrf_results = []
|
| 60 |
-
|
| 61 |
-
for content, doc in all_retrieved.items():
|
| 62 |
-
score = 0.0
|
| 63 |
-
if content in bm25_ranked:
|
| 64 |
-
score += bm25_weight / (self.rrf_c + bm25_ranked[content])
|
| 65 |
-
if content in vector_ranked:
|
| 66 |
-
score += vector_weight / (self.rrf_c + vector_ranked[content])
|
| 67 |
-
|
| 68 |
-
if score > 0:
|
| 69 |
-
rrf_results.append((score, doc))
|
| 70 |
-
|
| 71 |
-
rrf_results.sort(key=lambda x: x[0], reverse=True)
|
| 72 |
-
return [doc for score, doc in rrf_results[:k]]
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/0. Mục lục Sổ tay sinh viên K65.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:725638f2ebed983fb38354fa9d2073937a4df89418b4c4c6958c68f29c868113
|
| 3 |
-
size 14530
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/1. QĐ-1226-Quy che dao tao dai hoc-DHTL (ban hanh).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:aaf33dda956f69966bbe64864432618068eaded7d3016644c4acfe8889f4dd5a
|
| 3 |
-
size 120225
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/10. QĐ 1089 thi OLP môn học (Final 10-5-2023).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8616d772baf6ff2225f2163e12a88418f2a6b46c1770fcd7d306da952ec3a94e
|
| 3 |
-
size 52007
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/11. QĐ về Học phí final (25-10-2021).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:17a91329f9e31308df57a53e36a69665fe37f3aea37fc32bc69a7fe984fbf3c5
|
| 3 |
-
size 56350
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/12. QD ngoại trú.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:06e7018cfa51e785f90009a66a2dc442a17f55ae52f5f28ddce208e61a6ba7d6
|
| 3 |
-
size 26065
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/2. QĐ về tiếng anh CTTT.300921.QD.1315.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e7b27a9545255d7d69158cc8e8d42d560f00c6865f665317619822159af7e90c
|
| 3 |
-
size 40894
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/3. QD1767.TA tăng cường ban hanh.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1dc5e1e6aef63069f673606c5083fb9dab484fa1b9cb10f955121bc4042c7131
|
| 3 |
-
size 37167
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/4. QD411_QD_DHTL-Chuan_Dau_Ra_CNTT.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:544fe7b5b71f4fb407175fe0bcdc8853e0943eb410a4947b2ac8d891f359619b
|
| 3 |
-
size 369669
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/4.1. QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:17ebf4761e56b3d3336cfeaeab86bee4de7614fd2c27c20d848a0fed7650abfa
|
| 3 |
-
size 498348
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/5. QD_1038_16.07.2021_GDTC.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ee1a9d836579a08c6cbde24f63f9c8eb64aa70f52cfeec4c5e1c6ed789993da3
|
| 3 |
-
size 47518
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/6. Quy định về tổ chức thi trực tuyến.docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:80c185d0fdb75571f7f9fafd974e4ac4a4006a890ca3ce975ca7ca70181848a3
|
| 3 |
-
size 65814
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/7. QĐ đánh giá KQRL (Final 18-8-2016).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4905773937cd8d12a8d24224146dc1442418ed3a57b60e3348a0e1b8eeb5ea2b
|
| 3 |
-
size 74575
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/8. QĐ ve HBKKHT, HBCS (final 12-5-2021).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:da1cd4f217a697635a5dae85285f46aa9a19f217f3fd03c981a9f47d79e31f2d
|
| 3 |
-
size 30512
|
|
|
|
|
|
|
|
|
|
|
|
data/Sổ tay sinh viên 2023-2024/9. QĐ Khen thưởng - KL (Final 10-8-2016).docx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:46e6bdba5671d6aabfda753efa4010b8fb828d2c9180ab58882d7361c0200c67
|
| 3 |
-
size 76052
|
|
|
|
|
|
|
|
|
|
|
|
{core → database}/document_db.py
RENAMED
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
|
|
| 7 |
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text, create_engine, func, inspect, or_, text
|
| 8 |
from sqlalchemy.orm import Session, declarative_base, relationship, sessionmaker
|
| 9 |
|
| 10 |
-
from .config import DOCUMENTS_DATABASE_URL
|
| 11 |
|
| 12 |
Base = declarative_base()
|
| 13 |
logger = logging.getLogger(__name__)
|
|
|
|
| 7 |
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text, create_engine, func, inspect, or_, text
|
| 8 |
from sqlalchemy.orm import Session, declarative_base, relationship, sessionmaker
|
| 9 |
|
| 10 |
+
from core.config import DOCUMENTS_DATABASE_URL
|
| 11 |
|
| 12 |
Base = declarative_base()
|
| 13 |
logger = logging.getLogger(__name__)
|
main.py
CHANGED
|
@@ -26,12 +26,12 @@ from core.config import (
|
|
| 26 |
SUPABASE_SYNC_SNAPSHOT_FILE,
|
| 27 |
SUPABASE_URL,
|
| 28 |
)
|
| 29 |
-
from
|
| 30 |
-
from
|
| 31 |
-
from
|
| 32 |
-
from
|
| 33 |
-
from
|
| 34 |
-
from
|
| 35 |
from api.admin_sync_router import router as admin_sync_router
|
| 36 |
|
| 37 |
# Hàm log lỗi an toàn
|
|
|
|
| 26 |
SUPABASE_SYNC_SNAPSHOT_FILE,
|
| 27 |
SUPABASE_URL,
|
| 28 |
)
|
| 29 |
+
from database.document_db import init_document_db
|
| 30 |
+
from services.supabase_sync_service import SupabaseStorageSyncService, SupabaseSyncCoordinator
|
| 31 |
+
from rag.collection_router_retriever import CollectionRouterRetriever
|
| 32 |
+
from rag.vectorstore import build_vectorstore_improved, load_vectorstore_improved
|
| 33 |
+
from rag.models import embeddings
|
| 34 |
+
from rag.qa_pipeline import ask_ai_improved, ask_ai_stream_delta
|
| 35 |
from api.admin_sync_router import router as admin_sync_router
|
| 36 |
|
| 37 |
# Hàm log lỗi an toàn
|
{core → rag}/analyze_and_expand.py
RENAMED
|
@@ -21,8 +21,7 @@ def clean_json_string(text: str) -> str:
|
|
| 21 |
def analyze_and_expand_query(question: str) -> Dict[str, Any]:
|
| 22 |
print(" Phân tích & Mở rộng câu hỏi...")
|
| 23 |
|
| 24 |
-
|
| 25 |
-
from .qa_pipeline import api_manager
|
| 26 |
|
| 27 |
# Prompt được tối ưu để ép AI trả về JSON chuẩn
|
| 28 |
prompt = f"""
|
|
|
|
| 21 |
def analyze_and_expand_query(question: str) -> Dict[str, Any]:
|
| 22 |
print(" Phân tích & Mở rộng câu hỏi...")
|
| 23 |
|
| 24 |
+
from core.ai_provider import api_manager
|
|
|
|
| 25 |
|
| 26 |
# Prompt được tối ưu để ép AI trả về JSON chuẩn
|
| 27 |
prompt = f"""
|
rag/chunking.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 3 |
+
from core.config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 4 |
+
|
| 5 |
+
def smart_chunking(docs: List) -> List:
|
| 6 |
+
print("Đang áp dụng Smart Chunking (Regex Lookahead)...")
|
| 7 |
+
|
| 8 |
+
# Cấu hình Regex bắt cấu trúc phân cấp hành chính (Chương -> Điều -> Khoản)
|
| 9 |
+
legal_splitter = RecursiveCharacterTextSplitter(
|
| 10 |
+
chunk_size=CHUNK_SIZE,
|
| 11 |
+
chunk_overlap=CHUNK_OVERLAP,
|
| 12 |
+
separators=[
|
| 13 |
+
"\nChương ",
|
| 14 |
+
"\nĐiều ",
|
| 15 |
+
"\nKhoản ",
|
| 16 |
+
"\n\n",
|
| 17 |
+
r"\n(?=\d+\.)",
|
| 18 |
+
r"\n(?=[a-z]\.)",
|
| 19 |
+
r"\n(?=-|\+)",
|
| 20 |
+
"\n", " ", ""
|
| 21 |
+
],
|
| 22 |
+
length_function=len,
|
| 23 |
+
is_separator_regex=True
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
chunks = []
|
| 27 |
+
for doc in docs:
|
| 28 |
+
doc_chunks = legal_splitter.split_text(doc.page_content)
|
| 29 |
+
|
| 30 |
+
for chunk_text in doc_chunks:
|
| 31 |
+
new_doc = type(doc)(
|
| 32 |
+
page_content=chunk_text,
|
| 33 |
+
metadata=doc.metadata.copy()
|
| 34 |
+
)
|
| 35 |
+
chunks.append(new_doc)
|
| 36 |
+
|
| 37 |
+
print(f"Đã tạo {len(chunks)} chunks thông minh.")
|
| 38 |
+
return chunks
|
{core → rag}/collection_router_retriever.py
RENAMED
|
@@ -5,25 +5,39 @@ from typing import List
|
|
| 5 |
from langchain_core.documents import Document as LangChainDocument
|
| 6 |
from rank_bm25 import BM25Okapi
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from .collection_utils import collection_matches_cohort
|
| 9 |
-
from .document_db import SessionLocal, list_active_collection_names
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
class CollectionRouterRetriever:
|
| 15 |
def __init__(
|
| 16 |
self,
|
| 17 |
-
|
| 18 |
-
qdrant_client,
|
| 19 |
embeddings_model,
|
| 20 |
top_n_collections: int = 3,
|
| 21 |
) -> None:
|
| 22 |
-
self.base_retriever = base_retriever
|
| 23 |
self.qdrant_client = qdrant_client
|
| 24 |
self.embeddings_model = embeddings_model
|
| 25 |
self.top_n_collections = max(1, int(top_n_collections or 3))
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
@staticmethod
|
| 29 |
def _doc_key(doc) -> str:
|
|
@@ -63,63 +77,91 @@ class CollectionRouterRetriever:
|
|
| 63 |
|
| 64 |
return active_collections[: self.top_n_collections]
|
| 65 |
|
| 66 |
-
def _ensure_bm25_loaded(self, collection_name: str) -> BM25Okapi | None:
|
| 67 |
-
"""Lazy load and cache BM25 index for a collection
|
| 68 |
-
|
| 69 |
-
First time: fetch all docs from Qdrant, build BM25, cache it (~0.3s)
|
| 70 |
-
Subsequent times: reuse from cache (~0.001s)
|
| 71 |
-
"""
|
| 72 |
-
# Check if already cached
|
| 73 |
-
if collection_name in self._bm25_cache:
|
| 74 |
-
return self._bm25_cache[collection_name]
|
| 75 |
|
|
|
|
| 76 |
try:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
if not points_list:
|
| 86 |
logger.warning("No documents found in collection=%s for BM25 indexing", collection_name)
|
| 87 |
return None
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
if not points_list:
|
| 92 |
-
logger.warning("No valid points found in collection=%s after filtering", collection_name)
|
| 93 |
-
return None
|
| 94 |
-
|
| 95 |
-
# Extract documents and tokenize for BM25
|
| 96 |
-
docs_for_bm25 = []
|
| 97 |
for point in points_list:
|
| 98 |
payload = point.payload if isinstance(point.payload, dict) else {}
|
| 99 |
content = str(payload.get("content") or "").strip()
|
| 100 |
if content:
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
if not
|
| 104 |
logger.warning("No valid content found in collection=%s for BM25 indexing", collection_name)
|
| 105 |
return None
|
| 106 |
|
| 107 |
-
|
| 108 |
-
tokenized_docs = [doc.lower().split() for doc in docs_for_bm25]
|
| 109 |
bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
|
| 110 |
|
| 111 |
-
# Cache
|
| 112 |
-
self._bm25_cache[collection_name] =
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
return bm25
|
| 116 |
|
| 117 |
except Exception:
|
| 118 |
logger.exception("Failed to build BM25 index for collection=%s", collection_name)
|
| 119 |
return None
|
| 120 |
|
| 121 |
def _search_target_collections(self, query: str, collections: List[str], limit: int, alpha: float = 0.6) -> List:
|
| 122 |
-
"""Hybrid search: BM25 + Vector + RRF
|
| 123 |
if not collections:
|
| 124 |
return []
|
| 125 |
|
|
@@ -129,9 +171,9 @@ class CollectionRouterRetriever:
|
|
| 129 |
logger.exception("Failed to embed query for collection routing")
|
| 130 |
return []
|
| 131 |
|
| 132 |
-
# Step 1: Vector search
|
| 133 |
-
all_docs_dict = {}
|
| 134 |
-
vector_ranked = {}
|
| 135 |
|
| 136 |
vector_rank = 0
|
| 137 |
for collection_name in collections:
|
|
@@ -171,60 +213,45 @@ class CollectionRouterRetriever:
|
|
| 171 |
vector_rank += 1
|
| 172 |
vector_ranked[doc_key] = vector_rank
|
| 173 |
|
| 174 |
-
# Step 2: BM25 search
|
| 175 |
-
bm25_ranked = {}
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
tokenized_query = query.lower().split()
|
| 183 |
-
|
| 184 |
-
# For each collection, use cached BM25 index
|
| 185 |
for collection_name in collections:
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
if bm25 is None:
|
| 189 |
continue
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
# Build BM25 index for this subset and score
|
| 204 |
-
if content_for_bm25:
|
| 205 |
-
tokenized_subset = [content.lower().split() for content in content_for_bm25]
|
| 206 |
-
bm25_subset = BM25Okapi(tokenized_subset, k1=1.5, b=0.5)
|
| 207 |
-
bm25_results = bm25_subset.get_top_n(tokenized_query, content_for_bm25, n=len(content_for_bm25))
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
# Find matching doc by content (handles duplicates)
|
| 212 |
-
matched_doc = None
|
| 213 |
-
for doc in docs_from_collection:
|
| 214 |
-
if doc.page_content == content:
|
| 215 |
-
matched_doc = doc
|
| 216 |
-
break
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
bm25_rank += 1
|
| 222 |
-
bm25_ranked[doc_key] = bm25_rank
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
# Step 3: RRF combination
|
| 228 |
alpha = max(0.0, min(1.0, float(alpha)))
|
| 229 |
bm25_weight = 1.0 - alpha
|
| 230 |
vector_weight = alpha
|
|
@@ -234,18 +261,15 @@ class CollectionRouterRetriever:
|
|
| 234 |
for doc_key, doc in all_docs_dict.items():
|
| 235 |
score = 0.0
|
| 236 |
|
| 237 |
-
# Vector score
|
| 238 |
if doc_key in vector_ranked:
|
| 239 |
score += vector_weight / (rrf_c + vector_ranked[doc_key])
|
| 240 |
|
| 241 |
-
# BM25 score
|
| 242 |
if doc_key in bm25_ranked:
|
| 243 |
score += bm25_weight / (rrf_c + bm25_ranked[doc_key])
|
| 244 |
|
| 245 |
if score > 0:
|
| 246 |
rrf_scores[doc_key] = score
|
| 247 |
|
| 248 |
-
# Sort by RRF score
|
| 249 |
sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
|
| 250 |
return [all_docs_dict[doc_key] for doc_key, _ in sorted_results[:limit]]
|
| 251 |
|
|
@@ -267,54 +291,19 @@ class CollectionRouterRetriever:
|
|
| 267 |
alpha=alpha,
|
| 268 |
)
|
| 269 |
|
| 270 |
-
# Log warning if no documents found
|
| 271 |
if not routed_docs:
|
| 272 |
logger.warning("No documents found for query=%s, cohort=%s", query[:50], cohort_key)
|
| 273 |
-
|
| 274 |
-
if cohort_scoped:
|
| 275 |
-
deduplicated = []
|
| 276 |
-
seen = set()
|
| 277 |
-
for doc in routed_docs:
|
| 278 |
-
key = self._doc_key(doc)
|
| 279 |
-
if key in seen:
|
| 280 |
-
continue
|
| 281 |
-
seen.add(key)
|
| 282 |
-
deduplicated.append(doc)
|
| 283 |
-
if len(deduplicated) >= candidate_k:
|
| 284 |
-
break
|
| 285 |
-
return deduplicated[:k]
|
| 286 |
-
|
| 287 |
-
fallback_docs = []
|
| 288 |
-
if self.base_retriever is not None:
|
| 289 |
-
try:
|
| 290 |
-
fallback_docs = self.base_retriever.search(
|
| 291 |
-
query,
|
| 292 |
-
k=candidate_k,
|
| 293 |
-
alpha=alpha,
|
| 294 |
-
cohort_key=cohort_key,
|
| 295 |
-
)
|
| 296 |
-
except TypeError:
|
| 297 |
-
fallback_docs = self.base_retriever.search(
|
| 298 |
-
query,
|
| 299 |
-
k=candidate_k,
|
| 300 |
-
alpha=alpha,
|
| 301 |
-
)
|
| 302 |
-
except Exception:
|
| 303 |
-
logger.exception("Base retriever fallback failed")
|
| 304 |
|
| 305 |
deduplicated = []
|
| 306 |
seen = set()
|
| 307 |
-
|
| 308 |
-
# Safe handling of fallback_docs which might be None
|
| 309 |
-
fallback_docs_list = list(fallback_docs) if fallback_docs else []
|
| 310 |
-
|
| 311 |
-
for doc in routed_docs + fallback_docs_list:
|
| 312 |
key = self._doc_key(doc)
|
| 313 |
if key in seen:
|
| 314 |
continue
|
| 315 |
seen.add(key)
|
| 316 |
deduplicated.append(doc)
|
| 317 |
-
if len(deduplicated) >=
|
| 318 |
break
|
| 319 |
-
|
| 320 |
-
return deduplicated
|
|
|
|
| 5 |
from langchain_core.documents import Document as LangChainDocument
|
| 6 |
from rank_bm25 import BM25Okapi
|
| 7 |
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from pyvi import ViTokenizer
|
| 11 |
+
except Exception:
|
| 12 |
+
ViTokenizer = None
|
| 13 |
+
|
| 14 |
from .collection_utils import collection_matches_cohort
|
| 15 |
+
from database.document_db import SessionLocal, list_active_collection_names
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
|
| 20 |
+
def _vi_tokenize(text: str) -> List[str]:
|
| 21 |
+
normalized = (text or "").lower().strip()
|
| 22 |
+
if not normalized:
|
| 23 |
+
return []
|
| 24 |
+
if ViTokenizer is None:
|
| 25 |
+
return normalized.split()
|
| 26 |
+
return ViTokenizer.tokenize(normalized).split()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
class CollectionRouterRetriever:
|
| 30 |
def __init__(
|
| 31 |
self,
|
| 32 |
+
qdrant_client,
|
|
|
|
| 33 |
embeddings_model,
|
| 34 |
top_n_collections: int = 3,
|
| 35 |
) -> None:
|
|
|
|
| 36 |
self.qdrant_client = qdrant_client
|
| 37 |
self.embeddings_model = embeddings_model
|
| 38 |
self.top_n_collections = max(1, int(top_n_collections or 3))
|
| 39 |
+
# Cache giờ đây lưu một dict: { 'bm25': obj, 'corpus_docs': list, 'count': int }
|
| 40 |
+
self._bm25_cache = {}
|
| 41 |
|
| 42 |
@staticmethod
|
| 43 |
def _doc_key(doc) -> str:
|
|
|
|
| 77 |
|
| 78 |
return active_collections[: self.top_n_collections]
|
| 79 |
|
| 80 |
+
def _ensure_bm25_loaded(self, collection_name: str) -> tuple[BM25Okapi, List[LangChainDocument]] | None:
|
| 81 |
+
"""Lazy load and cache BM25 index and corpus for a collection (với cơ chế tự động làm mới Cache)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
# 1. Lấy tổng số chunks hiện tại trong Qdrant (Rất nhanh, tốn < 10ms)
|
| 84 |
try:
|
| 85 |
+
collection_info = self.qdrant_client.get_collection(collection_name)
|
| 86 |
+
current_count = collection_info.points_count
|
| 87 |
+
except Exception:
|
| 88 |
+
logger.exception("Failed to get collection info for %s", collection_name)
|
| 89 |
+
return None
|
| 90 |
+
|
| 91 |
+
# 2. Kiểm tra Cache: Nếu chưa có hoặc số lượng thay đổi -> Xóa cache build lại
|
| 92 |
+
cached_data = self._bm25_cache.get(collection_name)
|
| 93 |
+
if cached_data and cached_data.get('count') == current_count:
|
| 94 |
+
# Tái sử dụng (Phải trả về cả bm25 VÀ corpus_docs để map điểm)
|
| 95 |
+
return cached_data['bm25'], cached_data['corpus_docs']
|
| 96 |
|
| 97 |
+
logger.info(f"Phát hiện dữ liệu mới hoặc chưa có cache cho {collection_name} (Count: {current_count}). Đang build lại BM25...")
|
| 98 |
+
|
| 99 |
+
try:
|
| 100 |
+
points_list = []
|
| 101 |
+
offset = None
|
| 102 |
+
# Phân trang để lấy TOÀN BỘ documents từ collection
|
| 103 |
+
while True:
|
| 104 |
+
response = self.qdrant_client.scroll(
|
| 105 |
+
collection_name=collection_name,
|
| 106 |
+
limit=10000,
|
| 107 |
+
offset=offset,
|
| 108 |
+
with_payload=True,
|
| 109 |
+
with_vectors=False
|
| 110 |
+
)
|
| 111 |
+
batch_points, next_offset = response
|
| 112 |
+
points_list.extend([p for p in batch_points if p is not None])
|
| 113 |
+
|
| 114 |
+
offset = next_offset
|
| 115 |
+
if offset is None:
|
| 116 |
+
break
|
| 117 |
|
| 118 |
if not points_list:
|
| 119 |
logger.warning("No documents found in collection=%s for BM25 indexing", collection_name)
|
| 120 |
return None
|
| 121 |
|
| 122 |
+
# Trích xuất content và build documents
|
| 123 |
+
corpus_docs = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
for point in points_list:
|
| 125 |
payload = point.payload if isinstance(point.payload, dict) else {}
|
| 126 |
content = str(payload.get("content") or "").strip()
|
| 127 |
if content:
|
| 128 |
+
metadata = {
|
| 129 |
+
"source": payload.get("path") or payload.get("object_path") or payload.get("stored_name") or "",
|
| 130 |
+
"source_file": payload.get("filename") or payload.get("stored_name") or "",
|
| 131 |
+
"source_relpath": payload.get("object_path") or payload.get("path") or "",
|
| 132 |
+
"object_path": payload.get("object_path") or "",
|
| 133 |
+
"folder_key": payload.get("folder_key") or "",
|
| 134 |
+
"collection_name": collection_name,
|
| 135 |
+
"academic_year": payload.get("academic_year") or "",
|
| 136 |
+
"chunk_index": payload.get("chunk_index"),
|
| 137 |
+
"page_number": payload.get("page_number"),
|
| 138 |
+
}
|
| 139 |
+
doc = LangChainDocument(page_content=content, metadata=metadata)
|
| 140 |
+
corpus_docs.append(doc)
|
| 141 |
|
| 142 |
+
if not corpus_docs:
|
| 143 |
logger.warning("No valid content found in collection=%s for BM25 indexing", collection_name)
|
| 144 |
return None
|
| 145 |
|
| 146 |
+
tokenized_docs = [_vi_tokenize(doc.page_content) for doc in corpus_docs]
|
|
|
|
| 147 |
bm25 = BM25Okapi(tokenized_docs, k1=1.5, b=0.5)
|
| 148 |
|
| 149 |
+
# 3. Lưu lại Cache kèm the con số count và corpus_docs để đối chiếu lần sau
|
| 150 |
+
self._bm25_cache[collection_name] = {
|
| 151 |
+
'bm25': bm25,
|
| 152 |
+
'corpus_docs': corpus_docs,
|
| 153 |
+
'count': current_count
|
| 154 |
+
}
|
| 155 |
+
logger.info("BM25 index built and cached for collection=%s (docs=%d)", collection_name, len(corpus_docs))
|
| 156 |
|
| 157 |
+
return bm25, corpus_docs
|
| 158 |
|
| 159 |
except Exception:
|
| 160 |
logger.exception("Failed to build BM25 index for collection=%s", collection_name)
|
| 161 |
return None
|
| 162 |
|
| 163 |
def _search_target_collections(self, query: str, collections: List[str], limit: int, alpha: float = 0.6) -> List:
|
| 164 |
+
"""Hybrid search: BM25 + Vector + RRF"""
|
| 165 |
if not collections:
|
| 166 |
return []
|
| 167 |
|
|
|
|
| 171 |
logger.exception("Failed to embed query for collection routing")
|
| 172 |
return []
|
| 173 |
|
| 174 |
+
# Step 1: Vector search
|
| 175 |
+
all_docs_dict = {}
|
| 176 |
+
vector_ranked = {}
|
| 177 |
|
| 178 |
vector_rank = 0
|
| 179 |
for collection_name in collections:
|
|
|
|
| 213 |
vector_rank += 1
|
| 214 |
vector_ranked[doc_key] = vector_rank
|
| 215 |
|
| 216 |
+
# Step 2: BM25 search
|
| 217 |
+
bm25_ranked = {}
|
| 218 |
+
try:
|
| 219 |
+
tokenized_query = _vi_tokenize(query)
|
| 220 |
+
|
| 221 |
+
if not tokenized_query:
|
| 222 |
+
logger.warning("Query is empty after tokenization, skipping BM25 search")
|
| 223 |
+
else:
|
|
|
|
|
|
|
|
|
|
| 224 |
for collection_name in collections:
|
| 225 |
+
bm25_data = self._ensure_bm25_loaded(collection_name)
|
| 226 |
+
if bm25_data is None:
|
|
|
|
| 227 |
continue
|
| 228 |
+
|
| 229 |
+
bm25, corpus_docs = bm25_data
|
| 230 |
+
|
| 231 |
+
scores = bm25.get_scores(tokenized_query)
|
| 232 |
+
scored_docs = sorted(zip(corpus_docs, scores), key=lambda x: x[1], reverse=True)
|
| 233 |
+
|
| 234 |
+
bm25_rank = 0
|
| 235 |
+
for doc, score in scored_docs:
|
| 236 |
+
if score <= 0:
|
| 237 |
+
break
|
| 238 |
+
|
| 239 |
+
doc_key = self._doc_key(doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
if doc_key not in all_docs_dict:
|
| 242 |
+
all_docs_dict[doc_key] = doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
+
if doc_key not in bm25_ranked:
|
| 245 |
+
bm25_rank += 1
|
| 246 |
+
bm25_ranked[doc_key] = bm25_rank
|
|
|
|
|
|
|
| 247 |
|
| 248 |
+
if bm25_rank >= limit:
|
| 249 |
+
break
|
| 250 |
+
|
| 251 |
+
except Exception:
|
| 252 |
+
logger.exception("BM25 search failed, falling back to vector-only")
|
| 253 |
|
| 254 |
+
# Step 3: RRF combination
|
| 255 |
alpha = max(0.0, min(1.0, float(alpha)))
|
| 256 |
bm25_weight = 1.0 - alpha
|
| 257 |
vector_weight = alpha
|
|
|
|
| 261 |
for doc_key, doc in all_docs_dict.items():
|
| 262 |
score = 0.0
|
| 263 |
|
|
|
|
| 264 |
if doc_key in vector_ranked:
|
| 265 |
score += vector_weight / (rrf_c + vector_ranked[doc_key])
|
| 266 |
|
|
|
|
| 267 |
if doc_key in bm25_ranked:
|
| 268 |
score += bm25_weight / (rrf_c + bm25_ranked[doc_key])
|
| 269 |
|
| 270 |
if score > 0:
|
| 271 |
rrf_scores[doc_key] = score
|
| 272 |
|
|
|
|
| 273 |
sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
|
| 274 |
return [all_docs_dict[doc_key] for doc_key, _ in sorted_results[:limit]]
|
| 275 |
|
|
|
|
| 291 |
alpha=alpha,
|
| 292 |
)
|
| 293 |
|
|
|
|
| 294 |
if not routed_docs:
|
| 295 |
logger.warning("No documents found for query=%s, cohort=%s", query[:50], cohort_key)
|
| 296 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
deduplicated = []
|
| 299 |
seen = set()
|
| 300 |
+
for doc in routed_docs:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
key = self._doc_key(doc)
|
| 302 |
if key in seen:
|
| 303 |
continue
|
| 304 |
seen.add(key)
|
| 305 |
deduplicated.append(doc)
|
| 306 |
+
if len(deduplicated) >= k:
|
| 307 |
break
|
| 308 |
+
|
| 309 |
+
return deduplicated
|
{core → rag}/collection_utils.py
RENAMED
|
File without changes
|
{core → rag}/models.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
from sentence_transformers import CrossEncoder
|
| 3 |
-
from .config import EMBED_MODEL, CROSS_ENCODER_MODEL
|
| 4 |
|
| 5 |
# Khởi tạo Embedding model - Chạy trên CPU của Hugging Face
|
| 6 |
embeddings = HuggingFaceEmbeddings(
|
|
|
|
| 1 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
from sentence_transformers import CrossEncoder
|
| 3 |
+
from core.config import EMBED_MODEL, CROSS_ENCODER_MODEL
|
| 4 |
|
| 5 |
# Khởi tạo Embedding model - Chạy trên CPU của Hugging Face
|
| 6 |
embeddings = HuggingFaceEmbeddings(
|
{core → rag}/prompting.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
def create_advanced_prompt(question: str, context: str, question_type: str, topic: str = None
|
| 2 |
# Base system - Định nghĩa tư duy cho AI
|
| 3 |
base_system = """Bạn là Trợ lý AI chuyên gia về Pháp chế và Quy định Đại học. Nhiệm vụ của bạn là hỗ trợ tra cứu thông tin chính xác từ các văn bản quy phạm nội bộ (Quyết định, Thông tư, Quy định...).
|
| 4 |
|
|
@@ -97,25 +97,12 @@ Về vấn đề [Chủ đề], theo **Điều [Số]**, các trường hợp ng
|
|
| 97 |
else:
|
| 98 |
topic_instr = ""
|
| 99 |
|
| 100 |
-
# [YEAR-AWARE CHANGE] Rang buoc cau tra loi theo nam hoc duoc hoi.
|
| 101 |
-
if year_scope:
|
| 102 |
-
year_instr = (
|
| 103 |
-
f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
|
| 104 |
-
f"- Người dùng đang hỏi trong phạm vi năm: **{year_scope}**.\n"
|
| 105 |
-
f"- Ưu tiên các đoạn có nhãn nguồn cùng năm trong context (ví dụ: [Năm 2022-2023 | ...]).\n"
|
| 106 |
-
f"- Nếu chưa đủ bằng chứng đúng năm, được phép dùng đoạn có nhãn 'Áp dụng nhiều năm' hoặc quy định gần nhất và phải ghi chú rõ phạm vi áp dụng.\n"
|
| 107 |
-
f"- Không kết luận 'không có dữ liệu' chỉ vì thiếu đúng nhãn năm nếu vẫn có quy định bao quát liên quan.\n"
|
| 108 |
-
)
|
| 109 |
-
else:
|
| 110 |
-
year_instr = ""
|
| 111 |
-
|
| 112 |
# 4. Gộp Prompt
|
| 113 |
full_prompt = f"""{base_system}
|
| 114 |
----------------
|
| 115 |
{example}
|
| 116 |
----------------
|
| 117 |
{topic_instr}
|
| 118 |
-
{year_instr}
|
| 119 |
|
| 120 |
**TÀI LIỆU THAM KHẢO (CONTEXT):**
|
| 121 |
{context}
|
|
|
|
| 1 |
+
def create_advanced_prompt(question: str, context: str, question_type: str, topic: str = None) -> str:
|
| 2 |
# Base system - Định nghĩa tư duy cho AI
|
| 3 |
base_system = """Bạn là Trợ lý AI chuyên gia về Pháp chế và Quy định Đại học. Nhiệm vụ của bạn là hỗ trợ tra cứu thông tin chính xác từ các văn bản quy phạm nội bộ (Quyết định, Thông tư, Quy định...).
|
| 4 |
|
|
|
|
| 97 |
else:
|
| 98 |
topic_instr = ""
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# 4. Gộp Prompt
|
| 101 |
full_prompt = f"""{base_system}
|
| 102 |
----------------
|
| 103 |
{example}
|
| 104 |
----------------
|
| 105 |
{topic_instr}
|
|
|
|
| 106 |
|
| 107 |
**TÀI LIỆU THAM KHẢO (CONTEXT):**
|
| 108 |
{context}
|
{core → rag}/qa_pipeline.py
RENAMED
|
@@ -1,128 +1,24 @@
|
|
| 1 |
from typing import List, Generator
|
| 2 |
import os, re, hashlib
|
| 3 |
import logging
|
| 4 |
-
import groq
|
| 5 |
import google.generativeai as genai
|
| 6 |
import json
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
from threading import Lock
|
| 9 |
-
|
| 10 |
-
from .
|
| 11 |
-
from .config import TOP_K_RESULTS, FINAL_TOP_K
|
| 12 |
from .rerank import advanced_rerank
|
| 13 |
from .prompting import create_advanced_prompt
|
| 14 |
-
from .retriever import HybridRetriever
|
| 15 |
from .analyze_and_expand import analyze_and_expand_query
|
| 16 |
-
from .llm_utils import safe_invoke, safe_stream
|
| 17 |
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
-
# Giữ nguyên các hằng số
|
| 21 |
MAX_CONTEXT_CHARS = 12000
|
| 22 |
MAX_DOC_CHARS = 1800
|
| 23 |
MAX_OUT_CHARS = 3000
|
| 24 |
-
# [YEAR-AWARE CHANGE] Pattern nhan dien nam hoc trong cau hoi.
|
| 25 |
-
ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
|
| 26 |
-
SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
|
| 27 |
|
| 28 |
# Quản lý API Keys cho Groq và Gemini với xoay tua tự động khi gặp lỗi hoặc hết hạn
|
| 29 |
-
class AIProviderManager:
|
| 30 |
-
def __init__(self):
|
| 31 |
-
# Lấy danh sách keys
|
| 32 |
-
self.groq_keys = [k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(",") if k.strip()]
|
| 33 |
-
self.gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(",") if k.strip()]
|
| 34 |
-
self.groq_idx = 0
|
| 35 |
-
self.gemini_idx = 0
|
| 36 |
-
|
| 37 |
-
def get_groq_client(self):
|
| 38 |
-
if not self.groq_keys: return None
|
| 39 |
-
return groq.Groq(api_key=self.groq_keys[self.groq_idx])
|
| 40 |
-
|
| 41 |
-
def rotate_groq(self):
|
| 42 |
-
if len(self.groq_keys) > 1:
|
| 43 |
-
self.groq_idx = (self.groq_idx + 1) % len(self.groq_keys)
|
| 44 |
-
logger.info(f" Đã xoay sang Groq Key thứ {self.groq_idx + 1}")
|
| 45 |
-
|
| 46 |
-
def get_gemini_key(self):
|
| 47 |
-
if not self.gemini_keys: return None
|
| 48 |
-
return self.gemini_keys[self.gemini_idx]
|
| 49 |
-
|
| 50 |
-
def rotate_gemini(self):
|
| 51 |
-
if len(self.gemini_keys) > 1:
|
| 52 |
-
self.gemini_idx = (self.gemini_idx + 1) % len(self.gemini_keys)
|
| 53 |
-
logger.info(f"Đã xoay sang Gemini Key dự phòng")
|
| 54 |
-
|
| 55 |
-
api_manager = AIProviderManager()
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def normalize_academic_year(start_year: str, end_year: str) -> str:
|
| 59 |
-
return f"{int(start_year):04d}-{int(end_year):04d}"
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# [YEAR-AWARE CHANGE] Trich xuat nam yeu cau tu cau hoi.
|
| 63 |
-
def detect_requested_year(text: str) -> tuple[str, set]:
|
| 64 |
-
"""Phat hien nam hoc duoc nhac den trong cau hoi."""
|
| 65 |
-
requested_range = ""
|
| 66 |
-
mentioned_years = set()
|
| 67 |
-
|
| 68 |
-
for start_year, end_year in ACADEMIC_YEAR_PATTERN.findall(text or ""):
|
| 69 |
-
requested_range = normalize_academic_year(start_year, end_year)
|
| 70 |
-
mentioned_years.add(start_year)
|
| 71 |
-
mentioned_years.add(end_year)
|
| 72 |
-
|
| 73 |
-
for year in SINGLE_YEAR_PATTERN.findall(text or ""):
|
| 74 |
-
mentioned_years.add(year)
|
| 75 |
-
|
| 76 |
-
return requested_range, mentioned_years
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def infer_doc_academic_year(doc) -> str:
|
| 80 |
-
metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
|
| 81 |
-
existing_year = metadata.get("academic_year")
|
| 82 |
-
if existing_year:
|
| 83 |
-
return existing_year
|
| 84 |
|
| 85 |
-
source_text = " ".join(
|
| 86 |
-
str(x) for x in [
|
| 87 |
-
metadata.get("source_relpath"),
|
| 88 |
-
metadata.get("source"),
|
| 89 |
-
metadata.get("source_file"),
|
| 90 |
-
]
|
| 91 |
-
if x
|
| 92 |
-
)
|
| 93 |
-
match = ACADEMIC_YEAR_PATTERN.search(source_text)
|
| 94 |
-
if match:
|
| 95 |
-
year = normalize_academic_year(match.group(1), match.group(2))
|
| 96 |
-
metadata["academic_year"] = year
|
| 97 |
-
doc.metadata = metadata
|
| 98 |
-
return year
|
| 99 |
-
|
| 100 |
-
metadata["academic_year"] = "ALL"
|
| 101 |
-
doc.metadata = metadata
|
| 102 |
-
return "ALL"
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
# [YEAR-AWARE CHANGE] Loc tai lieu theo metadata nam hoc.
|
| 106 |
-
def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
|
| 107 |
-
if not requested_range and not mentioned_years:
|
| 108 |
-
return docs
|
| 109 |
-
|
| 110 |
-
filtered_docs = []
|
| 111 |
-
for doc in docs:
|
| 112 |
-
doc_year = infer_doc_academic_year(doc)
|
| 113 |
-
if doc_year == "ALL":
|
| 114 |
-
filtered_docs.append(doc)
|
| 115 |
-
continue
|
| 116 |
-
|
| 117 |
-
if requested_range and doc_year == requested_range:
|
| 118 |
-
filtered_docs.append(doc)
|
| 119 |
-
continue
|
| 120 |
-
|
| 121 |
-
doc_year_tokens = set(SINGLE_YEAR_PATTERN.findall(doc_year))
|
| 122 |
-
if doc_year_tokens.intersection(mentioned_years):
|
| 123 |
-
filtered_docs.append(doc)
|
| 124 |
-
|
| 125 |
-
return filtered_docs
|
| 126 |
|
| 127 |
def sanitize_for_prompt(text: str) -> str:
|
| 128 |
"""Lọc bỏ prompt injection và PII """
|
|
@@ -261,7 +157,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
|
|
| 261 |
if cohort_key:
|
| 262 |
logger.info(f"Sử dụng cohort_key: {cohort_key}")
|
| 263 |
|
| 264 |
-
# Gửi song song các truy vấn đến Qdrant
|
| 265 |
def search_query(query: str):
|
| 266 |
current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
|
| 267 |
return hybrid_retriever.search(
|
|
@@ -293,18 +188,21 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
|
|
| 293 |
final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
|
| 294 |
|
| 295 |
context_parts = []
|
| 296 |
-
context_docs = []
|
| 297 |
total_chars = 0
|
|
|
|
| 298 |
for doc in final_docs:
|
| 299 |
page = doc.metadata.get('page_number', 'N/A')
|
| 300 |
file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
|
| 301 |
source = f"[{os.path.basename(file_name)} | Trang {page}]" if file_name else f"[Trang {page}]"
|
| 302 |
block = f"{source}\n{doc.page_content}"
|
|
|
|
|
|
|
| 303 |
if total_chars + len(block) > MAX_CONTEXT_CHARS:
|
| 304 |
-
|
|
|
|
| 305 |
total_chars += len(block)
|
| 306 |
context_parts.append(block)
|
| 307 |
-
# Lưu metadata cho phần tài liệu tham khảo ở cuối
|
| 308 |
context_docs.append({
|
| 309 |
'source': file_name or "Không rõ",
|
| 310 |
'page': page
|
|
@@ -318,7 +216,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
|
|
| 318 |
logger.info("Đang tạo câu trả lời cuối cùng ...")
|
| 319 |
|
| 320 |
success = False
|
| 321 |
-
# Ưu tiên Groq (tiết kiệm token)
|
| 322 |
for _ in range(len(api_manager.groq_keys) if api_manager.groq_keys else 1):
|
| 323 |
try:
|
| 324 |
client = api_manager.get_groq_client()
|
|
@@ -336,13 +233,12 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
|
|
| 336 |
success = True
|
| 337 |
break
|
| 338 |
except Exception as e:
|
| 339 |
-
if "429" in str(e):
|
| 340 |
api_manager.rotate_groq()
|
| 341 |
continue
|
| 342 |
logger.error(f"Lỗi Groq: {e}")
|
| 343 |
break
|
| 344 |
|
| 345 |
-
# Fallback sang Gemini nếu Groq lỗi
|
| 346 |
if not success:
|
| 347 |
logger.warning("Chuyển sang Gemini ...")
|
| 348 |
for _ in range(len(api_manager.gemini_keys) if api_manager.gemini_keys else 1):
|
|
@@ -363,7 +259,6 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever, cohort_ke
|
|
| 363 |
yield "Đã xảy ra lỗi hệ thống hoặc quá tải. Vui lòng thử lại sau giây lát!"
|
| 364 |
return
|
| 365 |
|
| 366 |
-
# Thêm phần Tài liệu tham khảo ở cuối
|
| 367 |
if context_docs:
|
| 368 |
yield "\n\n---\n\n"
|
| 369 |
yield "## 📚 Tài liệu tham khảo\n\n"
|
|
|
|
| 1 |
from typing import List, Generator
|
| 2 |
import os, re, hashlib
|
| 3 |
import logging
|
|
|
|
| 4 |
import google.generativeai as genai
|
| 5 |
import json
|
| 6 |
from concurrent.futures import ThreadPoolExecutor
|
| 7 |
from threading import Lock
|
| 8 |
+
from core.ai_provider import api_manager
|
| 9 |
+
from core.config import TOP_K_RESULTS, FINAL_TOP_K
|
|
|
|
| 10 |
from .rerank import advanced_rerank
|
| 11 |
from .prompting import create_advanced_prompt
|
|
|
|
| 12 |
from .analyze_and_expand import analyze_and_expand_query
|
|
|
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
|
|
| 16 |
MAX_CONTEXT_CHARS = 12000
|
| 17 |
MAX_DOC_CHARS = 1800
|
| 18 |
MAX_OUT_CHARS = 3000
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# Quản lý API Keys cho Groq và Gemini với xoay tua tự động khi gặp lỗi hoặc hết hạn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def sanitize_for_prompt(text: str) -> str:
|
| 24 |
"""Lọc bỏ prompt injection và PII """
|
|
|
|
| 157 |
if cohort_key:
|
| 158 |
logger.info(f"Sử dụng cohort_key: {cohort_key}")
|
| 159 |
|
|
|
|
| 160 |
def search_query(query: str):
|
| 161 |
current_alpha = 0.4 if "CNTT" in query.upper() else 0.5
|
| 162 |
return hybrid_retriever.search(
|
|
|
|
| 188 |
final_docs = advanced_rerank(question, all_docs, top_k=FINAL_TOP_K)
|
| 189 |
|
| 190 |
context_parts = []
|
| 191 |
+
context_docs = []
|
| 192 |
total_chars = 0
|
| 193 |
+
|
| 194 |
for doc in final_docs:
|
| 195 |
page = doc.metadata.get('page_number', 'N/A')
|
| 196 |
file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
|
| 197 |
source = f"[{os.path.basename(file_name)} | Trang {page}]" if file_name else f"[Trang {page}]"
|
| 198 |
block = f"{source}\n{doc.page_content}"
|
| 199 |
+
|
| 200 |
+
# Dùng continue để nhét tối đa các chunk ngắn thay vì break làm đứt gánh
|
| 201 |
if total_chars + len(block) > MAX_CONTEXT_CHARS:
|
| 202 |
+
continue
|
| 203 |
+
|
| 204 |
total_chars += len(block)
|
| 205 |
context_parts.append(block)
|
|
|
|
| 206 |
context_docs.append({
|
| 207 |
'source': file_name or "Không rõ",
|
| 208 |
'page': page
|
|
|
|
| 216 |
logger.info("Đang tạo câu trả lời cuối cùng ...")
|
| 217 |
|
| 218 |
success = False
|
|
|
|
| 219 |
for _ in range(len(api_manager.groq_keys) if api_manager.groq_keys else 1):
|
| 220 |
try:
|
| 221 |
client = api_manager.get_groq_client()
|
|
|
|
| 233 |
success = True
|
| 234 |
break
|
| 235 |
except Exception as e:
|
| 236 |
+
if "429" in str(e):
|
| 237 |
api_manager.rotate_groq()
|
| 238 |
continue
|
| 239 |
logger.error(f"Lỗi Groq: {e}")
|
| 240 |
break
|
| 241 |
|
|
|
|
| 242 |
if not success:
|
| 243 |
logger.warning("Chuyển sang Gemini ...")
|
| 244 |
for _ in range(len(api_manager.gemini_keys) if api_manager.gemini_keys else 1):
|
|
|
|
| 259 |
yield "Đã xảy ra lỗi hệ thống hoặc quá tải. Vui lòng thử lại sau giây lát!"
|
| 260 |
return
|
| 261 |
|
|
|
|
| 262 |
if context_docs:
|
| 263 |
yield "\n\n---\n\n"
|
| 264 |
yield "## 📚 Tài liệu tham khảo\n\n"
|
{core → rag}/rerank.py
RENAMED
|
File without changes
|
{core → rag}/vectorstore.py
RENAMED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import asyncio
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
-
import re
|
| 5 |
from typing import Any, Dict, List
|
| 6 |
|
| 7 |
import pdfplumber
|
|
@@ -13,43 +12,60 @@ from docx.table import Table, _Cell
|
|
| 13 |
from docx.text.paragraph import Paragraph
|
| 14 |
from langchain_core.documents import Document as LangChainDocument
|
| 15 |
|
| 16 |
-
from .text_utils import clean_text
|
| 17 |
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def normalize_academic_year(start_year: str, end_year: str) -> str:
|
| 24 |
-
return f"{int(start_year):04d}-{int(end_year):04d}"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def extract_academic_year(text: str) -> str:
|
| 28 |
-
if not text:
|
| 29 |
-
return ""
|
| 30 |
-
match = ACADEMIC_YEAR_PATTERN.search(text)
|
| 31 |
-
if not match:
|
| 32 |
-
return ""
|
| 33 |
-
return normalize_academic_year(match.group(1), match.group(2))
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def table_to_markdown(data: List[List[str]]) -> str:
|
| 37 |
if not data or len(data) < 2:
|
| 38 |
return ""
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
|
| 55 |
docs: List[LangChainDocument] = []
|
|
@@ -61,9 +77,10 @@ def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
|
|
| 61 |
table_texts: List[str] = []
|
| 62 |
if tables:
|
| 63 |
for table in tables:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
full_content = text + "\n\n[BANG DU LIEU TRICH XUAT]:\n" + "\n".join(table_texts)
|
| 69 |
if full_content.strip():
|
|
@@ -74,11 +91,10 @@ def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
|
|
| 74 |
)
|
| 75 |
)
|
| 76 |
except Exception as error:
|
| 77 |
-
logger.error("
|
| 78 |
|
| 79 |
return docs
|
| 80 |
|
| 81 |
-
|
| 82 |
def iter_block_items(parent):
|
| 83 |
if isinstance(parent, _Document):
|
| 84 |
parent_elm = parent.element.body
|
|
@@ -92,7 +108,6 @@ def iter_block_items(parent):
|
|
| 92 |
elif isinstance(child, CT_Tbl):
|
| 93 |
yield Table(child, parent)
|
| 94 |
|
| 95 |
-
|
| 96 |
def read_docx_with_tables(filepath: str) -> str:
|
| 97 |
doc = Document(filepath)
|
| 98 |
full_text: List[str] = []
|
|
@@ -108,13 +123,13 @@ def read_docx_with_tables(filepath: str) -> str:
|
|
| 108 |
row_data.append(clean_text(cell.text))
|
| 109 |
table_data.append(row_data)
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
| 114 |
|
| 115 |
return "\n".join(full_text)
|
| 116 |
|
| 117 |
-
|
| 118 |
def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocument]:
|
| 119 |
docs: List[LangChainDocument] = []
|
| 120 |
lower_name = filename.lower()
|
|
@@ -140,12 +155,10 @@ def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocu
|
|
| 140 |
logger.error("Loi doc %s: %s", filename, str(error)[:120])
|
| 141 |
return []
|
| 142 |
|
| 143 |
-
|
| 144 |
async def build_vectorstore_improved(
|
| 145 |
sync_coordinator: Any,
|
| 146 |
startup_wait_seconds: int = 5,
|
| 147 |
) -> Dict[str, Any]:
|
| 148 |
-
"""Supabase build step: trigger one initial sync and optionally wait for completion."""
|
| 149 |
if sync_coordinator is None:
|
| 150 |
raise ValueError("sync_coordinator is required")
|
| 151 |
|
|
@@ -180,12 +193,9 @@ async def build_vectorstore_improved(
|
|
| 180 |
"timed_out": True,
|
| 181 |
}
|
| 182 |
|
| 183 |
-
|
| 184 |
def load_vectorstore_improved(sync_coordinator: Any) -> Dict[str, Any]:
|
| 185 |
-
"""Supabase load step: return current coordinator health snapshot."""
|
| 186 |
if sync_coordinator is None:
|
| 187 |
return {}
|
| 188 |
-
|
| 189 |
try:
|
| 190 |
state = sync_coordinator.get_health_snapshot()
|
| 191 |
return state if isinstance(state, dict) else {}
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import logging
|
| 3 |
import os
|
|
|
|
| 4 |
from typing import Any, Dict, List
|
| 5 |
|
| 6 |
import pdfplumber
|
|
|
|
| 12 |
from docx.text.paragraph import Paragraph
|
| 13 |
from langchain_core.documents import Document as LangChainDocument
|
| 14 |
|
| 15 |
+
from utils.text_utils import clean_text
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
+
def table_to_unrolled_text(data: List[List[str]], is_docx: bool = False) -> str:
|
| 20 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
if not data or len(data) < 2:
|
| 22 |
return ""
|
| 23 |
|
| 24 |
+
# Làm sạch dữ liệu ban đầu chuyển None thành chuỗi rỗng
|
| 25 |
+
cleaned_data = []
|
| 26 |
+
for row in data:
|
| 27 |
+
cleaned_row = [str(cell).strip() if cell else "" for cell in row]
|
| 28 |
+
cleaned_data.append(cleaned_row)
|
| 29 |
+
|
| 30 |
+
num_cols = len(cleaned_data[0])
|
| 31 |
+
header_row = cleaned_data[0]
|
| 32 |
+
|
| 33 |
+
# CHỈ CHẠY FORWARD FILL NẾU KHÔNG PHẢI FILE WORD
|
| 34 |
+
if not is_docx:
|
| 35 |
+
# 2. Kỹ thuật Forward-Fill cho khu vực Header (Xử lý gộp cột - Colspan)
|
| 36 |
+
# Giả định hàng đầu tiên chắc chắn là Header
|
| 37 |
+
for i in range(1, num_cols):
|
| 38 |
+
if not header_row[i] and header_row[i-1]:
|
| 39 |
+
header_row[i] = header_row[i-1] # Kéo giá trị từ trái sang phải
|
| 40 |
+
|
| 41 |
+
# 3. Kỹ thuật Forward-Fill cho khu vực Dữ liệu (Xử lý gộp hàng - Rowspan)
|
| 42 |
+
for r in range(1, len(cleaned_data)):
|
| 43 |
+
for c in range(num_cols):
|
| 44 |
+
# Nếu ô hiện tại rỗng, kéo giá trị từ ô ngay bên trên xuống
|
| 45 |
+
if not cleaned_data[r][c] and cleaned_data[r-1][c]:
|
| 46 |
+
cleaned_data[r][c] = cleaned_data[r-1][c]
|
| 47 |
+
|
| 48 |
+
# 4. Trải phẳng bảng (Unrolling)
|
| 49 |
+
headers = cleaned_data[0]
|
| 50 |
+
unrolled_rows = []
|
| 51 |
+
|
| 52 |
+
for r in range(1, len(cleaned_data)):
|
| 53 |
+
row_values = cleaned_data[r]
|
| 54 |
+
row_text_parts = []
|
| 55 |
+
|
| 56 |
+
# Chỉ ghép những ô có dữ liệu thực sự (khác Header)
|
| 57 |
+
for c in range(min(len(headers), len(row_values))):
|
| 58 |
+
header_val = headers[c]
|
| 59 |
+
cell_val = row_values[c]
|
| 60 |
+
|
| 61 |
+
# Tránh lặp lại nếu dữ liệu vô tình giống hệt Header
|
| 62 |
+
if cell_val and cell_val != header_val:
|
| 63 |
+
row_text_parts.append(f"{header_val}: {cell_val}")
|
| 64 |
+
|
| 65 |
+
if row_text_parts:
|
| 66 |
+
unrolled_rows.append("- " + " | ".join(row_text_parts))
|
| 67 |
+
|
| 68 |
+
return "\n" + "\n".join(unrolled_rows) + "\n\n"
|
| 69 |
|
| 70 |
def read_pdf_with_tables(filepath: str) -> List[LangChainDocument]:
|
| 71 |
docs: List[LangChainDocument] = []
|
|
|
|
| 77 |
table_texts: List[str] = []
|
| 78 |
if tables:
|
| 79 |
for table in tables:
|
| 80 |
+
# Vẫn chạy Forward-Fill bình thường cho PDF
|
| 81 |
+
unrolled_table = table_to_unrolled_text(table, is_docx=False)
|
| 82 |
+
if unrolled_table:
|
| 83 |
+
table_texts.append(unrolled_table)
|
| 84 |
|
| 85 |
full_content = text + "\n\n[BANG DU LIEU TRICH XUAT]:\n" + "\n".join(table_texts)
|
| 86 |
if full_content.strip():
|
|
|
|
| 91 |
)
|
| 92 |
)
|
| 93 |
except Exception as error:
|
| 94 |
+
logger.error("Lỗi đọc PDF %s: %s", os.path.basename(filepath), error)
|
| 95 |
|
| 96 |
return docs
|
| 97 |
|
|
|
|
| 98 |
def iter_block_items(parent):
|
| 99 |
if isinstance(parent, _Document):
|
| 100 |
parent_elm = parent.element.body
|
|
|
|
| 108 |
elif isinstance(child, CT_Tbl):
|
| 109 |
yield Table(child, parent)
|
| 110 |
|
|
|
|
| 111 |
def read_docx_with_tables(filepath: str) -> str:
|
| 112 |
doc = Document(filepath)
|
| 113 |
full_text: List[str] = []
|
|
|
|
| 123 |
row_data.append(clean_text(cell.text))
|
| 124 |
table_data.append(row_data)
|
| 125 |
|
| 126 |
+
# CẮT FORWARD-FILL TẠI ĐÂY BẰNG is_docx=True
|
| 127 |
+
unrolled_table = table_to_unrolled_text(table_data, is_docx=True)
|
| 128 |
+
if unrolled_table:
|
| 129 |
+
full_text.append(f"\n{unrolled_table}\n")
|
| 130 |
|
| 131 |
return "\n".join(full_text)
|
| 132 |
|
|
|
|
| 133 |
def load_documents_from_file(filepath: str, filename: str) -> List[LangChainDocument]:
|
| 134 |
docs: List[LangChainDocument] = []
|
| 135 |
lower_name = filename.lower()
|
|
|
|
| 155 |
logger.error("Loi doc %s: %s", filename, str(error)[:120])
|
| 156 |
return []
|
| 157 |
|
|
|
|
| 158 |
async def build_vectorstore_improved(
|
| 159 |
sync_coordinator: Any,
|
| 160 |
startup_wait_seconds: int = 5,
|
| 161 |
) -> Dict[str, Any]:
|
|
|
|
| 162 |
if sync_coordinator is None:
|
| 163 |
raise ValueError("sync_coordinator is required")
|
| 164 |
|
|
|
|
| 193 |
"timed_out": True,
|
| 194 |
}
|
| 195 |
|
|
|
|
| 196 |
def load_vectorstore_improved(sync_coordinator: Any) -> Dict[str, Any]:
|
|
|
|
| 197 |
if sync_coordinator is None:
|
| 198 |
return {}
|
|
|
|
| 199 |
try:
|
| 200 |
state = sync_coordinator.get_health_snapshot()
|
| 201 |
return state if isinstance(state, dict) else {}
|
requirements.txt
CHANGED
|
@@ -30,4 +30,5 @@ langchain-huggingface>=0.0.3,<0.1.0
|
|
| 30 |
#File Loaders
|
| 31 |
python-docx
|
| 32 |
pdfplumber
|
| 33 |
-
pypdf
|
|
|
|
|
|
| 30 |
#File Loaders
|
| 31 |
python-docx
|
| 32 |
pdfplumber
|
| 33 |
+
pypdf
|
| 34 |
+
pyvi
|
{core → services}/document_ingest_service.py
RENAMED
|
@@ -3,7 +3,6 @@ import os
|
|
| 3 |
import uuid
|
| 4 |
from datetime import datetime, timezone
|
| 5 |
from typing import List, Optional
|
| 6 |
-
|
| 7 |
from langchain_core.documents import Document as LangChainDocument
|
| 8 |
from qdrant_client import QdrantClient
|
| 9 |
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
@@ -17,12 +16,12 @@ from qdrant_client.models import (
|
|
| 17 |
VectorParams,
|
| 18 |
)
|
| 19 |
|
| 20 |
-
from .chunking import smart_chunking
|
| 21 |
-
from .config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL
|
| 22 |
-
from .document_db import Document, DocumentChunk, SessionLocal
|
| 23 |
-
from .models import embeddings
|
| 24 |
-
from .text_utils import clean_text
|
| 25 |
-
from .vectorstore import
|
| 26 |
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
|
|
@@ -74,11 +73,9 @@ def chunk_documents_for_ingest(
|
|
| 74 |
if not cleaned_docs:
|
| 75 |
return []
|
| 76 |
|
| 77 |
-
academic_year = extract_academic_year(source_relpath) or "ALL"
|
| 78 |
for doc in cleaned_docs:
|
| 79 |
metadata = doc.metadata.copy() if isinstance(doc.metadata, dict) else {}
|
| 80 |
metadata["source_relpath"] = source_relpath
|
| 81 |
-
metadata["academic_year"] = academic_year
|
| 82 |
doc.metadata = metadata
|
| 83 |
|
| 84 |
return [doc for doc in smart_chunking(cleaned_docs) if (doc.page_content or "").strip()]
|
|
@@ -252,7 +249,6 @@ def process_document_ingest(
|
|
| 252 |
"collection_name": target_collection,
|
| 253 |
"source_file": metadata.get("source_file") or source_name,
|
| 254 |
"source_relpath": metadata.get("source_relpath") or source_relpath,
|
| 255 |
-
"academic_year": metadata.get("academic_year") or "ALL",
|
| 256 |
"page_number": metadata.get("page_number"),
|
| 257 |
"source_updated_at": source_updated_at,
|
| 258 |
"source_etag": source_etag,
|
|
@@ -365,4 +361,4 @@ def delete_vectors_for_object_path(collection_name: str, object_path: str) -> bo
|
|
| 365 |
wait=True,
|
| 366 |
)
|
| 367 |
|
| 368 |
-
return True
|
|
|
|
| 3 |
import uuid
|
| 4 |
from datetime import datetime, timezone
|
| 5 |
from typing import List, Optional
|
|
|
|
| 6 |
from langchain_core.documents import Document as LangChainDocument
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
|
|
| 16 |
VectorParams,
|
| 17 |
)
|
| 18 |
|
| 19 |
+
from rag.chunking import smart_chunking
|
| 20 |
+
from core.config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL
|
| 21 |
+
from database.document_db import Document, DocumentChunk, SessionLocal
|
| 22 |
+
from rag.models import embeddings
|
| 23 |
+
from utils.text_utils import clean_text
|
| 24 |
+
from rag.vectorstore import load_documents_from_file
|
| 25 |
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
|
|
|
| 73 |
if not cleaned_docs:
|
| 74 |
return []
|
| 75 |
|
|
|
|
| 76 |
for doc in cleaned_docs:
|
| 77 |
metadata = doc.metadata.copy() if isinstance(doc.metadata, dict) else {}
|
| 78 |
metadata["source_relpath"] = source_relpath
|
|
|
|
| 79 |
doc.metadata = metadata
|
| 80 |
|
| 81 |
return [doc for doc in smart_chunking(cleaned_docs) if (doc.page_content or "").strip()]
|
|
|
|
| 249 |
"collection_name": target_collection,
|
| 250 |
"source_file": metadata.get("source_file") or source_name,
|
| 251 |
"source_relpath": metadata.get("source_relpath") or source_relpath,
|
|
|
|
| 252 |
"page_number": metadata.get("page_number"),
|
| 253 |
"source_updated_at": source_updated_at,
|
| 254 |
"source_etag": source_etag,
|
|
|
|
| 361 |
wait=True,
|
| 362 |
)
|
| 363 |
|
| 364 |
+
return True
|
{core → services}/supabase_sync_service.py
RENAMED
|
@@ -2,15 +2,14 @@ import asyncio
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import os
|
| 5 |
-
import re
|
| 6 |
import tempfile
|
| 7 |
import time
|
| 8 |
from datetime import datetime, timezone
|
| 9 |
from typing import Any, Dict, List, Optional
|
| 10 |
from urllib import error, parse, request
|
| 11 |
|
| 12 |
-
from .collection_utils import build_collection_name
|
| 13 |
-
from .document_db import (
|
| 14 |
Document,
|
| 15 |
DocumentChunk,
|
| 16 |
SessionLocal,
|
|
@@ -20,7 +19,7 @@ from .document_db import (
|
|
| 20 |
mark_document_sync_error_resolved,
|
| 21 |
utcnow,
|
| 22 |
)
|
| 23 |
-
from .document_ingest_service import delete_vectors_for_object_path, process_document_ingest
|
| 24 |
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
|
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import os
|
|
|
|
| 5 |
import tempfile
|
| 6 |
import time
|
| 7 |
from datetime import datetime, timezone
|
| 8 |
from typing import Any, Dict, List, Optional
|
| 9 |
from urllib import error, parse, request
|
| 10 |
|
| 11 |
+
from rag.collection_utils import build_collection_name
|
| 12 |
+
from database.document_db import (
|
| 13 |
Document,
|
| 14 |
DocumentChunk,
|
| 15 |
SessionLocal,
|
|
|
|
| 19 |
mark_document_sync_error_resolved,
|
| 20 |
utcnow,
|
| 21 |
)
|
| 22 |
+
from services.document_ingest_service import delete_vectors_for_object_path, process_document_ingest
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
{core → utils}/text_utils.py
RENAMED
|
@@ -7,8 +7,10 @@ def clean_text(text: str) -> str:
|
|
| 7 |
# Nối các từ bị gãy ngang do xuống dòng
|
| 8 |
text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
text = re.sub(r'[
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Chuẩn hóa khoảng trắng
|
| 14 |
text = re.sub(r'[ \t]+', ' ', text)
|
|
|
|
| 7 |
# Nối các từ bị gãy ngang do xuống dòng
|
| 8 |
text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
|
| 9 |
|
| 10 |
+
# Loại bỏ các ký tự điều khiển không mong muốn, nhưng giữ lại các dấu câu thông thường
|
| 11 |
+
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
|
| 12 |
+
# Xóa các ký tự không nhìn thấy và các ký tự đặc biệt như zero-width space và BOM
|
| 13 |
+
text = text.replace('\u200b', '').replace('\ufeff', '')
|
| 14 |
|
| 15 |
# Chuẩn hóa khoảng trắng
|
| 16 |
text = re.sub(r'[ \t]+', ' ', text)
|