Spaces:
Sleeping
Sleeping
update data
Browse files
- .gitattributes +5 -18
- core/prompting.py +1 -0
- core/qa_pipeline.py +7 -0
- core/vectorstore.py +4 -0
.gitattributes
CHANGED
|
@@ -33,21 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
data/
|
| 39 |
-
data/
|
| 40 |
-
data/
|
| 41 |
-
data/4.1.[[:space:]]QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
-
data/7.[[:space:]]QĐ[[:space:]]đánh[[:space:]]giá[[:space:]]KQRL[[:space:]](Final[[:space:]]18-8-2016).doc filter=lfs diff=lfs merge=lfs -text
|
| 43 |
-
data/9.[[:space:]]QĐ[[:space:]]Khen[[:space:]]thưởng[[:space:]]-[[:space:]]KL[[:space:]](Final[[:space:]]10-8-2016).doc filter=lfs diff=lfs merge=lfs -text
|
| 44 |
-
data/10.[[:space:]]QĐ[[:space:]]1089[[:space:]]thi[[:space:]]OLP[[:space:]]môn[[:space:]]học[[:space:]](Final[[:space:]]10-5-2023).pdf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
-
data/11.[[:space:]]QĐ[[:space:]]về[[:space:]]Học[[:space:]]phí[[:space:]]final[[:space:]](25-10-2021).pdf filter=lfs diff=lfs merge=lfs -text
|
| 46 |
-
data/12.[[:space:]]QD[[:space:]]ngoại[[:space:]]trú.pdf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
-
data/2.[[:space:]]QĐ[[:space:]]về[[:space:]]tiếng[[:space:]]anh[[:space:]]CTTT.300921.QD.1315.pdf filter=lfs diff=lfs merge=lfs -text
|
| 48 |
-
data/3.[[:space:]]QD1767.TA[[:space:]]tăng[[:space:]]cường[[:space:]]ban[[:space:]]hanh.pdf filter=lfs diff=lfs merge=lfs -text
|
| 49 |
-
data/7.[[:space:]]QĐ[[:space:]]đánh[[:space:]]giá[[:space:]]KQRL[[:space:]](Final[[:space:]]18-8-2016).pdf filter=lfs diff=lfs merge=lfs -text
|
| 50 |
-
data/9.[[:space:]]QĐ[[:space:]]Khen[[:space:]]thưởng[[:space:]]-[[:space:]]KL[[:space:]](Final[[:space:]]10-8-2016).pdf filter=lfs diff=lfs merge=lfs -text
|
| 51 |
-
data/3784QD-DHTL__NCKH.pdf filter=lfs diff=lfs merge=lfs -text
|
| 52 |
-
data/4079QD-DHTL_dd_Olympia.pdf filter=lfs diff=lfs merge=lfs -text
|
| 53 |
-
data/8.[[:space:]]QĐ[[:space:]]ve[[:space:]]HBKKHT,[[:space:]]HBCS[[:space:]](final[[:space:]]12-5-2021).pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
|
| 37 |
+
# Track source regulation files in all year folders under data/
|
| 38 |
+
data/**/*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
data/**/*.doc filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
data/**/*.docx filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/prompting.py
CHANGED
|
@@ -96,6 +96,7 @@ Về vấn đề [Chủ đề], theo **Điều [Số]**, các trường hợp ng
|
|
| 96 |
else:
|
| 97 |
topic_instr = ""
|
| 98 |
|
|
|
|
| 99 |
if year_scope:
|
| 100 |
year_instr = (
|
| 101 |
f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
|
|
|
|
| 96 |
else:
|
| 97 |
topic_instr = ""
|
| 98 |
|
| 99 |
+
# [YEAR-AWARE CHANGE] Rang buoc cau tra loi theo nam hoc duoc hoi.
|
| 100 |
if year_scope:
|
| 101 |
year_instr = (
|
| 102 |
f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
|
core/qa_pipeline.py
CHANGED
|
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)
|
|
| 19 |
MAX_CONTEXT_CHARS = 12000
|
| 20 |
MAX_DOC_CHARS = 1800
|
| 21 |
MAX_OUT_CHARS = 3000
|
|
|
|
| 22 |
ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
|
| 23 |
SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
|
| 24 |
|
|
@@ -56,6 +57,7 @@ def normalize_academic_year(start_year: str, end_year: str) -> str:
|
|
| 56 |
return f"{int(start_year):04d}-{int(end_year):04d}"
|
| 57 |
|
| 58 |
|
|
|
|
| 59 |
def detect_requested_year(text: str) -> tuple[str, set]:
|
| 60 |
"""Phat hien nam hoc duoc nhac den trong cau hoi."""
|
| 61 |
requested_range = ""
|
|
@@ -98,6 +100,7 @@ def infer_doc_academic_year(doc) -> str:
|
|
| 98 |
return "ALL"
|
| 99 |
|
| 100 |
|
|
|
|
| 101 |
def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
|
| 102 |
if not requested_range and not mentioned_years:
|
| 103 |
return docs
|
|
@@ -238,6 +241,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
|
|
| 238 |
|
| 239 |
logger.info(f" CÂU HỎI GỐC: {message}")
|
| 240 |
question = generate_standalone_query(message, history)
|
|
|
|
| 241 |
requested_year_range, mentioned_years = detect_requested_year(f"{message}\n{question}")
|
| 242 |
if requested_year_range:
|
| 243 |
logger.info(f"Lọc theo năm học yêu cầu: {requested_year_range}")
|
|
@@ -272,6 +276,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
|
|
| 272 |
yield "Không tìm thấy thông tin liên quan trong tài liệu."
|
| 273 |
return
|
| 274 |
|
|
|
|
| 275 |
year_filtered_docs = filter_docs_by_year(all_docs, requested_year_range, mentioned_years)
|
| 276 |
if (requested_year_range or mentioned_years) and not year_filtered_docs:
|
| 277 |
if requested_year_range:
|
|
@@ -292,6 +297,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
|
|
| 292 |
for doc in final_docs:
|
| 293 |
page = doc.metadata.get('page_number', 'N/A')
|
| 294 |
file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
|
|
|
|
| 295 |
doc_year = infer_doc_academic_year(doc)
|
| 296 |
year_label = f"Năm {doc_year}" if doc_year != "ALL" else "Áp dụng nhiều năm"
|
| 297 |
source = f"[{year_label} | {os.path.basename(file_name)} | Trang {page}]" if file_name else f"[{year_label} | Trang {page}]"
|
|
@@ -303,6 +309,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
|
|
| 303 |
|
| 304 |
context = "\n\n---\n\n".join(context_parts)
|
| 305 |
topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
|
|
|
|
| 306 |
if requested_year_range:
|
| 307 |
year_scope = requested_year_range
|
| 308 |
elif mentioned_years:
|
|
|
|
| 19 |
MAX_CONTEXT_CHARS = 12000
|
| 20 |
MAX_DOC_CHARS = 1800
|
| 21 |
MAX_OUT_CHARS = 3000
|
| 22 |
+
# [YEAR-AWARE CHANGE] Pattern nhan dien nam hoc trong cau hoi.
|
| 23 |
ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
|
| 24 |
SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
|
| 25 |
|
|
|
|
| 57 |
return f"{int(start_year):04d}-{int(end_year):04d}"
|
| 58 |
|
| 59 |
|
| 60 |
+
# [YEAR-AWARE CHANGE] Trich xuat nam yeu cau tu cau hoi.
|
| 61 |
def detect_requested_year(text: str) -> tuple[str, set]:
|
| 62 |
"""Phat hien nam hoc duoc nhac den trong cau hoi."""
|
| 63 |
requested_range = ""
|
|
|
|
| 100 |
return "ALL"
|
| 101 |
|
| 102 |
|
| 103 |
+
# [YEAR-AWARE CHANGE] Loc tai lieu theo metadata nam hoc.
|
| 104 |
def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
|
| 105 |
if not requested_range and not mentioned_years:
|
| 106 |
return docs
|
|
|
|
| 241 |
|
| 242 |
logger.info(f" CÂU HỎI GỐC: {message}")
|
| 243 |
question = generate_standalone_query(message, history)
|
| 244 |
+
# [YEAR-AWARE CHANGE] Xac dinh pham vi nam ma nguoi dung yeu cau.
|
| 245 |
requested_year_range, mentioned_years = detect_requested_year(f"{message}\n{question}")
|
| 246 |
if requested_year_range:
|
| 247 |
logger.info(f"Lọc theo năm học yêu cầu: {requested_year_range}")
|
|
|
|
| 276 |
yield "Không tìm thấy thông tin liên quan trong tài liệu."
|
| 277 |
return
|
| 278 |
|
| 279 |
+
# [YEAR-AWARE CHANGE] Loc tap docs theo nam truoc khi rerank.
|
| 280 |
year_filtered_docs = filter_docs_by_year(all_docs, requested_year_range, mentioned_years)
|
| 281 |
if (requested_year_range or mentioned_years) and not year_filtered_docs:
|
| 282 |
if requested_year_range:
|
|
|
|
| 297 |
for doc in final_docs:
|
| 298 |
page = doc.metadata.get('page_number', 'N/A')
|
| 299 |
file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
|
| 300 |
+
# [YEAR-AWARE CHANGE] Gan nhan nam trong context de LLM bam dung nguon.
|
| 301 |
doc_year = infer_doc_academic_year(doc)
|
| 302 |
year_label = f"Năm {doc_year}" if doc_year != "ALL" else "Áp dụng nhiều năm"
|
| 303 |
source = f"[{year_label} | {os.path.basename(file_name)} | Trang {page}]" if file_name else f"[{year_label} | Trang {page}]"
|
|
|
|
| 309 |
|
| 310 |
context = "\n\n---\n\n".join(context_parts)
|
| 311 |
topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
|
| 312 |
+
# [YEAR-AWARE CHANGE] Truyen rang buoc nam vao prompt.
|
| 313 |
if requested_year_range:
|
| 314 |
year_scope = requested_year_range
|
| 315 |
elif mentioned_years:
|
core/vectorstore.py
CHANGED
|
@@ -26,6 +26,7 @@ logger = logging.getLogger(__name__)
|
|
| 26 |
|
| 27 |
CHUNKS_PICKLE = os.path.join(VECTOR_DIR, "chunks.pkl")
|
| 28 |
COLLECTION_NAME = "quy_che_db"
|
|
|
|
| 29 |
SUPPORTED_FORMATS = ('.pdf', '.doc', '.docx')
|
| 30 |
ACADEMIC_YEAR_PATTERN = re.compile(r"(20\d{2})\s*[-_]\s*(20\d{2})")
|
| 31 |
|
|
@@ -112,6 +113,7 @@ def enrich_chunk_metadata(chunks: List) -> bool:
|
|
| 112 |
return changed
|
| 113 |
|
| 114 |
|
|
|
|
| 115 |
def load_and_clean_all_docs() -> List[LangChainDocument]:
|
| 116 |
docs: List[LangChainDocument] = []
|
| 117 |
file_entries = discover_data_files()
|
|
@@ -285,6 +287,7 @@ def load_documents_from_file(filepath: str, filename: str) -> List:
|
|
| 285 |
logger.error(f" Lỗi đọc {filename}: {str(e)[:60]}")
|
| 286 |
return []
|
| 287 |
|
|
|
|
| 288 |
def build_vectorstore_improved(recreate_collection: bool = False) -> Tuple[QdrantVectorStore, List]:
|
| 289 |
logger.info(" Đang xây dựng vectorstore...")
|
| 290 |
docs = load_and_clean_all_docs()
|
|
@@ -362,6 +365,7 @@ def load_vectorstore_improved() -> Tuple[QdrantVectorStore, List]:
|
|
| 362 |
except Exception as e:
|
| 363 |
logger.error(f" Không thể cập nhật {CHUNKS_PICKLE}: {e}")
|
| 364 |
|
|
|
|
| 365 |
discovered_relpaths = {os.path.normpath(relpath) for _, _, relpath, _ in discover_data_files()}
|
| 366 |
chunk_relpaths = collect_chunk_relpaths(chunks)
|
| 367 |
missing_relpaths = sorted(discovered_relpaths - chunk_relpaths)
|
|
|
|
| 26 |
|
| 27 |
CHUNKS_PICKLE = os.path.join(VECTOR_DIR, "chunks.pkl")
|
| 28 |
COLLECTION_NAME = "quy_che_db"
|
| 29 |
+
# [YEAR-AWARE CHANGE] Ho tro quet de quy va gan metadata nam hoc.
|
| 30 |
SUPPORTED_FORMATS = ('.pdf', '.doc', '.docx')
|
| 31 |
ACADEMIC_YEAR_PATTERN = re.compile(r"(20\d{2})\s*[-_]\s*(20\d{2})")
|
| 32 |
|
|
|
|
| 113 |
return changed
|
| 114 |
|
| 115 |
|
| 116 |
+
# [YEAR-AWARE CHANGE] Gom doc tu toan bo thu muc data theo cau truc nam hoc.
|
| 117 |
def load_and_clean_all_docs() -> List[LangChainDocument]:
|
| 118 |
docs: List[LangChainDocument] = []
|
| 119 |
file_entries = discover_data_files()
|
|
|
|
| 287 |
logger.error(f" Lỗi đọc {filename}: {str(e)[:60]}")
|
| 288 |
return []
|
| 289 |
|
| 290 |
+
# [YEAR-AWARE CHANGE] Cho phep tao lai collection khi phat hien file moi.
|
| 291 |
def build_vectorstore_improved(recreate_collection: bool = False) -> Tuple[QdrantVectorStore, List]:
|
| 292 |
logger.info(" Đang xây dựng vectorstore...")
|
| 293 |
docs = load_and_clean_all_docs()
|
|
|
|
| 365 |
except Exception as e:
|
| 366 |
logger.error(f" Không thể cập nhật {CHUNKS_PICKLE}: {e}")
|
| 367 |
|
| 368 |
+
# [YEAR-AWARE CHANGE] Neu co file moi theo nam hoc, rebuild de dong bo Qdrant.
|
| 369 |
discovered_relpaths = {os.path.normpath(relpath) for _, _, relpath, _ in discover_data_files()}
|
| 370 |
chunk_relpaths = collect_chunk_relpaths(chunks)
|
| 371 |
missing_relpaths = sorted(discovered_relpaths - chunk_relpaths)
|