tai2805 committed on
Commit
befa093
·
1 Parent(s): 33f8c53

update data

Browse files
.gitattributes CHANGED
@@ -33,21 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- data/1.[[:space:]]QĐ-1226-Quy[[:space:]]che[[:space:]]dao[[:space:]]tao[[:space:]]dai[[:space:]]hoc-DHTL[[:space:]](ban[[:space:]]hanh).docx filter=lfs diff=lfs merge=lfs -text
37
- data/10.[[:space:]]QĐ[[:space:]]1089[[:space:]]thi[[:space:]]OLP[[:space:]]môn[[:space:]]học[[:space:]](Final[[:space:]]10-5-2023).doc filter=lfs diff=lfs merge=lfs -text
38
- data/11.[[:space:]]QĐ[[:space:]]về[[:space:]]Học[[:space:]]phí[[:space:]]final[[:space:]](25-10-2021).doc filter=lfs diff=lfs merge=lfs -text
39
- data/3.[[:space:]]QD1767.TA[[:space:]]tăng[[:space:]]cường[[:space:]]ban[[:space:]]hanh.doc filter=lfs diff=lfs merge=lfs -text
40
- data/4.[[:space:]]QD411_QD_DHTL-Chuan_Dau_Ra_CNTT.pdf filter=lfs diff=lfs merge=lfs -text
41
- data/4.1.[[:space:]]QuyDinh_Ve_CDR_CNTT_Ban_hanh_theo_QD411-06-4-2022.pdf filter=lfs diff=lfs merge=lfs -text
42
- data/7.[[:space:]]QĐ[[:space:]]đánh[[:space:]]giá[[:space:]]KQRL[[:space:]](Final[[:space:]]18-8-2016).doc filter=lfs diff=lfs merge=lfs -text
43
- data/9.[[:space:]]QĐ[[:space:]]Khen[[:space:]]thưởng[[:space:]]-[[:space:]]KL[[:space:]](Final[[:space:]]10-8-2016).doc filter=lfs diff=lfs merge=lfs -text
44
- data/10.[[:space:]]QĐ[[:space:]]1089[[:space:]]thi[[:space:]]OLP[[:space:]]môn[[:space:]]học[[:space:]](Final[[:space:]]10-5-2023).pdf filter=lfs diff=lfs merge=lfs -text
45
- data/11.[[:space:]]QĐ[[:space:]]về[[:space:]]Học[[:space:]]phí[[:space:]]final[[:space:]](25-10-2021).pdf filter=lfs diff=lfs merge=lfs -text
46
- data/12.[[:space:]]QD[[:space:]]ngoại[[:space:]]trú.pdf filter=lfs diff=lfs merge=lfs -text
47
- data/2.[[:space:]]QĐ[[:space:]]về[[:space:]]tiếng[[:space:]]anh[[:space:]]CTTT.300921.QD.1315.pdf filter=lfs diff=lfs merge=lfs -text
48
- data/3.[[:space:]]QD1767.TA[[:space:]]tăng[[:space:]]cường[[:space:]]ban[[:space:]]hanh.pdf filter=lfs diff=lfs merge=lfs -text
49
- data/7.[[:space:]]QĐ[[:space:]]đánh[[:space:]]giá[[:space:]]KQRL[[:space:]](Final[[:space:]]18-8-2016).pdf filter=lfs diff=lfs merge=lfs -text
50
- data/9.[[:space:]]QĐ[[:space:]]Khen[[:space:]]thưởng[[:space:]]-[[:space:]]KL[[:space:]](Final[[:space:]]10-8-2016).pdf filter=lfs diff=lfs merge=lfs -text
51
- data/3784QD-DHTL__NCKH.pdf filter=lfs diff=lfs merge=lfs -text
52
- data/4079QD-DHTL_dd_Olympia.pdf filter=lfs diff=lfs merge=lfs -text
53
- data/8.[[:space:]]QĐ[[:space:]]ve[[:space:]]HBKKHT,[[:space:]]HBCS[[:space:]](final[[:space:]]12-5-2021).pdf filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+ # Track source regulation files in all year folders under data/
38
+ data/**/*.pdf filter=lfs diff=lfs merge=lfs -text
39
+ data/**/*.doc filter=lfs diff=lfs merge=lfs -text
40
+ data/**/*.docx filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
core/prompting.py CHANGED
@@ -96,6 +96,7 @@ Về vấn đề [Chủ đề], theo **Điều [Số]**, các trường hợp ng
96
  else:
97
  topic_instr = ""
98
 
 
99
  if year_scope:
100
  year_instr = (
101
  f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
 
96
  else:
97
  topic_instr = ""
98
 
99
+ # [YEAR-AWARE CHANGE] Rang buoc cau tra loi theo nam hoc duoc hoi.
100
  if year_scope:
101
  year_instr = (
102
  f"\n\n **RÀNG BUỘC NĂM HỌC (BẮT BUỘC):**\n"
core/qa_pipeline.py CHANGED
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)
19
  MAX_CONTEXT_CHARS = 12000
20
  MAX_DOC_CHARS = 1800
21
  MAX_OUT_CHARS = 3000
 
22
  ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
23
  SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
24
 
@@ -56,6 +57,7 @@ def normalize_academic_year(start_year: str, end_year: str) -> str:
56
  return f"{int(start_year):04d}-{int(end_year):04d}"
57
 
58
 
 
59
  def detect_requested_year(text: str) -> tuple[str, set]:
60
  """Phat hien nam hoc duoc nhac den trong cau hoi."""
61
  requested_range = ""
@@ -98,6 +100,7 @@ def infer_doc_academic_year(doc) -> str:
98
  return "ALL"
99
 
100
 
 
101
  def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
102
  if not requested_range and not mentioned_years:
103
  return docs
@@ -238,6 +241,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
238
 
239
  logger.info(f" CÂU HỎI GỐC: {message}")
240
  question = generate_standalone_query(message, history)
 
241
  requested_year_range, mentioned_years = detect_requested_year(f"{message}\n{question}")
242
  if requested_year_range:
243
  logger.info(f"Lọc theo năm học yêu cầu: {requested_year_range}")
@@ -272,6 +276,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
272
  yield "Không tìm thấy thông tin liên quan trong tài liệu."
273
  return
274
 
 
275
  year_filtered_docs = filter_docs_by_year(all_docs, requested_year_range, mentioned_years)
276
  if (requested_year_range or mentioned_years) and not year_filtered_docs:
277
  if requested_year_range:
@@ -292,6 +297,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
292
  for doc in final_docs:
293
  page = doc.metadata.get('page_number', 'N/A')
294
  file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
 
295
  doc_year = infer_doc_academic_year(doc)
296
  year_label = f"Năm {doc_year}" if doc_year != "ALL" else "Áp dụng nhiều năm"
297
  source = f"[{year_label} | {os.path.basename(file_name)} | Trang {page}]" if file_name else f"[{year_label} | Trang {page}]"
@@ -303,6 +309,7 @@ def ask_ai_stream_delta(message: str, history: List, hybrid_retriever) -> Genera
303
 
304
  context = "\n\n---\n\n".join(context_parts)
305
  topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
 
306
  if requested_year_range:
307
  year_scope = requested_year_range
308
  elif mentioned_years:
 
19
  MAX_CONTEXT_CHARS = 12000
20
  MAX_DOC_CHARS = 1800
21
  MAX_OUT_CHARS = 3000
22
+ # [YEAR-AWARE CHANGE] Pattern nhan dien nam hoc trong cau hoi.
23
  ACADEMIC_YEAR_PATTERN = re.compile(r"\b(20\d{2})\s*[-_/]\s*(20\d{2})\b")
24
  SINGLE_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")
25
 
 
57
  return f"{int(start_year):04d}-{int(end_year):04d}"
58
 
59
 
60
+ # [YEAR-AWARE CHANGE] Trich xuat nam yeu cau tu cau hoi.
61
  def detect_requested_year(text: str) -> tuple[str, set]:
62
  """Phat hien nam hoc duoc nhac den trong cau hoi."""
63
  requested_range = ""
 
100
  return "ALL"
101
 
102
 
103
+ # [YEAR-AWARE CHANGE] Loc tai lieu theo metadata nam hoc.
104
  def filter_docs_by_year(docs: List, requested_range: str, mentioned_years: set) -> List:
105
  if not requested_range and not mentioned_years:
106
  return docs
 
241
 
242
  logger.info(f" CÂU HỎI GỐC: {message}")
243
  question = generate_standalone_query(message, history)
244
+ # [YEAR-AWARE CHANGE] Xac dinh pham vi nam ma nguoi dung yeu cau.
245
  requested_year_range, mentioned_years = detect_requested_year(f"{message}\n{question}")
246
  if requested_year_range:
247
  logger.info(f"Lọc theo năm học yêu cầu: {requested_year_range}")
 
276
  yield "Không tìm thấy thông tin liên quan trong tài liệu."
277
  return
278
 
279
+ # [YEAR-AWARE CHANGE] Loc tap docs theo nam truoc khi rerank.
280
  year_filtered_docs = filter_docs_by_year(all_docs, requested_year_range, mentioned_years)
281
  if (requested_year_range or mentioned_years) and not year_filtered_docs:
282
  if requested_year_range:
 
297
  for doc in final_docs:
298
  page = doc.metadata.get('page_number', 'N/A')
299
  file_name = doc.metadata.get('source_file') or doc.metadata.get('source')
300
+ # [YEAR-AWARE CHANGE] Gan nhan nam trong context de LLM bam dung nguon.
301
  doc_year = infer_doc_academic_year(doc)
302
  year_label = f"Năm {doc_year}" if doc_year != "ALL" else "Áp dụng nhiều năm"
303
  source = f"[{year_label} | {os.path.basename(file_name)} | Trang {page}]" if file_name else f"[{year_label} | Trang {page}]"
 
309
 
310
  context = "\n\n---\n\n".join(context_parts)
311
  topic_hint = processed_data.get('topic') or processed_data.get('root_question') or question
312
+ # [YEAR-AWARE CHANGE] Truyen rang buoc nam vao prompt.
313
  if requested_year_range:
314
  year_scope = requested_year_range
315
  elif mentioned_years:
core/vectorstore.py CHANGED
@@ -26,6 +26,7 @@ logger = logging.getLogger(__name__)
26
 
27
  CHUNKS_PICKLE = os.path.join(VECTOR_DIR, "chunks.pkl")
28
  COLLECTION_NAME = "quy_che_db"
 
29
  SUPPORTED_FORMATS = ('.pdf', '.doc', '.docx')
30
  ACADEMIC_YEAR_PATTERN = re.compile(r"(20\d{2})\s*[-_]\s*(20\d{2})")
31
 
@@ -112,6 +113,7 @@ def enrich_chunk_metadata(chunks: List) -> bool:
112
  return changed
113
 
114
 
 
115
  def load_and_clean_all_docs() -> List[LangChainDocument]:
116
  docs: List[LangChainDocument] = []
117
  file_entries = discover_data_files()
@@ -285,6 +287,7 @@ def load_documents_from_file(filepath: str, filename: str) -> List:
285
  logger.error(f" Lỗi đọc {filename}: {str(e)[:60]}")
286
  return []
287
 
 
288
  def build_vectorstore_improved(recreate_collection: bool = False) -> Tuple[QdrantVectorStore, List]:
289
  logger.info(" Đang xây dựng vectorstore...")
290
  docs = load_and_clean_all_docs()
@@ -362,6 +365,7 @@ def load_vectorstore_improved() -> Tuple[QdrantVectorStore, List]:
362
  except Exception as e:
363
  logger.error(f" Không thể cập nhật {CHUNKS_PICKLE}: {e}")
364
 
 
365
  discovered_relpaths = {os.path.normpath(relpath) for _, _, relpath, _ in discover_data_files()}
366
  chunk_relpaths = collect_chunk_relpaths(chunks)
367
  missing_relpaths = sorted(discovered_relpaths - chunk_relpaths)
 
26
 
27
  CHUNKS_PICKLE = os.path.join(VECTOR_DIR, "chunks.pkl")
28
  COLLECTION_NAME = "quy_che_db"
29
+ # [YEAR-AWARE CHANGE] Ho tro quet de quy va gan metadata nam hoc.
30
  SUPPORTED_FORMATS = ('.pdf', '.doc', '.docx')
31
  ACADEMIC_YEAR_PATTERN = re.compile(r"(20\d{2})\s*[-_]\s*(20\d{2})")
32
 
 
113
  return changed
114
 
115
 
116
+ # [YEAR-AWARE CHANGE] Gom doc tu toan bo thu muc data theo cau truc nam hoc.
117
  def load_and_clean_all_docs() -> List[LangChainDocument]:
118
  docs: List[LangChainDocument] = []
119
  file_entries = discover_data_files()
 
287
  logger.error(f" Lỗi đọc {filename}: {str(e)[:60]}")
288
  return []
289
 
290
+ # [YEAR-AWARE CHANGE] Cho phep tao lai collection khi phat hien file moi.
291
  def build_vectorstore_improved(recreate_collection: bool = False) -> Tuple[QdrantVectorStore, List]:
292
  logger.info(" Đang xây dựng vectorstore...")
293
  docs = load_and_clean_all_docs()
 
365
  except Exception as e:
366
  logger.error(f" Không thể cập nhật {CHUNKS_PICKLE}: {e}")
367
 
368
+ # [YEAR-AWARE CHANGE] Neu co file moi theo nam hoc, rebuild de dong bo Qdrant.
369
  discovered_relpaths = {os.path.normpath(relpath) for _, _, relpath, _ in discover_data_files()}
370
  chunk_relpaths = collect_chunk_relpaths(chunks)
371
  missing_relpaths = sorted(discovered_relpaths - chunk_relpaths)