Spaces:

VietCat
/

RAGSample

Sleeping

VietCat committed on Jun 19, 2025

Commit

6211abf

1 Parent(s): 11c6c99

init project

Files changed (1) hide show

rag_core/chunker.py CHANGED Viewed

@@ -1,9 +1,22 @@
 import re
 from typing import List
 from rag_core.utils import log_timed
 @log_timed("chunking văn bản luật")
 def chunk_legal_text(text: str) -> List[str]:
     # Pre-commit version: split `text` on "Chương" (chapter) / "Điều" (article)
     # headings and return the stripped chunks longer than 30 characters.
     #
     # NOTE(review): the pattern is a raw string, so every "\\s", "\\d" and "\\."
     # reaches the regex engine as an escaped literal backslash followed by an
     # ordinary "s+", "d+" or "." -- not as a whitespace/digit/dot escape. The
     # pattern therefore only matches text that contains literal backslashes
     # right after "Chương"/"Điều"; a heading written as "Điều 1." does not match.
     # The chapter alternative also requires a colon right after the numeral.
-    pattern = r"(Chương\\s+[IVXLC]+:.*?|Điều\\s+\\d+\\..*?)(?=(Chương\\s+[IVXLC]+:|Điều\\s+\\d+\\.|$))"
     # The pattern has two capture groups, so findall returns one tuple per match:
     # index 0 is the chunk text, index 1 is the group inside the lookahead.
     matches = re.findall(pattern, text, flags=re.DOTALL)
-    return [m[0].strip() for m in matches if len(m[0].strip()) > 30]

 import re
 from typing import List
 from rag_core.utils import log_timed
+import logging
 @log_timed("chunking văn bản luật")
 def chunk_legal_text(text: str) -> List[str]:
     # Split `text` into chunks and return them as a list of stripped strings.
     # Primary strategy: regex split on chapter/article headings.
     # Fallback (when that yields nothing): split on blank-line paragraphs.
+    # Chunk by "Chương" (chapter) and "Điều" (article) headings.
     # Each match starts at "Chương <roman numeral><whitespace>" or
     # "Điều <digits>." and extends lazily until the lookahead sees the next
     # such heading or the end of the text.
     # NOTE(review): the heading alternatives are not anchored to a line start,
     # so an inline mention such as "... Điều 5. ..." also begins a new chunk;
     # text before the first heading is not part of any match.
+    pattern = r"(Chương\s+[IVXLC]+\s+.*?|Điều\s+\d+\..*?)(?=(Chương\s+[IVXLC]+\s+|Điều\s+\d+\.|$))"
     # DOTALL lets "." cross newlines so a chunk can span several lines. With two
     # capture groups findall returns tuples: index 0 is the chunk text, index 1
     # is the group inside the lookahead.
     matches = re.findall(pattern, text, flags=re.DOTALL)
     # Keep only chunks longer than 30 characters after stripping whitespace.
+    chunks = [m[0].strip() for m in matches if len(m[0].strip()) > 30]
+    if not chunks:
         # Warning text: "No chunks found by Chương/Điều. Falling back to
         # splitting by paragraph."
+        logging.warning("Không tìm thấy chunk theo Chương/Điều. Đang fallback sang chia theo đoạn văn.")
         # Paragraphs are the pieces between "\n\n" separators; only those longer
         # than 100 characters after stripping are kept.
+        chunks = [p.strip() for p in text.split("\n\n") if len(p.strip()) > 100]
     # Info text: "Total number of chunks after processing: <n>".
+    logging.info(f"Tổng số chunk sau khi xử lý: {len(chunks)}")
     # Log at most the first two chunks, truncated to 300 characters each, as
     # samples ("Mẫu chunk" = "sample chunk").
+    for i, c in enumerate(chunks[:2]):
+        logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")
+    return chunks