VietCat committed on
Commit
3e013fc
·
1 Parent(s): 823b2a9
Files changed (1) hide show
  1. rag_core/chunker.py +57 -57
rag_core/chunker.py CHANGED
@@ -7,68 +7,68 @@ from rag_core.utils import log_timed
7
 
8
@log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal document into retrieval chunks.

    The text is parsed hierarchically (Chương -> Điều -> Khoản -> Điểm)
    using ``re.split`` with capturing groups, so headings land at odd
    indices and their bodies at the following even index. The nested
    structure is written to ``faiss_index/chunk_structure.json`` as a
    side effect, then flattened into plain-text chunks.

    Args:
        text: Full plain-text content of the legal document.

    Returns:
        Flat list of chunk strings, each prefixed with its article title.
    """
    chapters = re.split(r"(Chương\s+[IVXLC]+\s+.*)", text)
    nested = []

    for i in range(1, len(chapters), 2):
        chapter = {"title": chapters[i].strip(), "articles": []}
        chapter_body = chapters[i + 1]

        articles = re.split(r"(Điều\s+\d+\..*?)\n", chapter_body)
        for j in range(1, len(articles), 2):
            article = {"title": articles[j].strip(), "clauses": []}
            article_body = articles[j + 1]

            # BUG FIX: the original lazy pattern (\d+\..*?) captured only
            # the bare clause number ("1.") and spliced an artificial
            # newline into the clause heading; capture the whole line.
            clause_blocks = re.split(r"\n\s*(\d+\..*)", article_body)
            if len(clause_blocks) < 2:
                # BUG FIX: the original appended {"text": ...} without a
                # "points" key, so the flatten loop below crashed with
                # KeyError on any article that has no numbered clauses.
                article["clauses"].append({"text": article_body.strip(), "points": []})
            else:
                for k in range(1, len(clause_blocks), 2):
                    clause_text = clause_blocks[k].strip() + "\n" + clause_blocks[k + 1].strip()
                    clause = {"text": clause_text, "points": []}

                    # Point markers: "a)" or "a." at line start. NOTE(review):
                    # the "letter + dot" form can false-positive on
                    # abbreviations; kept for compatibility with callers.
                    point_blocks = re.split(r"\n\s*([a-zA-Z]\)|[a-zA-Z]\.)", clause_text)
                    if len(point_blocks) >= 2:
                        clause["points"] = [
                            point_blocks[m].strip() + " " + point_blocks[m + 1].strip()
                            for m in range(1, len(point_blocks), 2)
                        ]
                    article["clauses"].append(clause)
            chapter["articles"].append(article)
        nested.append(chapter)

    # Ghi cấu trúc nested ra file JSON (public file path)
    json_path = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(nested, f, ensure_ascii=False, indent=2)

    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path}")
    logging.info(f"📎 Link tải: /file/{json_path}")

    # Flatten lại để trả về danh sách chunk đơn thuần
    flat_chunks = []
    for chapter in nested:
        for article in chapter["articles"]:
            if not article["clauses"]:
                flat_chunks.append(f"{article['title']}\n\n")
                continue
            for clause in article["clauses"]:
                if not clause["points"]:
                    flat_chunks.append(f"{article['title']}\n{clause['text']}")
                else:
                    for point in clause["points"]:
                        flat_chunks.append(f"{article['title']}\n{point}")

    logging.info(f"Tổng số chunk sau khi xử lý: {len(flat_chunks)}")
    for i, c in enumerate(flat_chunks[:2]):
        logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")

    return flat_chunks
 
7
 
8
@log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal document into retrieval chunks.

    The text is parsed hierarchically (Chương -> Điều -> Khoản -> Điểm)
    with lookahead-based regular expressions: each level matches lazily
    up to the start of the next sibling or any higher-level heading.
    The nested structure is written to
    ``faiss_index/chunk_structure.json`` as a side effect, and a flat
    list of chunk strings is returned for indexing.

    Args:
        text: Full plain-text content of the legal document.

    Returns:
        Flat list of chunk strings — one per điểm, per khoản without
        điểm, or per điều without khoản — each prefixed with its
        article title so retrieval keeps context.
    """
    # Lookahead-based patterns for each structural level.
    chapter_pattern = r"(Chương\s+[IVXLC]+\s+.+?)(?=(?:\nChương\s+[IVXLC]+\s+)|\Z)"
    article_pattern = r"(Điều\s+\d+\..+?)(?=(?:\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    clause_pattern = r"(?:\n|\A)(\d+\..+?)(?=(?:\n\d+\.|\n[a-zA-Z]\)|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    # BUG FIX: the original class [\)|\.] also treated a literal '|'
    # after the letter as a point marker; only ')' or '.' qualify.
    point_pattern = r"(?:\n|\A)([a-zA-Z][).].+?)(?=(?:\n[a-zA-Z][).]|\n\d+\.|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"

    chapters = []
    for chapter_match in re.finditer(chapter_pattern, text, flags=re.DOTALL):
        chapter_text = chapter_match.group(1).strip()
        chapter = {
            "title": chapter_text.split("\n")[0],
            "articles": []
        }
        article_block = chapter_text[len(chapter["title"]):].strip()
        for article_match in re.finditer(article_pattern, article_block, flags=re.DOTALL):
            article_text = article_match.group(1).strip()
            article = {
                "title": article_text.split("\n")[0],
                "clauses": []
            }
            clause_block = article_text[len(article["title"]):].strip()
            # Keep the full article body so it is not lost when the
            # article has no numbered clauses (extra key is
            # backward-compatible in the dumped JSON).
            article["text"] = clause_block
            for raw_clause in re.findall(clause_pattern, clause_block, flags=re.DOTALL):
                clause_text = raw_clause.strip()
                clause = {
                    "title": clause_text.split("\n")[0],
                    # BUG FIX: keep the whole clause, not only its first
                    # line, so multi-line clauses are not truncated.
                    "text": clause_text,
                    "points": []
                }
                point_block = clause_text[len(clause["title"]):].strip()
                points = re.findall(point_pattern, point_block, flags=re.DOTALL)
                # Very short matches are treated as noise (e.g. a stray
                # letter + dot from an abbreviation) and dropped.
                clause["points"] = [p.strip() for p in points if len(p.strip()) > 10]
                article["clauses"].append(clause)
            # (dead branch removed: article["clauses"] was already [])
            chapter["articles"].append(article)
        chapters.append(chapter)

    # Flatten để tạo chunks
    chunks = []
    for chapter in chapters:
        for article in chapter.get("articles", []):
            if not article.get("clauses"):
                # BUG FIX: emit the article body too, not only the title,
                # so article-level content is not silently dropped.
                chunks.append(f"{article['title']}\n{article.get('text', '')}".strip())
                continue
            for clause in article.get("clauses", []):
                if not clause.get("points"):
                    # BUG FIX: use the full clause text instead of only
                    # its first line.
                    chunks.append(f"{article['title']}\n{clause['text']}")
                else:
                    for point in clause["points"]:
                        chunks.append(f"{article['title']}\n{point}")

    # Log cấu trúc nested ra file JSON
    json_path_local = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path_local), exist_ok=True)
    with open(json_path_local, "w", encoding="utf-8") as f:
        json.dump(chapters, f, indent=2, ensure_ascii=False)
    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path_local}")

    # Gợi ý đường dẫn tải cho Hugging Face Spaces
    download_link = "/file/faiss_index/chunk_structure.json"
    logging.info(f"📎 Link tải JSON cấu trúc: {download_link}")

    return chunks