Spaces:

VietCat
/

RAGSample

Sleeping

App Files Files Community

VietCat committed on Jun 20, 2025

Commit

823b2a9

1 Parent(s): 5186608

fix cache

Browse files

Files changed (1) hide show

rag_core/chunker.py +63 -39

rag_core/chunker.py CHANGED Viewed

@@ -5,46 +5,70 @@ import logging
 from typing import List
 from rag_core.utils import log_timed
 @log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split legal text into chunks with one regex over chapter/article/clause/point headings.

    Matches shorter than or equal to 30 characters (after stripping) are
    discarded.  When the regex yields nothing, the text is split on blank
    lines instead and only paragraphs longer than 100 characters are kept.
    A preview of every chunk (index + first 100 characters) is written as
    JSON to two locations, and a download URL is logged.

    Args:
        text: Raw text of the legal document.

    Returns:
        The list of chunk strings, in the order ``re.findall`` returned them
        (or paragraph order for the fallback).
    """
    # Pattern alternatives are tried in the order Chapter ("Chương"),
    # Article ("Điều"), Clause ("<n>."), Point ("a)" / "a.").
    # The pattern is compiled with re.VERBOSE, so the whitespace and line
    # breaks inside the literal are ignored by the regex engine.
    # NOTE(review): in the last alternative, "\\d+\." inside this raw string
    # matches a literal backslash followed by 'd' characters, not digits —
    # "\d+\." was presumably intended; confirm.
    # NOTE(review): the last alternative's "|" sits at the top level of its
    # group, so it reads as "newline + letter + ')'" OR "letter + '.' + body";
    # the first form matches the marker alone, which the length filter below
    # then discards.
    pattern = r"""(
        (?:Chương\s+[IVXLC]+\s+.*?(?=\n(?:Điều\s+\d+\.|\Z)))|
        (?:Điều\s+\d+\..*?(?=\n(?:\d+\.|Điều\s+\d+\.|Chương\s+[IVXLC]+|\Z)))|
        (?:\n\s*\d+\..*?(?=\n\s*(?:\d+\.|[a-zA-Z]\)|[a-zA-Z]\.|Điều\s+\d+\.|Chương\s+[IVXLC]+|\Z)))|
        (?:\n\s*[a-zA-Z]\)|[a-zA-Z]\..*?(?=\n\s*(?:[a-zA-Z]\)|[a-zA-Z]\.|\\d+\.|Điều\s+\d+\.|Chương\s+[IVXLC]+|\Z)))
    )"""
    # The pattern has a single capturing group, so findall returns a list of
    # strings (one per match).
    matches = re.findall(pattern, text, flags=re.DOTALL | re.VERBOSE)
    # Clean up and drop fragments that are too short
    chunks = [m.strip() for m in matches if len(m.strip()) > 30]
    # If nothing could be chunked, fall back to paragraph splitting
    if not chunks:
        logging.warning("Không tìm thấy chunk theo cấu trúc luật. Fallback sang chia đoạn văn.")
        chunks = [p.strip() for p in text.split("\n\n") if len(p.strip()) > 100]
    logging.info(f"Tổng số chunk sau khi xử lý: {len(chunks)}")
    # Log the first two chunks (truncated to 300 characters) as samples
    for i, c in enumerate(chunks[:2]):
        logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")
    # Build a preview structure (1-based index + first 100 characters) for inspection
    nested_chunks = [{"index": i + 1, "preview": c[:100]} for i, c in enumerate(chunks)]
    # Write the JSON file to two directories: one relative to the working
    # directory and one absolute path
    json_path_internal = "faiss_index/chunk_structure.json"
    json_path_public = "/home/user/app/file/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path_internal), exist_ok=True)
    os.makedirs(os.path.dirname(json_path_public), exist_ok=True)
    with open(json_path_internal, "w", encoding="utf-8") as f:
        json.dump(nested_chunks, f, ensure_ascii=False, indent=2)
    with open(json_path_public, "w", encoding="utf-8") as f:
        json.dump(nested_chunks, f, ensure_ascii=False, indent=2)
    # Build the download link from the SPACE_HOST environment variable
    hf_host = os.getenv("SPACE_HOST", "https://<your-space>.hf.space")  # placeholder fallback when SPACE_HOST is not set
    download_url = f"{hf_host}/file/chunk_structure.json"
    logging.info(f"📄 Link tải JSON: {download_url}")
    return chunks

 from typing import List
 from rag_core.utils import log_timed
 @log_timed("chunking văn bản luật")
 def chunk_legal_text(text: str) -> List[str]:
+    chapters = re.split(r"(Chương\s+[IVXLC]+\s+.*)", text)
+    nested = []
+    for i in range(1, len(chapters), 2):
+        chapter_title = chapters[i].strip()
+        chapter_body = chapters[i+1]
+        chapter = {"title": chapter_title, "articles": []}
+        articles = re.split(r"(Điều\s+\d+\..*?)\n", chapter_body)
+        for j in range(1, len(articles), 2):
+            article_title = articles[j].strip()
+            article_body = articles[j+1]
+            article = {"title": article_title, "clauses": []}
+            clause_blocks = re.split(r"\n\s*(\d+\..*?)", article_body)
+            if len(clause_blocks) < 2:
+                article["clauses"].append({"text": article_body.strip()})
+            else:
+                for k in range(1, len(clause_blocks), 2):
+                    clause_text = clause_blocks[k].strip() + "\n" + clause_blocks[k+1].strip()
+                    clause = {"text": clause_text, "points": []}
+                    point_blocks = re.split(r"\n\s*([a-zA-Z]\)|[a-zA-Z]\.)", clause_text)
+                    if len(point_blocks) < 2:
+                        clause["points"] = []
+                    else:
+                        points = []
+                        for m in range(1, len(point_blocks), 2):
+                            point = point_blocks[m].strip() + " " + point_blocks[m+1].strip()
+                            points.append(point)
+                        clause["points"] = points
+                    article["clauses"].append(clause)
+            chapter["articles"].append(article)
+        nested.append(chapter)
+    # Ghi cấu trúc nested ra file JSON (public file path)
+    json_path = "faiss_index/chunk_structure.json"
+    os.makedirs(os.path.dirname(json_path), exist_ok=True)
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(nested, f, ensure_ascii=False, indent=2)
+    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path}")
+    logging.info(f"📎 Link tải: /file/{json_path}")
+    # Flatten lại để trả về danh sách chunk đơn thuần
+    flat_chunks = []
+    for chapter in nested:
+        for article in chapter["articles"]:
+            if not article["clauses"]:
+                flat_chunks.append(f"{article['title']}\n\n")
+                continue
+            for clause in article["clauses"]:
+                if not clause["points"]:
+                    flat_chunks.append(f"{article['title']}\n{clause['text']}")
+                else:
+                    for point in clause["points"]:
+                        flat_chunks.append(f"{article['title']}\n{point}")
+    logging.info(f"Tổng số chunk sau khi xử lý: {len(flat_chunks)}")
+    for i, c in enumerate(flat_chunks[:2]):
         logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")
+    return flat_chunks