fix cache
Browse files- rag_core/chunker.py +63 -39
rag_core/chunker.py
CHANGED
|
@@ -5,46 +5,70 @@ import logging
|
|
| 5 |
from typing import List
|
| 6 |
from rag_core.utils import log_timed
|
| 7 |
|
| 8 |
-
|
| 9 |
@log_timed("chunking văn bản luật")
|
| 10 |
def chunk_legal_text(text: str) -> List[str]:
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")
|
| 31 |
|
| 32 |
-
|
| 33 |
-
nested_chunks = [{"index": i + 1, "preview": c[:100]} for i, c in enumerate(chunks)]
|
| 34 |
-
|
| 35 |
-
# Ghi ra file JSON vào thư mục truy cập được
|
| 36 |
-
json_path_internal = "faiss_index/chunk_structure.json"
|
| 37 |
-
json_path_public = "/home/user/app/file/chunk_structure.json"
|
| 38 |
-
os.makedirs(os.path.dirname(json_path_internal), exist_ok=True)
|
| 39 |
-
os.makedirs(os.path.dirname(json_path_public), exist_ok=True)
|
| 40 |
-
with open(json_path_internal, "w", encoding="utf-8") as f:
|
| 41 |
-
json.dump(nested_chunks, f, ensure_ascii=False, indent=2)
|
| 42 |
-
with open(json_path_public, "w", encoding="utf-8") as f:
|
| 43 |
-
json.dump(nested_chunks, f, ensure_ascii=False, indent=2)
|
| 44 |
-
|
| 45 |
-
# Tự động tạo link tải từ domain hiện tại
|
| 46 |
-
hf_host = os.getenv("SPACE_HOST", "https://<your-space>.hf.space") # fallback nếu không set
|
| 47 |
-
download_url = f"{hf_host}/file/chunk_structure.json"
|
| 48 |
-
logging.info(f"📄 Link tải JSON: {download_url}")
|
| 49 |
-
|
| 50 |
-
return chunks
|
|
|
|
| 5 |
from typing import List
|
| 6 |
from rag_core.utils import log_timed
|
| 7 |
|
|
|
|
| 8 |
@log_timed("chunking văn bản luật")
|
| 9 |
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal text into flat chunks for retrieval.

    The text is parsed into a nested structure: chapter ("Chương" followed
    by a Roman numeral) -> article ("Điều N.") -> numbered clause ("1.")
    -> lettered point ("a)" or "a."). The nested structure is written to
    ``faiss_index/chunk_structure.json`` and a flattened list of chunks is
    returned.

    Every returned chunk starts with the title of the article it belongs
    to, followed by a newline and either one point (when the clause has
    points) or the whole clause text (when it has none).

    Anything before the first chapter heading is ignored, so a text with no
    chapter heading yields an empty list.

    Args:
        text: Full text of the legal document.

    Returns:
        The chunks, in document order.

    Raises:
        OSError: If the JSON structure file cannot be written.
    """
    # re.split with a single capture group returns
    # [before, title_1, body_1, title_2, body_2, ...], hence the 1::2 / 2::2
    # pairing used here and in the helpers below.
    chapter_parts = re.split(r"(Chương\s+[IVXLC]+\s+.*)", text)
    nested = []
    for chapter_title, chapter_body in zip(chapter_parts[1::2], chapter_parts[2::2]):
        article_parts = re.split(r"(Điều\s+\d+\..*?)\n", chapter_body)
        nested.append({
            "title": chapter_title.strip(),
            "articles": [
                {"title": article_title.strip(), "clauses": _parse_clauses(article_body)}
                for article_title, article_body in zip(article_parts[1::2], article_parts[2::2])
            ],
        })

    # Dump the nested structure to a JSON file for inspection/download.
    json_path = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(nested, f, ensure_ascii=False, indent=2)

    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path}")
    logging.info(f"📎 Link tải: /file/{json_path}")

    # Flatten the structure back into a plain list of chunks.
    flat_chunks = []
    for chapter in nested:
        for article in chapter["articles"]:
            # _parse_clauses always yields at least one clause, and every
            # clause carries a "points" list, so no key can be missing here.
            for clause in article["clauses"]:
                if clause["points"]:
                    flat_chunks.extend(
                        f"{article['title']}\n{point}" for point in clause["points"]
                    )
                else:
                    flat_chunks.append(f"{article['title']}\n{clause['text']}")

    logging.info(f"Tổng số chunk sau khi xử lý: {len(flat_chunks)}")
    for i, c in enumerate(flat_chunks[:2]):
        logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")

    return flat_chunks


def _parse_clauses(article_body: str) -> List[dict]:
    """Split an article body into clause dicts with "text" and "points" keys.

    Always returns at least one clause. An article without numbered clauses
    becomes a single clause holding the whole body.
    """
    # The article split consumes the newline after the title, so the first
    # clause usually sits at the very start of the body; accept "start of
    # string" as well as "after a newline" so that clause is not lost.
    blocks = re.split(r"(?:^|\n)\s*(\d+\.)", article_body)
    if len(blocks) < 2:
        return [{"text": article_body.strip(), "points": []}]

    clauses = []
    lead_in = blocks[0].strip()
    if lead_in:
        # Text preceding the first numbered clause is kept as its own clause
        # rather than being discarded.
        clauses.append({"text": lead_in, "points": []})
    for number, body in zip(blocks[1::2], blocks[2::2]):
        # The newline after the number is kept: point detection below relies
        # on a point label being preceded by a newline.
        clause_text = number.strip() + "\n" + body.strip()
        clauses.append({"text": clause_text, "points": _split_points(clause_text)})
    return clauses


def _split_points(clause_text: str) -> List[str]:
    """Return the lettered points ("a) ...", "b. ...") found in a clause.

    Returns an empty list when the clause has no points. The text before the
    first point is not part of any returned point.
    """
    # "đ" is included because Vietnamese point labels run a), b), c), d), đ), e)...
    blocks = re.split(r"\n\s*([a-zA-ZđĐ]\)|[a-zA-ZđĐ]\.)", clause_text)
    return [
        label.strip() + " " + body.strip()
        for label, body in zip(blocks[1::2], blocks[2::2])
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|