fix cache
Browse files- rag_core/chunker.py +57 -57
rag_core/chunker.py
CHANGED
|
@@ -7,68 +7,68 @@ from rag_core.utils import log_timed
|
|
| 7 |
|
| 8 |
@log_timed("chunking văn bản luật")
|
| 9 |
def chunk_legal_text(text: str) -> List[str]:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
chapter_body = chapters[i+1]
|
| 16 |
-
chapter = {"title": chapter_title, "articles": []}
|
| 17 |
-
|
| 18 |
-
articles = re.split(r"(Điều\s+\d+\..*?)\n", chapter_body)
|
| 19 |
-
for j in range(1, len(articles), 2):
|
| 20 |
-
article_title = articles[j].strip()
|
| 21 |
-
article_body = articles[j+1]
|
| 22 |
-
article = {"title": article_title, "clauses": []}
|
| 23 |
-
|
| 24 |
-
clause_blocks = re.split(r"\n\s*(\d+\..*?)", article_body)
|
| 25 |
-
if len(clause_blocks) < 2:
|
| 26 |
-
article["clauses"].append({"text": article_body.strip()})
|
| 27 |
-
else:
|
| 28 |
-
for k in range(1, len(clause_blocks), 2):
|
| 29 |
-
clause_text = clause_blocks[k].strip() + "\n" + clause_blocks[k+1].strip()
|
| 30 |
-
clause = {"text": clause_text, "points": []}
|
| 31 |
-
|
| 32 |
-
point_blocks = re.split(r"\n\s*([a-zA-Z]\)|[a-zA-Z]\.)", clause_text)
|
| 33 |
-
if len(point_blocks) < 2:
|
| 34 |
-
clause["points"] = []
|
| 35 |
-
else:
|
| 36 |
-
points = []
|
| 37 |
-
for m in range(1, len(point_blocks), 2):
|
| 38 |
-
point = point_blocks[m].strip() + " " + point_blocks[m+1].strip()
|
| 39 |
-
points.append(point)
|
| 40 |
-
clause["points"] = points
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
article["clauses"].append(clause)
|
|
|
|
|
|
|
|
|
|
| 43 |
chapter["articles"].append(article)
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
# Ghi cấu trúc nested ra file JSON (public file path)
|
| 47 |
-
json_path = "faiss_index/chunk_structure.json"
|
| 48 |
-
os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
| 49 |
-
with open(json_path, "w", encoding="utf-8") as f:
|
| 50 |
-
json.dump(nested, f, ensure_ascii=False, indent=2)
|
| 51 |
-
|
| 52 |
-
logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path}")
|
| 53 |
-
logging.info(f"📎 Link tải: /file/{json_path}")
|
| 54 |
|
| 55 |
-
# Flatten
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
flat_chunks.append(f"{article['title']}\n\n")
|
| 62 |
continue
|
| 63 |
-
for clause in article
|
| 64 |
-
if not clause
|
| 65 |
-
|
| 66 |
else:
|
| 67 |
-
for point in clause
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
return
|
|
|
|
@log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal document into flat retrieval chunks.

    The text is parsed into a nested structure of
    Chương (chapter) → Điều (article) → Khoản (clause) → Điểm (point),
    which is written to ``faiss_index/chunk_structure.json`` for inspection,
    then flattened into a list of strings: one chunk per point, falling back
    to the whole clause when a clause has no points, and to the whole
    article when it has no clauses.

    Args:
        text: Full plain-text body of the legal document.

    Returns:
        List of chunk strings ready for embedding/indexing.
    """
    # Regex patterns for each structural level. Each pattern captures one
    # section (DOTALL, non-greedy) and stops at the next sibling heading,
    # the next parent heading, or end of text.
    chapter_pattern = r"(Chương\s+[IVXLC]+\s+.+?)(?=(?:\nChương\s+[IVXLC]+\s+)|\Z)"
    article_pattern = r"(Điều\s+\d+\..+?)(?=(?:\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    clause_pattern = r"(?:\n|\A)(\d+\..+?)(?=(?:\n\d+\.|\n[a-zA-Z]\)|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    # BUG FIX: the original class "[\)|\.]" contained a literal "|", so a
    # line like "a| ..." was also treated as a point marker. "[.)]" matches
    # only the intended "a)" / "a." markers.
    point_pattern = r"(?:\n|\A)([a-zA-Z][.)].+?)(?=(?:\n[a-zA-Z][.)]|\n\d+\.|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"

    chapters = []
    for chapter_match in re.finditer(chapter_pattern, text, flags=re.DOTALL):
        chapter_text = chapter_match.group(1).strip()
        chapter = {
            "title": chapter_text.split("\n")[0],
            "articles": [],
        }
        # Body of the chapter after its heading line.
        article_block = chapter_text[len(chapter["title"]):].strip()
        for article_match in re.finditer(article_pattern, article_block, flags=re.DOTALL):
            article_text = article_match.group(1).strip()
            article = {
                "title": article_text.split("\n")[0],
                # Keep the full article text so nothing is lost when the
                # article has no recognizable clauses (see flatten below).
                "text": article_text,
                "clauses": [],
            }
            clause_block = article_text[len(article["title"]):].strip()
            for clause_text in re.findall(clause_pattern, clause_block, flags=re.DOTALL):
                clause = {
                    "title": clause_text.split("\n")[0],
                    # Full clause text, kept for the no-points fallback.
                    "text": clause_text.strip(),
                    "points": [],
                }
                point_block = clause_text[len(clause["title"]):].strip()
                points = re.findall(point_pattern, point_block, flags=re.DOTALL)
                if points:
                    # Drop very short fragments, which are usually regex
                    # noise rather than real points.
                    clause["points"] = [p.strip() for p in points if len(p.strip()) > 10]
                article["clauses"].append(clause)
            # NOTE: the original had a redundant `else: article["clauses"] = []`
            # branch here — "clauses" is already initialized to [].
            chapter["articles"].append(article)
        chapters.append(chapter)

    # Flatten the nested structure into chunks: one per point, with
    # clause-level and article-level fallbacks.
    chunks: List[str] = []
    for chapter in chapters:
        for article in chapter.get("articles", []):
            if not article.get("clauses"):
                # BUG FIX: the original appended only article["title"],
                # silently dropping the article body from the index.
                chunks.append(article.get("text", article["title"]))
                continue
            for clause in article["clauses"]:
                if clause["points"]:
                    chunks.extend(clause["points"])
                else:
                    # BUG FIX: the original appended only the clause title
                    # line, dropping the rest of the clause text.
                    chunks.append(clause.get("text", clause["title"]))

    # Persist the nested structure as JSON for inspection/debugging.
    json_path_local = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path_local), exist_ok=True)
    with open(json_path_local, "w", encoding="utf-8") as f:
        json.dump(chapters, f, indent=2, ensure_ascii=False)
    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path_local}")

    # Download path hint for Hugging Face Spaces static file serving.
    download_link = "/file/faiss_index/chunk_structure.json"
    logging.info(f"📎 Link tải JSON cấu trúc: {download_link}")

    return chunks