VietCat committed on
Commit
3e013fc
·
1 Parent(s): 823b2a9
Files changed (1) hide show
  1. rag_core/chunker.py +57 -57
rag_core/chunker.py CHANGED
@@ -7,68 +7,68 @@ from rag_core.utils import log_timed
7
 
8
@log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal document into retrieval chunks.

    The text is parsed hierarchically (Chương -> Điều -> Khoản -> Điểm)
    using ``re.split`` with capturing groups, so headings land at odd
    indices and their bodies at the following even index. The nested
    structure is written to ``faiss_index/chunk_structure.json`` as a
    side effect, then flattened into plain-text chunks.

    Args:
        text: Full plain-text content of the legal document.

    Returns:
        Flat list of chunk strings, each prefixed with its article title.
    """
    chapters = re.split(r"(Chương\s+[IVXLC]+\s+.*)", text)
    nested = []

    for i in range(1, len(chapters), 2):
        chapter = {"title": chapters[i].strip(), "articles": []}
        chapter_body = chapters[i + 1]

        articles = re.split(r"(Điều\s+\d+\..*?)\n", chapter_body)
        for j in range(1, len(articles), 2):
            article = {"title": articles[j].strip(), "clauses": []}
            article_body = articles[j + 1]

            # BUG FIX: the original lazy pattern (\d+\..*?) captured only
            # the bare clause number ("1.") and spliced an artificial
            # newline into the clause heading; capture the whole line.
            clause_blocks = re.split(r"\n\s*(\d+\..*)", article_body)
            if len(clause_blocks) < 2:
                # BUG FIX: the original appended {"text": ...} without a
                # "points" key, so the flatten loop below crashed with
                # KeyError on any article that has no numbered clauses.
                article["clauses"].append({"text": article_body.strip(), "points": []})
            else:
                for k in range(1, len(clause_blocks), 2):
                    clause_text = clause_blocks[k].strip() + "\n" + clause_blocks[k + 1].strip()
                    clause = {"text": clause_text, "points": []}

                    # Point markers: "a)" or "a." at line start. NOTE(review):
                    # the "letter + dot" form can false-positive on
                    # abbreviations; kept for compatibility with callers.
                    point_blocks = re.split(r"\n\s*([a-zA-Z]\)|[a-zA-Z]\.)", clause_text)
                    if len(point_blocks) >= 2:
                        clause["points"] = [
                            point_blocks[m].strip() + " " + point_blocks[m + 1].strip()
                            for m in range(1, len(point_blocks), 2)
                        ]
                    article["clauses"].append(clause)
            chapter["articles"].append(article)
        nested.append(chapter)

    # Ghi cấu trúc nested ra file JSON (public file path)
    json_path = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(nested, f, ensure_ascii=False, indent=2)

    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path}")
    logging.info(f"📎 Link tải: /file/{json_path}")

    # Flatten lại để trả về danh sách chunk đơn thuần
    flat_chunks = []
    for chapter in nested:
        for article in chapter["articles"]:
            if not article["clauses"]:
                flat_chunks.append(f"{article['title']}\n\n")
                continue
            for clause in article["clauses"]:
                if not clause["points"]:
                    flat_chunks.append(f"{article['title']}\n{clause['text']}")
                else:
                    for point in clause["points"]:
                        flat_chunks.append(f"{article['title']}\n{point}")

    logging.info(f"Tổng số chunk sau khi xử lý: {len(flat_chunks)}")
    for i, c in enumerate(flat_chunks[:2]):
        logging.info(f"Mẫu chunk {i+1}:\n{c[:300]}...\n")

    return flat_chunks
 
7
 
8
@log_timed("chunking văn bản luật")
def chunk_legal_text(text: str) -> List[str]:
    """Split a Vietnamese legal document into retrieval chunks.

    The text is parsed hierarchically (Chương -> Điều -> Khoản -> Điểm)
    with lookahead-based regular expressions: each level matches lazily
    up to the start of the next sibling or any higher-level heading.
    The nested structure is written to
    ``faiss_index/chunk_structure.json`` as a side effect, and a flat
    list of chunk strings is returned for indexing.

    Args:
        text: Full plain-text content of the legal document.

    Returns:
        Flat list of chunk strings — one per điểm, per khoản without
        điểm, or per điều without khoản — each prefixed with its
        article title so retrieval keeps context.
    """
    # Lookahead-based patterns for each structural level.
    chapter_pattern = r"(Chương\s+[IVXLC]+\s+.+?)(?=(?:\nChương\s+[IVXLC]+\s+)|\Z)"
    article_pattern = r"(Điều\s+\d+\..+?)(?=(?:\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    clause_pattern = r"(?:\n|\A)(\d+\..+?)(?=(?:\n\d+\.|\n[a-zA-Z]\)|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
    # BUG FIX: the original class [\)|\.] also treated a literal '|'
    # after the letter as a point marker; only ')' or '.' qualify.
    point_pattern = r"(?:\n|\A)([a-zA-Z][).].+?)(?=(?:\n[a-zA-Z][).]|\n\d+\.|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"

    chapters = []
    for chapter_match in re.finditer(chapter_pattern, text, flags=re.DOTALL):
        chapter_text = chapter_match.group(1).strip()
        chapter = {
            "title": chapter_text.split("\n")[0],
            "articles": []
        }
        article_block = chapter_text[len(chapter["title"]):].strip()
        for article_match in re.finditer(article_pattern, article_block, flags=re.DOTALL):
            article_text = article_match.group(1).strip()
            article = {
                "title": article_text.split("\n")[0],
                "clauses": []
            }
            clause_block = article_text[len(article["title"]):].strip()
            # Keep the full article body so it is not lost when the
            # article has no numbered clauses (extra key is
            # backward-compatible in the dumped JSON).
            article["text"] = clause_block
            for raw_clause in re.findall(clause_pattern, clause_block, flags=re.DOTALL):
                clause_text = raw_clause.strip()
                clause = {
                    "title": clause_text.split("\n")[0],
                    # BUG FIX: keep the whole clause, not only its first
                    # line, so multi-line clauses are not truncated.
                    "text": clause_text,
                    "points": []
                }
                point_block = clause_text[len(clause["title"]):].strip()
                points = re.findall(point_pattern, point_block, flags=re.DOTALL)
                # Very short matches are treated as noise (e.g. a stray
                # letter + dot from an abbreviation) and dropped.
                clause["points"] = [p.strip() for p in points if len(p.strip()) > 10]
                article["clauses"].append(clause)
            # (dead branch removed: article["clauses"] was already [])
            chapter["articles"].append(article)
        chapters.append(chapter)

    # Flatten để tạo chunks
    chunks = []
    for chapter in chapters:
        for article in chapter.get("articles", []):
            if not article.get("clauses"):
                # BUG FIX: emit the article body too, not only the title,
                # so article-level content is not silently dropped.
                chunks.append(f"{article['title']}\n{article.get('text', '')}".strip())
                continue
            for clause in article.get("clauses", []):
                if not clause.get("points"):
                    # BUG FIX: use the full clause text instead of only
                    # its first line.
                    chunks.append(f"{article['title']}\n{clause['text']}")
                else:
                    for point in clause["points"]:
                        chunks.append(f"{article['title']}\n{point}")

    # Log cấu trúc nested ra file JSON
    json_path_local = "faiss_index/chunk_structure.json"
    os.makedirs(os.path.dirname(json_path_local), exist_ok=True)
    with open(json_path_local, "w", encoding="utf-8") as f:
        json.dump(chapters, f, indent=2, ensure_ascii=False)
    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path_local}")

    # Gợi ý đường dẫn tải cho Hugging Face Spaces
    download_link = "/file/faiss_index/chunk_structure.json"
    logging.info(f"📎 Link tải JSON cấu trúc: {download_link}")

    return chunks