RAGSample / rag_core /chunker.py
VietCat's picture
fix log and download file
7bc48c0
import re
import json
import os
import logging
# Configure the root logger at import time so progress messages from this
# module are emitted with a timestamp and level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Article header such as "Điều 12. <title>" or "Điều 12a. <title>".
# group(1) is the article label, group(2) the remainder of the line (title).
# The optional suffix letter is accepted in either case: with an
# uppercase-only class, "Điều 12a." was read as article "Điều 12" whose
# title started with "a. ".
SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Za-z]?)\.?\s*(.*)")

# Numbered clause such as "1. <text>" (the dot is optional, whitespace is not).
# group(1) is the clause number, group(2) the clause text.
CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")

# Lettered point such as "a) <text>".
# group(1) is the point letter, group(2) the point text.
# "đ"/"Đ" are listed explicitly because they are outside the ASCII ranges;
# without them a "đ)" line is not recognised as a point and gets glued onto
# the preceding point's text.
POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

# Location where the parsed article/clause/point structure is dumped as JSON.
PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"
def chunk_legal_text(text: str) -> list:
    """Split a legal text into flat string chunks, one per leaf unit.

    The text is read line by line. A line matching ``SECTION_RE`` opens a new
    article, a line matching ``CLAUSE_RE`` opens a clause inside the current
    article, and a line matching ``POINT_RE`` opens a point inside the current
    clause. Any other non-blank line is treated as a continuation of the most
    recent point, clause, or article body. Lines that appear before the first
    article header are ignored.

    As a side effect the nested article structure is written as JSON to
    ``PUBLIC_CHUNK_JSON_PATH``. A failure to write is logged and does not
    prevent the chunks from being returned.

    Args:
        text: Full text of the document. ``None`` is treated as empty.

    Returns:
        A list of strings. Each chunk starts with the article header
        ("<article>. <title>") followed, on separate lines, by the article
        body, or by the clause line and (if any) a single point line.
    """
    logging.info("📑 Bắt đầu chunk văn bản luật...")
    articles = _parse_articles(text)
    logging.info("🔎 Đã phân tích được %d điều luật", len(articles))

    chunks = _build_chunks(articles)
    _save_article_structure(articles)
    return chunks


def _parse_articles(text):
    """Parse raw text into a list of nested article/clause/point dicts."""
    articles = []
    article = None
    clause = None

    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        section = SECTION_RE.match(line)
        if section:
            article = {
                "article": section.group(1),
                "title": section.group(2),
                "text": "",
                "clauses": [],
            }
            # The dict is filled in place afterwards, so it can be registered
            # immediately instead of when the next header is reached.
            articles.append(article)
            clause = None
            continue

        if article is None:
            # Preamble before the first article header belongs to no chunk.
            continue

        numbered = CLAUSE_RE.match(line)
        if numbered:
            clause = {
                "clause": numbered.group(1),
                "text": numbered.group(2),
                "points": [],
            }
            article["clauses"].append(clause)
            continue

        if clause is None:
            # Body text between the article header and its first clause.
            # Previously this text was silently discarded, which emptied
            # articles that have no numbered clauses at all.
            article["text"] = f"{article['text']} {line}" if article["text"] else line
            continue

        lettered = POINT_RE.match(line)
        if lettered:
            clause["points"].append({
                "point": lettered.group(1),
                "text": lettered.group(2),
            })
        elif clause["points"]:
            # Wrapped line: continue the most recent point.
            clause["points"][-1]["text"] += " " + line
        else:
            # Wrapped line: continue the clause itself.
            clause["text"] += " " + line

    return articles


def _build_chunks(articles):
    """Flatten parsed articles into one chunk string per leaf unit."""
    chunks = []
    for article in articles:
        article_header = f"{article['article']}. {article['title']}"
        body = article.get("text")
        clauses = article.get("clauses") or []

        if body:
            chunks.append(f"{article_header}\n{body}")
        elif not clauses:
            # Header-only article: keep it retrievable by its title.
            chunks.append(article_header)

        for clause in clauses:
            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
            points = clause.get("points")
            if not points:
                chunks.append(f"{article_header}\n{clause_header}")
                continue
            chunks.extend(
                f"{article_header}\n{clause_header}\n{point['point']}) {point['text']}"
                for point in points
            )
    return chunks


def _save_article_structure(articles):
    """Dump the nested structure to PUBLIC_CHUNK_JSON_PATH, logging any failure."""
    try:
        directory = os.path.dirname(PUBLIC_CHUNK_JSON_PATH)
        # os.makedirs("") raises, so skip it when the path has no directory part.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        logging.info("✅ Đã ghi cấu trúc nested vào %s", PUBLIC_CHUNK_JSON_PATH)
        if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
        else:
            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
    except Exception as e:
        # Writing the structure file is best-effort: chunking must still
        # succeed when the file cannot be written, so only log the error.
        logging.error("❌ Lỗi khi ghi file JSON: %s", e)