Spaces:

VietCat
/

RAGSample

Sleeping

App Files Files Community

RAGSample / rag_core /chunker.py

VietCat

fix log and download file

7bc48c0 8 months ago

raw

history blame contribute delete

3.4 kB

	import re
	import json
	import os
	import logging

	# Configure the root logger at import time so the chunker's progress messages are emitted.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

	# The patterns below tolerate optional leading whitespace (`\s*`) so they match both raw
	# and already-stripped lines, and capture the whole remainder of the line with `(.*)`.

	# Article heading such as "Điều 12A. Title": group 1 = article label, group 2 = title.
	SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Z]?)\.?\s*(.*)")
	# Numbered clause such as "1. Text": group 1 = clause number, group 2 = clause text.
	CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")
	# Lettered point such as "a) Text" or "đ) Text": group 1 = point letter, group 2 = point text.
	# "đ" is included because Vietnamese legal texts label points with the Vietnamese alphabet.
	POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

	# Relative path of the JSON file that receives the nested article/clause/point structure.
	PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"


	def _parse_articles(text):
	    """Parse legal text into a nested list of article -> clauses -> points dicts.

	    Lines are stripped and blank lines skipped. A line matching SECTION_RE starts
	    a new article, CLAUSE_RE starts a new clause inside the current article, and
	    POINT_RE starts a new point inside the current clause. Any other line is a
	    continuation of the most recent point, else of the current clause, else of
	    the current article's body text. Lines before the first article are ignored.
	    """
	    articles = []
	    article = None
	    clause = None

	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue

	        sec_match = SECTION_RE.match(line)
	        if sec_match:
	            article = {
	                "article": sec_match.group(1),
	                "title": sec_match.group(2),
	                "clauses": [],
	            }
	            articles.append(article)
	            clause = None
	            continue

	        if article is None:
	            # Preamble before the first article heading is not part of any article.
	            continue

	        clause_match = CLAUSE_RE.match(line)
	        if clause_match:
	            clause = {
	                "clause": clause_match.group(1),
	                "text": clause_match.group(2),
	                "points": [],
	            }
	            article["clauses"].append(clause)
	            continue

	        if clause is None:
	            # Body text directly under the article heading (no numbered clause yet).
	            # Keep it on the article instead of dropping it; the key is only created
	            # when such text exists so other articles keep their original shape.
	            article["text"] = f"{article['text']} {line}" if "text" in article else line
	            continue

	        point_match = POINT_RE.match(line)
	        if point_match:
	            clause["points"].append({
	                "point": point_match.group(1),
	                "text": point_match.group(2),
	            })
	        elif clause["points"]:
	            # Wrapped line belonging to the most recent point.
	            clause["points"][-1]["text"] += " " + line
	        else:
	            # Wrapped line belonging to the clause itself.
	            clause["text"] += " " + line

	    return articles


	def _build_chunks(articles):
	    """Flatten the nested structure into one text chunk per leaf (point, clause or article).

	    Every chunk starts with the article header (plus the article's body text, if
	    any) so each chunk carries its own context.
	    """
	    chunks = []
	    for article in articles:
	        article_header = f"{article['article']}. {article['title']}"
	        if article.get("text"):
	            article_header = f"{article_header}\n{article['text']}"

	        clauses = article.get("clauses", [])
	        if not clauses:
	            chunks.append(article_header)
	            continue

	        for clause in clauses:
	            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
	            points = clause.get("points", [])
	            if not points:
	                chunks.append(f"{article_header}\n{clause_header}")
	                continue
	            chunks.extend(
	                f"{article_header}\n{clause_header}\n{point['point']}) {point['text']}"
	                for point in points
	            )
	    return chunks


	def _save_structure(articles):
	    """Write the nested structure to PUBLIC_CHUNK_JSON_PATH; failures are logged, not raised."""
	    try:
	        directory = os.path.dirname(PUBLIC_CHUNK_JSON_PATH)
	        if directory:
	            # os.makedirs("") raises, so only create a directory when the path has one.
	            os.makedirs(directory, exist_ok=True)
	        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
	            json.dump(articles, f, ensure_ascii=False, indent=2)
	        logging.info("✅ Đã ghi cấu trúc nested vào %s", PUBLIC_CHUNK_JSON_PATH)
	        if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
	            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
	        else:
	            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
	    except Exception as e:
	        # Deliberately best-effort: a failed dump must not prevent returning the chunks.
	        logging.error("❌ Lỗi khi ghi file JSON: %s", e)


	def chunk_legal_text(text):
	    """Split Vietnamese legal text into retrieval chunks.

	    The text is parsed into articles ("Điều N"), numbered clauses and lettered
	    points. One chunk is produced per point; a clause without points yields one
	    chunk, and an article without clauses yields one chunk. Each chunk is prefixed
	    with its article header and, where applicable, its clause header.

	    As a side effect the nested structure is written as JSON to
	    PUBLIC_CHUNK_JSON_PATH; write errors are logged and otherwise ignored.

	    Args:
	        text: The full legal text. ``None`` or an empty string yields no chunks.

	    Returns:
	        A list of chunk strings in document order.
	    """
	    logging.info("📑 Bắt đầu chunk văn bản luật...")
	    articles = _parse_articles(text or "")
	    logging.info("🔎 Đã phân tích được %d điều luật", len(articles))

	    chunks = _build_chunks(articles)
	    _save_structure(articles)
	    return chunks