import json
import logging
import os
import re

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Article heading: "Điều <number>" with an optional one-letter suffix
# (e.g. "Điều 217a"), an optional dot, then the article title.
SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Za-zđĐ]?)\.?\s*(.*)")
# Clause: a number, an optional dot, mandatory whitespace, then the text.
CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")
# Point: a single letter followed by ")". "đ" is listed explicitly because it
# is a regular point label in Vietnamese texts and is not part of [a-zA-Z].
POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"


def _parse_articles(text):
    """Parse *text* line by line into a nested article/clause/point list.

    Each article is ``{"article", "title", "clauses"}``, each clause is
    ``{"clause", "text", "points"}`` and each point is ``{"point", "text"}``.
    Lines that appear before the first article heading are ignored.
    """
    articles = []
    article = None
    clause = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        section = SECTION_RE.match(line)
        if section:
            article = {
                "article": section.group(1),
                "title": section.group(2),
                "clauses": [],
            }
            articles.append(article)
            clause = None
            continue

        if article is None:
            # Preamble before the first article heading: nothing to attach to.
            continue

        clause_match = CLAUSE_RE.match(line)
        if clause_match:
            clause = {
                "clause": clause_match.group(1),
                "text": clause_match.group(2),
                "points": [],
            }
            article["clauses"].append(clause)
            continue

        if clause is None:
            # Text between the heading and the first clause (or the whole body
            # of an article without numbered clauses). Fold it into the title
            # so it is kept in both the JSON structure and the chunk header
            # instead of being dropped.
            article["title"] = f"{article['title']} {line}" if article["title"] else line
            continue

        point_match = POINT_RE.match(line)
        if point_match:
            clause["points"].append({
                "point": point_match.group(1),
                "text": point_match.group(2),
            })
        elif clause["points"]:
            # Continuation of the most recent point.
            clause["points"][-1]["text"] += " " + line
        else:
            # Continuation of the clause text itself.
            clause["text"] += " " + line

    return articles


def _build_chunks(articles):
    """Flatten parsed articles into one text chunk per leaf item.

    A leaf is a point, a clause without points, or an article without
    clauses. Every chunk is prefixed with its article header and, where
    applicable, its clause header.
    """
    chunks = []
    for article in articles:
        article_header = f"{article['article']}. {article['title']}"
        clauses = article.get("clauses")
        if not clauses:
            chunks.append(article_header)
            continue

        for clause in clauses:
            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
            prefix = f"{article_header}\n{clause_header}"
            points = clause.get("points")
            if not points:
                chunks.append(prefix)
                continue
            chunks.extend(
                f"{prefix}\n{point['point']}) {point['text']}" for point in points
            )
    return chunks


def _write_structure(articles):
    """Dump *articles* as JSON to ``PUBLIC_CHUNK_JSON_PATH`` (best effort).

    Any failure is logged and swallowed so that chunking still returns a
    result when the file cannot be written.
    """
    try:
        directory = os.path.dirname(PUBLIC_CHUNK_JSON_PATH)
        if directory:
            # os.makedirs("") raises, so only create a directory when the
            # configured path actually has one.
            os.makedirs(directory, exist_ok=True)
        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        logging.info("✅ Đã ghi cấu trúc nested vào %s", PUBLIC_CHUNK_JSON_PATH)

        if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
        else:
            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
    except Exception as e:  # deliberate best-effort boundary: log, never raise
        logging.error("❌ Lỗi khi ghi file JSON: %s", e)


def chunk_legal_text(text: str) -> list:
    """Split a Vietnamese legal text into article/clause/point chunks.

    The text is parsed line by line into articles ("Điều N"), numbered
    clauses and lettered points. Lines that match none of these patterns are
    treated as continuations of the most recent point, clause or article
    title. The nested structure is written to ``PUBLIC_CHUNK_JSON_PATH`` as
    a side effect; write errors are logged, not raised.

    Args:
        text: Full text of the legal document.

    Returns:
        A list of strings, one per leaf item, each prefixed with the header
        of its article and (where present) its clause. Empty when no article
        heading is found.
    """
    logging.info("📑 Bắt đầu chunk văn bản luật...")
    articles = _parse_articles(text)
    logging.info("🔎 Đã phân tích được %d điều luật", len(articles))

    chunks = _build_chunks(articles)
    _write_structure(articles)
    return chunks