| import re | |
| import json | |
| import os | |
| import logging | |
# Configure the root logger once at import time: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Article heading, e.g. "Điều 12. Tên điều" or "Điều 217a. Tên điều".
#   group(1) -> the label ("Điều 12", "Điều 217a"); group(2) -> the title.
# The optional one-letter suffix accepts both cases; with an uppercase-only
# class, "Điều 217a. X" was parsed as label "Điều 217" with title "a. X".
SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Za-z]?)\.?\s*(.*)")

# Numbered clause, e.g. "1. Nội dung".
#   group(1) -> the clause number; group(2) -> the clause text.
# NOTE: the dot is optional, so any line that begins with digits followed by
# whitespace is treated as a clause.
CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")

# Lettered point, e.g. "a) Nội dung".
#   group(1) -> the point letter; group(2) -> the point text.
# "đ"/"Đ" is listed explicitly because it is not in the ASCII range [a-zA-Z];
# without it a "đ)" line is never recognised as a point.
POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

# Destination of the nested article/clause/point structure dumped as JSON.
PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"
def _parse_articles(text):
    """Parse raw legal text into a nested list of article dicts.

    Each article is ``{"article", "title", "text", "clauses"}``; each clause is
    ``{"clause", "text", "points"}``; each point is ``{"point", "text"}``.
    """
    articles = []
    article = None  # article currently being filled, if any
    clause = None   # clause currently being filled, if any

    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        heading = SECTION_RE.match(line)
        if heading:
            article = {
                "article": heading.group(1),
                "title": heading.group(2),
                "text": "",
                "clauses": [],
            }
            # Appending right away is safe: later lines mutate this same dict.
            articles.append(article)
            clause = None
            continue

        if article is None:
            # Preamble before the first article heading is ignored.
            continue

        numbered = CLAUSE_RE.match(line)
        if numbered:
            clause = {
                "clause": numbered.group(1),
                "text": numbered.group(2),
                "points": [],
            }
            article["clauses"].append(clause)
            continue

        if clause is None:
            # Body text that sits directly under the heading, before any
            # numbered clause. Keep it on the article instead of dropping it.
            article["text"] = f"{article['text']} {line}" if article["text"] else line
            continue

        lettered = POINT_RE.match(line)
        if lettered:
            clause["points"].append({
                "point": lettered.group(1),
                "text": lettered.group(2),
            })
        elif clause["points"]:
            # Wrapped line: continues the most recent point.
            clause["points"][-1]["text"] += " " + line
        else:
            # Wrapped line: continues the clause text itself.
            clause["text"] += " " + line

    return articles


def _build_chunks(articles):
    """Flatten parsed articles into one text chunk per leaf (point, clause or article)."""
    chunks = []
    for article in articles:
        article_header = f"{article['article']}. {article['title']}"
        clauses = article.get("clauses")
        if not clauses:
            # Clause-less article: emit the heading plus its body text, if any.
            body = article.get("text")
            chunks.append(f"{article_header}\n{body}" if body else article_header)
            continue
        for clause in clauses:
            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
            points = clause.get("points")
            if not points:
                chunks.append(f"{article_header}\n{clause_header}")
                continue
            chunks.extend(
                f"{article_header}\n{clause_header}\n{point['point']}) {point['text']}"
                for point in points
            )
    return chunks


def _write_structure(articles):
    """Best-effort dump of the nested structure to PUBLIC_CHUNK_JSON_PATH."""
    try:
        target_dir = os.path.dirname(PUBLIC_CHUNK_JSON_PATH)
        if target_dir:
            # os.makedirs("") raises, so skip it for a bare file name.
            os.makedirs(target_dir, exist_ok=True)
        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        logging.info(f"✅ Đã ghi cấu trúc nested vào {PUBLIC_CHUNK_JSON_PATH}")
        if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
        else:
            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
    except Exception as e:
        # Deliberately broad: a failed dump must not prevent returning chunks.
        logging.error("❌ Lỗi khi ghi file JSON: %s", e)


def chunk_legal_text(text):
    """Split a legal document into article/clause/point chunks.

    The text is read line by line. Lines matching ``SECTION_RE`` start a new
    article, lines matching ``CLAUSE_RE`` start a clause in the current
    article, and lines matching ``POINT_RE`` start a point in the current
    clause. Any other non-blank line is a continuation of the most recent
    point, else of the current clause, else of the article body.

    Side effect: the nested structure is written as JSON to
    ``PUBLIC_CHUNK_JSON_PATH``. A failure there is logged, not raised.

    Args:
        text: The raw document text. ``None`` is treated as empty.

    Returns:
        A list of strings, one per leaf:
          * article without clauses: ``"<header>"`` or ``"<header>\\n<body>"``
          * clause without points:   ``"<header>\\n<article>.<n>: <text>"``
          * point:                   the clause form plus ``"\\n<letter>) <text>"``
        Article body text that precedes numbered clauses is kept in the JSON
        structure only; it is not added to the clause/point chunks.
    """
    logging.info("📑 Bắt đầu chunk văn bản luật...")
    articles = _parse_articles(text)
    logging.info(f"🔎 Đã phân tích được {len(articles)} điều luật")

    chunks = _build_chunks(articles)
    _write_structure(articles)
    return chunks