# Spaces: VietCat / RAGSample (Sleeping) - File size: 3,396 Bytes

import re
import json
import os
import logging

# Configure the root logger once at import time: timestamp, level, message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Article heading such as "Điều 12. <title>" or "Điều 217a. <title>".
# group(1) is the article label, group(2) is the remainder of the line (title).
# The optional suffix letter accepts both cases so that an inserted article
# like "Điều 217a" keeps its full label instead of leaking "a." into the title.
SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Za-z]?)\.?\s*(.*)")
# Numbered clause such as "1. <text>": group(1) is the number, group(2) the text.
CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")
# Lettered point such as "a) <text>": group(1) is the letter, group(2) the text.
# "đ"/"Đ" is listed explicitly because it is not in the ASCII range a-z, yet it
# is a regular point label in Vietnamese legal texts (…, d, đ, e, …). Without
# it, a "đ)" line would be glued onto the preceding point as continuation text.
POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

# Output path for the nested article/clause/point structure dumped as JSON.
PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"


def _parse_articles(text):
    """Parse raw legal text into a nested list of article dictionaries.

    Each article is ``{"article", "title", "text", "clauses"}``; each clause is
    ``{"clause", "text", "points"}``; each point is ``{"point", "text"}``.
    ``text`` on the article holds body lines that appear before the first
    numbered clause (or the whole body when the article has no clauses).
    """
    articles = []
    current_article = None
    current_clause = None

    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        sec_match = SECTION_RE.match(line)
        if sec_match:
            current_article = {
                "article": sec_match.group(1),
                "title": sec_match.group(2),
                "text": "",
                "clauses": [],
            }
            articles.append(current_article)
            current_clause = None
            continue

        if current_article is None:
            # Preamble before the first article heading has nothing to attach to.
            continue

        clause_match = CLAUSE_RE.match(line)
        if clause_match:
            current_clause = {
                "clause": clause_match.group(1),
                "text": clause_match.group(2),
                "points": [],
            }
            current_article["clauses"].append(current_clause)
            continue

        if current_clause is None:
            # Body text that precedes any numbered clause. Keeping it on the
            # article avoids silently losing the content of clause-less
            # articles and of introductory sentences.
            existing = current_article["text"]
            current_article["text"] = f"{existing} {line}" if existing else line
            continue

        point_match = POINT_RE.match(line)
        if point_match:
            current_clause["points"].append({
                "point": point_match.group(1),
                "text": point_match.group(2),
            })
        elif current_clause["points"]:
            # Continuation of the most recent point.
            current_clause["points"][-1]["text"] += " " + line
        else:
            # Continuation of the clause text itself.
            current_clause["text"] += " " + line

    return articles


def _build_chunks(articles):
    """Flatten parsed articles into text chunks, one per leaf (point/clause/article).

    Every chunk carries the text of its ancestors (article header, then clause
    line), so each chunk can be read on its own.
    """
    chunks = []
    for article in articles:
        article_header = f"{article['article']}. {article['title']}"
        if article.get("text"):
            # Article-level body text is context for everything below it.
            article_header = f"{article_header}\n{article['text']}"

        clauses = article.get("clauses", [])
        if not clauses:
            chunks.append(article_header)
            continue

        for clause in clauses:
            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
            points = clause.get("points", [])
            if not points:
                chunks.append(f"{article_header}\n{clause_header}")
                continue
            chunks.extend(
                f"{article_header}\n{clause_header}\n{point['point']}) {point['text']}"
                for point in points
            )
    return chunks


def _write_structure(articles):
    """Best-effort dump of the nested structure to PUBLIC_CHUNK_JSON_PATH.

    Failures are logged and swallowed on purpose: the caller still gets its
    chunks even when the JSON side file cannot be written.
    """
    try:
        directory = os.path.dirname(PUBLIC_CHUNK_JSON_PATH)
        if directory:
            # os.makedirs("") raises, so only create a directory when the
            # configured path actually has one.
            os.makedirs(directory, exist_ok=True)
        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
    except (OSError, ValueError) as e:
        # OSError: filesystem problems; ValueError: e.g. un-encodable characters.
        logging.error(f"❌ Lỗi khi ghi file JSON: {e}")
        return

    logging.info(f"✅ Đã ghi cấu trúc nested vào {PUBLIC_CHUNK_JSON_PATH}")
    if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
        logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
    else:
        logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")


def chunk_legal_text(text):
    """Split a legal document into article/clause/point chunks.

    Lines are classified with SECTION_RE (article heading), CLAUSE_RE (numbered
    clause) and POINT_RE (lettered point); any other line is treated as a
    continuation of the most recent point, clause, or article body. Text that
    appears before the first article heading is ignored.

    Side effect: the nested structure is written as JSON to
    PUBLIC_CHUNK_JSON_PATH. A failure to write is logged, not raised.

    Args:
        text: Full document text; ``None`` or an empty string yields no chunks.

    Returns:
        A list of strings. Each string is one leaf of the structure (a point, a
        clause without points, or an article without clauses) prefixed with the
        article header and, where applicable, the clause line.
    """
    logging.info("📑 Bắt đầu chunk văn bản luật...")
    articles = _parse_articles(text)
    logging.info(f"🔎 Đã phân tích được {len(articles)} điều luật")

    chunks = _build_chunks(articles)
    _write_structure(articles)
    return chunks