File size: 3,396 Bytes
ec666c4 5186608 7bc48c0 1853a5c ec666c4 7bc48c0 823b2a9 7bc48c0 3e013fc 7bc48c0 3e013fc 7bc48c0 823b2a9 3e013fc 7bc48c0 823b2a9 7bc48c0 823b2a9 7bc48c0 1853a5c 3e013fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import re
import json
import os
import logging
# Configure the root logger at import time: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Article heading such as "Điều 12. Title" or "Điều 217a. Title".
# Group 1 is the article id ("Điều 12"), group 2 is the rest of the line.
# The optional one-letter suffix accepts lower case and "đ"/"Đ" as well as
# upper case, so inserted articles like "Điều 217a" keep their full id
# instead of being read as "Điều 217" with a title starting "a. ...".
SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Za-zđĐ]?)\.?\s*(.*)")

# Numbered clause such as "1. Text". Group 1 is the number, group 2 the text.
# NOTE(review): the dot is optional, so any line that starts with digits
# followed by whitespace (e.g. "15 ngày ...") also matches -- confirm that
# this is acceptable for the input documents.
CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")

# Lettered point such as "a) Text". Group 1 is the letter, group 2 the text.
# "đ"/"Đ" is listed explicitly because it is not in the ASCII range a-z but is
# used as a point label (between "d)" and "e)") in Vietnamese legal texts.
POINT_RE = re.compile(r"^\s*([a-zA-ZđĐ])\)\s+(.*)")

# Location of the JSON dump of the parsed article/clause/point structure.
PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"
def _parse_articles(text):
    """Parse raw legal text into a nested list of article dicts."""
    articles = []
    current_article = None
    current_clause = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        sec_match = SECTION_RE.match(line)
        if sec_match:
            current_article = {
                "article": sec_match.group(1),
                "title": sec_match.group(2),
                "text": "",
                "clauses": [],
            }
            articles.append(current_article)
            current_clause = None
            continue

        if current_article is None:
            # Anything before the first article heading is ignored.
            continue

        clause_match = CLAUSE_RE.match(line)
        if clause_match:
            current_clause = {
                "clause": clause_match.group(1),
                "text": clause_match.group(2),
                "points": [],
            }
            current_article["clauses"].append(current_clause)
            continue

        if current_clause is None:
            # Body or lead-in text of an article that has no clause yet.
            # Keep it on the article instead of silently discarding it.
            current_article["text"] = f"{current_article['text']} {line}".strip()
            continue

        point_match = POINT_RE.match(line)
        if point_match:
            current_clause["points"].append({
                "point": point_match.group(1),
                "text": point_match.group(2),
            })
            continue

        # Continuation line: extend the most recent point if the clause has
        # one, otherwise extend the clause text itself.
        points = current_clause["points"]
        target = points[-1] if points else current_clause
        target["text"] += " " + line

    return articles


def _build_chunks(articles):
    """Flatten parsed articles into one text chunk per leaf (article/clause/point)."""
    chunks = []
    for article in articles:
        article_header = f"{article['article']}. {article['title']}"
        if article.get("text"):
            article_header += "\n" + article["text"]

        clauses = article.get("clauses", [])
        if not clauses:
            chunks.append(article_header)
            continue

        for clause in clauses:
            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
            context = f"{article_header}\n{clause_header}"
            points = clause.get("points", [])
            if not points:
                chunks.append(context)
                continue
            chunks.extend(
                f"{context}\n{point['point']}) {point['text']}" for point in points
            )
    return chunks


def _dump_structure(articles, path):
    """Best-effort dump of the nested structure to *path* as UTF-8 JSON."""
    try:
        parent = os.path.dirname(path)
        if parent:
            # os.makedirs("") raises, so only create a directory when the
            # path actually has a directory component.
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        logging.info(f"✅ Đã ghi cấu trúc nested vào {path}")
        if os.path.exists(path):
            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
        else:
            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
    except Exception as e:
        # Writing the dump is a side output; a failure is logged and must not
        # prevent the chunks from being returned.
        logging.error(f"❌ Lỗi khi ghi file JSON: {e}")


def chunk_legal_text(text):
    """Split Vietnamese legal text into retrieval chunks.

    Lines are classified with SECTION_RE (article heading), CLAUSE_RE
    (numbered clause) and POINT_RE (lettered point). Any other non-blank line
    is a continuation: it is appended to the latest point, else to the latest
    clause, else (when the article has no clause yet) to the article's own
    "text" field. Lines before the first article heading are ignored.

    One chunk is produced per leaf of the article/clause/point tree. Each
    chunk starts with the article header ("<article>. <title>", followed by
    the article text on a new line when present), then the clause line
    ("<article>.<clause>: <text>") and the point line ("<letter>) <text>")
    where applicable.

    As a side effect the nested structure is written as JSON to
    PUBLIC_CHUNK_JSON_PATH. Failures while writing are logged, not raised.

    Args:
        text: Full text of the legal document.

    Returns:
        A list of chunk strings in document order.
    """
    logging.info("📑 Bắt đầu chunk văn bản luật...")
    articles = _parse_articles(text)
    logging.info(f"🔎 Đã phân tích được {len(articles)} điều luật")

    chunks = _build_chunks(articles)
    _dump_structure(articles, PUBLIC_CHUNK_JSON_PATH)
    return chunks
|