# ConstitutionAgent / data_tools / extract_articles.py
# Uploaded by Meshyboi — "Upload 53 files" (revision 0cd3dc5, verified)
import os
import json
import glob
import re
from typing import Dict, Any, List
# Article headings such as "21. " or "300A. " at the start of a line.
_ARTICLE_RE = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _write_json(path, obj):
    """Serialize *obj* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _amendment_folder_name(amendment_num):
    """Return the output folder name for an amendment.

    Numeric identifiers are zero-padded to three digits ("amendment_001").
    Non-numeric identifiers — notably the "unknown" fallback used when the
    input JSON lacks an ``amendment_number`` — are used verbatim; applying
    the ``:03d`` format spec to them would raise ``ValueError``.
    """
    try:
        return f"amendment_{int(amendment_num):03d}"
    except (TypeError, ValueError):
        return f"amendment_{amendment_num}"


def _process_change(change, idx, base_metadata, amendment_num, amendment_folder):
    """Extract articles (or a generic fragment) from one ``changes`` entry
    and write each as a JSON document under *amendment_folder*."""
    added_lines = change.get("added", [])
    if not added_lines:
        return
    change_text = "\n".join(line.strip() for line in added_lines if line.strip())
    if not change_text:
        return

    file_ref = change.get("file", f"change_{idx}")
    affected_part = file_ref.replace(".txt", "")  # e.g. "PART05.txt" -> "PART05"

    matches = list(_ARTICLE_RE.finditer(change_text))
    if matches:
        # Distinct article headings found: slice the text between headings.
        for i, match in enumerate(matches):
            article_num = match.group(1)
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
            article_obj = {
                # The regex group is digits plus an optional capital letter,
                # so article_num is already filesystem-safe.
                "file_id": f"am_{amendment_num}_art_{article_num}",
                "content": change_text[start:end].strip(),
                "metadata": {
                    **base_metadata,
                    "type": "article_change",
                    "affected_part": affected_part,
                    "article_number": article_num,
                    "source_file": file_ref,
                },
            }
            # Prefix with the part name (e.g. PART03_art_14.json) to avoid
            # collisions with Schedule items reusing the same article number.
            out_name = f"{affected_part}_art_{article_num}.json"
            _write_json(os.path.join(amendment_folder, out_name), article_obj)
    else:
        # No article headings: save the whole change as a generic fragment.
        clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
        fragment_obj = {
            "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
            "content": change_text,
            "metadata": {
                **base_metadata,
                "type": "article_change",
                "affected_part": affected_part,
                "source_file": file_ref,
            },
        }
        _write_json(
            os.path.join(amendment_folder, f"fragment_{clean_ref}_{idx}.json"),
            fragment_obj,
        )


def extract_articles_from_amendments(data_dir=None, output_dir=None):
    """Split raw amendment JSON files into per-article JSON documents.

    Reads every ``amendment_*.json`` in *data_dir* and writes, per amendment,
    a folder under *output_dir* containing an optional ``summary.json`` plus
    one document per extracted article or change fragment.

    Parameters
    ----------
    data_dir : str | None
        Input directory. Defaults to ``<repo root>/amendments_data``.
    output_dir : str | None
        Output root. Defaults to ``<repo root>/extracted_data``.
    """
    # Default paths are resolved relative to the repository root (the
    # parent of the directory containing this script).
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if data_dir is None:
        data_dir = os.path.join(root_dir, "amendments_data")
    if output_dir is None:
        output_dir = os.path.join(root_dir, "extracted_data")

    json_files = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        amendment_num = data.get("amendment_number", "unknown")
        date_str = data.get("date", "")
        # Year comes from an ISO-style "YYYY-..." prefix; None when absent.
        year = int(date_str[:4]) if date_str and len(date_str) >= 4 else None
        global_desc = data.get("description", "")
        base_metadata = {
            "amendment_number": amendment_num,
            "year": year,
            "date": date_str,
            "amendment_title": data.get("title", ""),
            "amendment_description": global_desc,
        }

        # Output folder, e.g. extracted_data/amendment_001/
        amendment_folder = os.path.join(
            output_dir, _amendment_folder_name(amendment_num)
        )
        os.makedirs(amendment_folder, exist_ok=True)

        # --- A. Save the amendment-level summary, when present ---
        if global_desc:
            summary_obj = {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            }
            _write_json(os.path.join(amendment_folder, "summary.json"), summary_obj)

        # --- B. Process changes (extract articles) ---
        for idx, change in enumerate(data.get("changes", [])):
            _process_change(change, idx, base_metadata, amendment_num, amendment_folder)

    print(f"\nExtraction complete! Check '{output_dir}'.")
# Allow importing this module without side effects; run only as a script.
if __name__ == "__main__":
    extract_articles_from_amendments()