# ConstitutionAgent / data_tools / extract_articles.py
# Uploaded by Meshyboi — "Upload 53 files" (revision 0cd3dc5, verified)
import os
import json
import glob
import re
from typing import Dict, Any, List
# Article headings such as "21. " or "300A. " at the start of a line.
_ARTICLE_RE = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _write_json(path, obj):
    """Serialize *obj* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _amendment_folder_name(amendment_num):
    """Return the output folder name for an amendment.

    Numeric identifiers are zero-padded to three digits ("amendment_001").
    Non-numeric identifiers — notably the "unknown" fallback used when the
    input JSON lacks an ``amendment_number`` — are used verbatim; applying
    the ``:03d`` format spec to them would raise ``ValueError``.
    """
    try:
        return f"amendment_{int(amendment_num):03d}"
    except (TypeError, ValueError):
        return f"amendment_{amendment_num}"


def _process_change(change, idx, base_metadata, amendment_num, amendment_folder):
    """Extract articles (or a generic fragment) from one ``changes`` entry
    and write each as a JSON document under *amendment_folder*."""
    added_lines = change.get("added", [])
    if not added_lines:
        return
    change_text = "\n".join(line.strip() for line in added_lines if line.strip())
    if not change_text:
        return

    file_ref = change.get("file", f"change_{idx}")
    affected_part = file_ref.replace(".txt", "")  # e.g. "PART05.txt" -> "PART05"

    matches = list(_ARTICLE_RE.finditer(change_text))
    if matches:
        # Distinct article headings found: slice the text between headings.
        for i, match in enumerate(matches):
            article_num = match.group(1)
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
            article_obj = {
                # The regex group is digits plus an optional capital letter,
                # so article_num is already filesystem-safe.
                "file_id": f"am_{amendment_num}_art_{article_num}",
                "content": change_text[start:end].strip(),
                "metadata": {
                    **base_metadata,
                    "type": "article_change",
                    "affected_part": affected_part,
                    "article_number": article_num,
                    "source_file": file_ref,
                },
            }
            # Prefix with the part name (e.g. PART03_art_14.json) to avoid
            # collisions with Schedule items reusing the same article number.
            out_name = f"{affected_part}_art_{article_num}.json"
            _write_json(os.path.join(amendment_folder, out_name), article_obj)
    else:
        # No article headings: save the whole change as a generic fragment.
        clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
        fragment_obj = {
            "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
            "content": change_text,
            "metadata": {
                **base_metadata,
                "type": "article_change",
                "affected_part": affected_part,
                "source_file": file_ref,
            },
        }
        _write_json(
            os.path.join(amendment_folder, f"fragment_{clean_ref}_{idx}.json"),
            fragment_obj,
        )


def extract_articles_from_amendments(data_dir=None, output_dir=None):
    """Split raw amendment JSON files into per-article JSON documents.

    Reads every ``amendment_*.json`` in *data_dir* and writes, per amendment,
    a folder under *output_dir* containing an optional ``summary.json`` plus
    one document per extracted article or change fragment.

    Parameters
    ----------
    data_dir : str | None
        Input directory. Defaults to ``<repo root>/amendments_data``.
    output_dir : str | None
        Output root. Defaults to ``<repo root>/extracted_data``.
    """
    # Default paths are resolved relative to the repository root (the
    # parent of the directory containing this script).
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if data_dir is None:
        data_dir = os.path.join(root_dir, "amendments_data")
    if output_dir is None:
        output_dir = os.path.join(root_dir, "extracted_data")

    json_files = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        amendment_num = data.get("amendment_number", "unknown")
        date_str = data.get("date", "")
        # Year comes from an ISO-style "YYYY-..." prefix; None when absent.
        year = int(date_str[:4]) if date_str and len(date_str) >= 4 else None
        global_desc = data.get("description", "")
        base_metadata = {
            "amendment_number": amendment_num,
            "year": year,
            "date": date_str,
            "amendment_title": data.get("title", ""),
            "amendment_description": global_desc,
        }

        # Output folder, e.g. extracted_data/amendment_001/
        amendment_folder = os.path.join(
            output_dir, _amendment_folder_name(amendment_num)
        )
        os.makedirs(amendment_folder, exist_ok=True)

        # --- A. Save the amendment-level summary, when present ---
        if global_desc:
            summary_obj = {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            }
            _write_json(os.path.join(amendment_folder, "summary.json"), summary_obj)

        # --- B. Process changes (extract articles) ---
        for idx, change in enumerate(data.get("changes", [])):
            _process_change(change, idx, base_metadata, amendment_num, amendment_folder)

    print(f"\nExtraction complete! Check '{output_dir}'.")
# Allow importing this module without side effects; run only as a script.
if __name__ == "__main__":
    extract_articles_from_amendments()