"""Extract per-article records from amendment JSON dumps.

Reads ``amendment_*.json`` files from a data directory, splits each
amendment's "changes" entries into individual numbered articles (or a
generic fragment when no article numbers are found) and writes one JSON
document per article into ``<output_dir>/amendment_NNN/``.
"""

import glob
import json
import os
import re
from typing import Any, Dict, List, Optional

# Matches article headings such as "21. " or "300A. " at the start of a line.
# Compiled once at module level instead of re-parsing per change.
_ARTICLE_PATTERN = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _write_json(path: str, obj: Dict[str, Any]) -> None:
    """Serialize *obj* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _folder_name(amendment_num: Any) -> str:
    """Return the output folder name for an amendment.

    BUG FIX: the original applied ``:03d`` unconditionally, which raises
    TypeError when "amendment_number" is absent and the string default
    "unknown" is used.  Zero-pad only genuine integer ids.
    """
    if isinstance(amendment_num, int):
        return f"amendment_{amendment_num:03d}"
    return f"amendment_{amendment_num}"


def _process_change(amendment_folder: str, idx: int, change: Dict[str, Any],
                    amendment_num: Any, base_metadata: Dict[str, Any]) -> None:
    """Split one "changes" entry into article files, or one fragment file.

    Changes with no (or only blank) added lines are skipped entirely.
    """
    added_lines = change.get("added", [])
    if not added_lines:
        return
    change_text = "\n".join(line.strip() for line in added_lines if line.strip())
    if not change_text:
        return

    file_ref = change.get("file", f"change_{idx}")
    affected_part = file_ref.replace(".txt", "")  # e.g. PART05

    matches = list(_ARTICLE_PATTERN.finditer(change_text))
    if matches:
        # Distinct numbered articles found: one output file per article.
        for i, match in enumerate(matches):
            article_num = match.group(1)
            start_idx = match.start()
            end_idx = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
            article_content = change_text[start_idx:end_idx].strip()
            safe_art_num = article_num.replace(".", "_")  # safety for filenames
            article_obj = {
                "file_id": f"am_{amendment_num}_art_{safe_art_num}",
                "content": article_content,
                "metadata": {
                    **base_metadata,
                    "type": "article_change",
                    "affected_part": affected_part,
                    "article_number": article_num,
                    "source_file": file_ref,
                },
            }
            # Save as e.g. PART03_art_14.json to avoid collisions with Schedule items.
            out_name = f"{affected_part}_art_{safe_art_num}.json"
            _write_json(os.path.join(amendment_folder, out_name), article_obj)
    else:
        # No specific articles found: save the whole change as a generic fragment.
        clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
        fragment_obj = {
            "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
            "content": change_text,
            "metadata": {
                **base_metadata,
                "type": "article_change",
                "affected_part": affected_part,
                "source_file": file_ref,
            },
        }
        out_name = f"fragment_{clean_ref}_{idx}.json"
        _write_json(os.path.join(amendment_folder, out_name), fragment_obj)


def extract_articles_from_amendments(data_dir: Optional[str] = None,
                                     output_dir: Optional[str] = None) -> None:
    """Extract articles from every amendment file in *data_dir*.

    Args:
        data_dir: Directory containing ``amendment_*.json`` files.
            Defaults to ``<repo root>/amendments_data``, where repo root is
            the parent of this file's directory (original behavior).
        output_dir: Destination root; one subfolder per amendment is
            created. Defaults to ``<repo root>/extracted_data``.
    """
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if data_dir is None:
        data_dir = os.path.join(root_dir, "amendments_data")
    if output_dir is None:
        output_dir = os.path.join(root_dir, "extracted_data")

    json_files: List[str] = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        amendment_num = data.get("amendment_number", "unknown")
        date_str = data.get("date", "")
        # NOTE: int() still raises ValueError for a non-numeric year prefix
        # (unchanged from the original; dates are assumed ISO-like).
        year = int(date_str[:4]) if date_str and len(date_str) >= 4 else None
        global_desc = data.get("description", "")
        base_metadata: Dict[str, Any] = {
            "amendment_number": amendment_num,
            "year": year,
            "date": date_str,
            "amendment_title": data.get("title", ""),
            "amendment_description": global_desc,
        }

        # Output folder, e.g. extracted_data/amendment_000/
        amendment_folder = os.path.join(output_dir, _folder_name(amendment_num))
        os.makedirs(amendment_folder, exist_ok=True)

        # --- A. Save the amendment-level summary/description, if any ---
        if global_desc:
            summary_obj = {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            }
            _write_json(os.path.join(amendment_folder, "summary.json"), summary_obj)

        # --- B. Process changes (extract articles) ---
        for idx, change in enumerate(data.get("changes", [])):
            _process_change(amendment_folder, idx, change, amendment_num, base_metadata)

    print(f"\nExtraction complete! Check '{output_dir}'.")


if __name__ == "__main__":
    extract_articles_from_amendments()