# Amendment article extraction script: splits raw amendment JSON files
# into per-article JSON documents for downstream indexing.
| import os | |
| import json | |
| import glob | |
| import re | |
| from typing import Dict, Any, List | |
# Matches article headings such as "21. " or "300A. " at the start of a line.
# Compiled once at module level instead of inside the per-change loop.
_ARTICLE_HEADING_RE = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _write_json(path: str, obj: Dict[str, Any]) -> None:
    """Serialize *obj* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _folder_name(amendment_num: Any) -> str:
    """Return the output folder name for an amendment.

    Zero-pads integer numbers ("amendment_007").  The original code
    applied ':03d' unconditionally, which raised ValueError whenever
    "amendment_number" was missing (its default is the string "unknown")
    or otherwise not an int; non-ints now fall back to plain str form.
    """
    if isinstance(amendment_num, int):
        return f"amendment_{amendment_num:03d}"
    return f"amendment_{amendment_num}"


def _parse_year(date_str: str) -> Any:
    """Best-effort extraction of a 4-digit year from an ISO-like date.

    Returns None when the string is too short or its prefix is not
    numeric (the original int() call could raise ValueError here).
    """
    if date_str and len(date_str) >= 4:
        try:
            return int(date_str[:4])
        except ValueError:
            return None
    return None


def _save_articles(folder: str, amendment_num: Any, base_metadata: Dict[str, Any],
                   affected_part: str, file_ref: str, change_text: str,
                   matches: List[Any]) -> None:
    """Write one JSON document per article heading found in *change_text*."""
    for i, match in enumerate(matches):
        article_num = match.group(1)
        start = match.start()
        # Each article runs up to the next heading (or end of text).
        end = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
        content = change_text[start:end].strip()
        safe_art_num = article_num.replace(".", "_")  # defensive; pattern yields no dots
        obj = {
            "file_id": f"am_{amendment_num}_art_{safe_art_num}",
            "content": content,
            "metadata": {
                **base_metadata,
                "type": "article_change",
                "affected_part": affected_part,
                "article_number": article_num,
                "source_file": file_ref,
            },
        }
        # Prefix with the part name (e.g. PART03_art_14.json) to avoid
        # collisions with Schedule items that reuse article numbers.
        _write_json(os.path.join(folder, f"{affected_part}_art_{safe_art_num}.json"), obj)


def _save_fragment(folder: str, amendment_num: Any, base_metadata: Dict[str, Any],
                   affected_part: str, file_ref: str, change_text: str,
                   idx: int) -> None:
    """Write a generic part fragment when no article headings are present."""
    clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
    obj = {
        "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
        "content": change_text,
        "metadata": {
            **base_metadata,
            "type": "article_change",
            "affected_part": affected_part,
            "source_file": file_ref,
        },
    }
    _write_json(os.path.join(folder, f"fragment_{clean_ref}_{idx}.json"), obj)


def _process_amendment_file(file_path: str, output_dir: str) -> None:
    """Extract the summary plus article/fragment documents from one raw file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    amendment_num = data.get("amendment_number", "unknown")
    date_str = data.get("date", "")
    global_desc = data.get("description", "")

    # Metadata shared by every document produced from this amendment.
    base_metadata = {
        "amendment_number": amendment_num,
        "year": _parse_year(date_str),
        "date": date_str,
        "amendment_title": data.get("title", ""),
        "amendment_description": global_desc,
    }

    # Output folder, e.g. extracted_data/amendment_007/
    amendment_folder = os.path.join(output_dir, _folder_name(amendment_num))
    os.makedirs(amendment_folder, exist_ok=True)

    # --- A. Save summary/description ---
    if global_desc:
        _write_json(
            os.path.join(amendment_folder, "summary.json"),
            {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            },
        )

    # --- B. Process changes (extract articles) ---
    for idx, change in enumerate(data.get("changes", [])):
        added_lines = change.get("added", [])
        if not added_lines:
            continue
        change_text = "\n".join(line.strip() for line in added_lines if line.strip())
        if not change_text:
            continue

        file_ref = change.get("file", f"change_{idx}")
        affected_part = file_ref.replace(".txt", "")  # e.g. PART05

        matches = list(_ARTICLE_HEADING_RE.finditer(change_text))
        if matches:
            _save_articles(amendment_folder, amendment_num, base_metadata,
                           affected_part, file_ref, change_text, matches)
        else:
            _save_fragment(amendment_folder, amendment_num, base_metadata,
                           affected_part, file_ref, change_text, idx)


def extract_articles_from_amendments(data_dir: str = None,
                                     output_dir: str = None) -> None:
    """Split raw amendment JSON files into per-article JSON documents.

    Reads every ``amendment_*.json`` under *data_dir* and writes, per
    amendment, a ``summary.json`` plus one file per extracted article
    (or a generic fragment when no numbered article headings are found)
    into ``output_dir/amendment_NNN/``.

    Args:
        data_dir: Directory holding the raw ``amendment_*.json`` files.
            Defaults to ``<repo root>/amendments_data`` (repo root is
            two levels above this file, as in the original script).
        output_dir: Destination root for extracted documents.
            Defaults to ``<repo root>/extracted_data``.
    """
    # Resolve defaults only when needed so explicit paths work even in
    # contexts where __file__ is unavailable.
    if data_dir is None or output_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        if data_dir is None:
            data_dir = os.path.join(root_dir, "amendments_data")
        if output_dir is None:
            output_dir = os.path.join(root_dir, "extracted_data")

    json_files = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        _process_amendment_file(file_path, output_dir)

    print(f"\nExtraction complete! Check '{output_dir}'.")
| if __name__ == "__main__": | |
| extract_articles_from_amendments() | |