# NOTE: stripped file-viewer paste residue that preceded the code
# (a "File size: 5,245 Bytes" header, commit hash 0cd3dc5, and bare
# line numbers 1-126) — it was not part of the script and broke parsing.
import os
import json
import glob
import re
from typing import Dict, Any, List

# Matches article headings like "21. " or "300A. " at the start of a line.
# Compiled once at module level instead of per change.
_ARTICLE_RE = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _format_amendment_id(amendment_num: Any) -> str:
    """Render an amendment number as a zero-padded 3-digit folder id.

    BUG FIX: the original used f"{amendment_num:03d}" directly, which
    raised ValueError/TypeError whenever "amendment_number" was absent
    (the "unknown" fallback) or stored as a string in the JSON. Fall
    back to the raw value as a string in that case.
    """
    try:
        return f"{int(amendment_num):03d}"
    except (TypeError, ValueError):
        return str(amendment_num)


def _parse_year(date_str: str):
    """Return the leading 4-digit year of *date_str* as int, or None.

    Tolerates empty, short, or non-numeric date strings instead of
    crashing on int() (the original raised ValueError on e.g. "n/a-x").
    """
    if date_str and len(date_str) >= 4:
        try:
            return int(date_str[:4])
        except ValueError:
            return None
    return None


def _write_json(path: str, obj: Dict[str, Any]) -> None:
    """Dump *obj* as pretty-printed UTF-8 JSON to *path*."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _split_into_articles(change_text: str) -> List[Dict[str, str]]:
    """Split *change_text* into per-article chunks on numbered headings.

    Returns a list of {"article_number", "content"} dicts; empty when no
    heading like "21. " or "300A. " is found. Each chunk runs from its
    heading up to the next heading (or end of text), stripped.
    """
    matches = list(_ARTICLE_RE.finditer(change_text))
    articles = []
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
        articles.append({
            "article_number": match.group(1),
            "content": change_text[start:end].strip(),
        })
    return articles


def extract_articles_from_amendments():
    """Explode raw amendment JSON files into one JSON document per article.

    Reads amendments_data/amendment_*.json (relative to the repository
    root, assumed to be the parent of this script's directory) and writes
    extracted_data/amendment_NNN/ containing a summary.json plus one file
    per detected article (or a fragment_*.json when no article headings
    are found in a change).
    """
    # 1. Resolve paths relative to the repository root.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_dir = os.path.join(root_dir, "amendments_data")
    output_dir = os.path.join(root_dir, "extracted_data")

    # 2. Find all amendment files (sorted for deterministic processing order).
    json_files = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        amendment_num = data.get("amendment_number", "unknown")
        date_str = data.get("date", "")
        global_desc = data.get("description", "")

        # Metadata copied into every document emitted for this amendment.
        base_metadata = {
            "amendment_number": amendment_num,
            "year": _parse_year(date_str),
            "date": date_str,
            "amendment_title": data.get("title", ""),
            "amendment_description": global_desc,
        }

        # Output folder: extracted_data/amendment_000/
        amendment_folder = os.path.join(
            output_dir, f"amendment_{_format_amendment_id(amendment_num)}"
        )
        os.makedirs(amendment_folder, exist_ok=True)

        # --- A. Save the amendment-level summary, if any. ---
        if global_desc:
            _write_json(os.path.join(amendment_folder, "summary.json"), {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            })

        # --- B. Process changes (extract articles). ---
        for idx, change in enumerate(data.get("changes", [])):
            added_lines = change.get("added", [])
            if not added_lines:
                continue

            # Join the non-blank added lines into one normalized text block.
            change_text = "\n".join(
                line.strip() for line in added_lines if line.strip()
            )
            if not change_text:
                continue

            file_ref = change.get("file", f"change_{idx}")
            affected_part = file_ref.replace(".txt", "")  # e.g. PART05

            articles = _split_into_articles(change_text)
            if articles:
                # Distinct articles found: one output file per article.
                # (The old `article_num.replace(".", "_")` was dead code —
                # the regex group can never contain a dot.)
                for article in articles:
                    article_num = article["article_number"]
                    # Filename prefixed with the part to avoid collisions
                    # with Schedule items: e.g. PART03_art_14.json
                    _write_json(
                        os.path.join(
                            amendment_folder,
                            f"{affected_part}_art_{article_num}.json",
                        ),
                        {
                            "file_id": f"am_{amendment_num}_art_{article_num}",
                            "content": article["content"],
                            "metadata": {
                                **base_metadata,
                                "type": "article_change",
                                "affected_part": affected_part,
                                "article_number": article_num,
                                "source_file": file_ref,
                            },
                        },
                    )
            else:
                # No article headings: save the change as a generic fragment.
                clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
                _write_json(
                    os.path.join(
                        amendment_folder, f"fragment_{clean_ref}_{idx}.json"
                    ),
                    {
                        "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
                        "content": change_text,
                        "metadata": {
                            **base_metadata,
                            "type": "article_change",
                            "affected_part": affected_part,
                            "source_file": file_ref,
                        },
                    },
                )

    print(f"\nExtraction complete! Check '{output_dir}'.")

# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    extract_articles_from_amendments()