# NOTE: stripped file-viewer paste residue that preceded the code
# (a "File size: 5,245 Bytes" header, commit hash 0cd3dc5, and bare
# line numbers 1-126) — it was not part of the script and broke parsing.
import os
import json
import glob
import re
from typing import Dict, Any, List

# Matches article headings like "21. " or "300A. " at the start of a line.
# Compiled once at module level instead of per change.
_ARTICLE_RE = re.compile(r'(?:\n|^)(\d+[A-Z]?)\.\s')


def _format_amendment_id(amendment_num: Any) -> str:
    """Render an amendment number as a zero-padded 3-digit folder id.

    BUG FIX: the original used f"{amendment_num:03d}" directly, which
    raised ValueError/TypeError whenever "amendment_number" was absent
    (the "unknown" fallback) or stored as a string in the JSON. Fall
    back to the raw value as a string in that case.
    """
    try:
        return f"{int(amendment_num):03d}"
    except (TypeError, ValueError):
        return str(amendment_num)


def _parse_year(date_str: str):
    """Return the leading 4-digit year of *date_str* as int, or None.

    Tolerates empty, short, or non-numeric date strings instead of
    crashing on int() (the original raised ValueError on e.g. "n/a-x").
    """
    if date_str and len(date_str) >= 4:
        try:
            return int(date_str[:4])
        except ValueError:
            return None
    return None


def _write_json(path: str, obj: Dict[str, Any]) -> None:
    """Dump *obj* as pretty-printed UTF-8 JSON to *path*."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def _split_into_articles(change_text: str) -> List[Dict[str, str]]:
    """Split *change_text* into per-article chunks on numbered headings.

    Returns a list of {"article_number", "content"} dicts; empty when no
    heading like "21. " or "300A. " is found. Each chunk runs from its
    heading up to the next heading (or end of text), stripped.
    """
    matches = list(_ARTICLE_RE.finditer(change_text))
    articles = []
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(change_text)
        articles.append({
            "article_number": match.group(1),
            "content": change_text[start:end].strip(),
        })
    return articles


def extract_articles_from_amendments():
    """Explode raw amendment JSON files into one JSON document per article.

    Reads amendments_data/amendment_*.json (relative to the repository
    root, assumed to be the parent of this script's directory) and writes
    extracted_data/amendment_NNN/ containing a summary.json plus one file
    per detected article (or a fragment_*.json when no article headings
    are found in a change).
    """
    # 1. Resolve paths relative to the repository root.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_dir = os.path.join(root_dir, "amendments_data")
    output_dir = os.path.join(root_dir, "extracted_data")

    # 2. Find all amendment files (sorted for deterministic processing order).
    json_files = sorted(glob.glob(os.path.join(data_dir, "amendment_*.json")))
    print(f"Found {len(json_files)} amendment files.")

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        amendment_num = data.get("amendment_number", "unknown")
        date_str = data.get("date", "")
        global_desc = data.get("description", "")

        # Metadata copied into every document emitted for this amendment.
        base_metadata = {
            "amendment_number": amendment_num,
            "year": _parse_year(date_str),
            "date": date_str,
            "amendment_title": data.get("title", ""),
            "amendment_description": global_desc,
        }

        # Output folder: extracted_data/amendment_000/
        amendment_folder = os.path.join(
            output_dir, f"amendment_{_format_amendment_id(amendment_num)}"
        )
        os.makedirs(amendment_folder, exist_ok=True)

        # --- A. Save the amendment-level summary, if any. ---
        if global_desc:
            _write_json(os.path.join(amendment_folder, "summary.json"), {
                "file_id": f"am_{amendment_num}_summary",
                "content": global_desc,
                "metadata": {
                    **base_metadata,
                    "type": "amendment_summary",
                    "affected_part": "SUMMARY",
                },
            })

        # --- B. Process changes (extract articles). ---
        for idx, change in enumerate(data.get("changes", [])):
            added_lines = change.get("added", [])
            if not added_lines:
                continue

            # Join the non-blank added lines into one normalized text block.
            change_text = "\n".join(
                line.strip() for line in added_lines if line.strip()
            )
            if not change_text:
                continue

            file_ref = change.get("file", f"change_{idx}")
            affected_part = file_ref.replace(".txt", "")  # e.g. PART05

            articles = _split_into_articles(change_text)
            if articles:
                # Distinct articles found: one output file per article.
                # (The old `article_num.replace(".", "_")` was dead code —
                # the regex group can never contain a dot.)
                for article in articles:
                    article_num = article["article_number"]
                    # Filename prefixed with the part to avoid collisions
                    # with Schedule items: e.g. PART03_art_14.json
                    _write_json(
                        os.path.join(
                            amendment_folder,
                            f"{affected_part}_art_{article_num}.json",
                        ),
                        {
                            "file_id": f"am_{amendment_num}_art_{article_num}",
                            "content": article["content"],
                            "metadata": {
                                **base_metadata,
                                "type": "article_change",
                                "affected_part": affected_part,
                                "article_number": article_num,
                                "source_file": file_ref,
                            },
                        },
                    )
            else:
                # No article headings: save the change as a generic fragment.
                clean_ref = re.sub(r'[^a-zA-Z0-9]', '_', affected_part).lower()
                _write_json(
                    os.path.join(
                        amendment_folder, f"fragment_{clean_ref}_{idx}.json"
                    ),
                    {
                        "file_id": f"am_{amendment_num}_{clean_ref}_{idx}",
                        "content": change_text,
                        "metadata": {
                            **base_metadata,
                            "type": "article_change",
                            "affected_part": affected_part,
                            "source_file": file_ref,
                        },
                    },
                )

    print(f"\nExtraction complete! Check '{output_dir}'.")

# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    extract_articles_from_amendments()