File size: 3,055 Bytes
0cd3dc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import json
import glob
from typing import List, Dict

def _check_record(data: object, label: str, filename: str, stats: Dict[str, int]) -> List[str]:
    """Validate one parsed JSON document and return its error messages.

    Args:
        data: The value produced by ``json.load`` for the file.
        label: ``"<amendment folder>/<file name>"``; used in every message.
        filename: Base name of the file. The substrings ``"art_"`` and
            ``"fragment_"`` decide how an ``article_change`` record is counted.
        stats: Running counters (``articles``, ``fragments``, ``summaries``),
            updated in place.

    Returns:
        A list of human-readable error lines; empty when the record passes.
    """
    # A JSON file may legally hold a list/number/string at the top level;
    # report that instead of crashing on ``.get``.
    if not isinstance(data, dict):
        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

    problems: List[str] = []

    # Check 1: Content
    content = data.get("content", "")
    if not isinstance(content, str):
        # e.g. ``"content": null`` -- ``.strip()`` would raise AttributeError.
        problems.append(f"[INVALID CONTENT] {label} content is not a string.")
    elif not content.strip():
        problems.append(f"[EMPTY CONTENT] {label} has no content.")

    # Check 2: Metadata existence
    meta = data.get("metadata", {})
    if not isinstance(meta, dict):
        # The remaining checks all read keys from the metadata object.
        problems.append(f"[INVALID META] {label} metadata is not an object.")
        return problems
    if "amendment_number" not in meta:
        problems.append(f"[MISSING META] {label} missing amendment_number.")

    # Check 3: Type stats & Logic
    c_type = meta.get("type", "unknown")
    if c_type == "article_change":
        if "art_" in filename:  # Should have article_number
            stats["articles"] += 1
            if "article_number" not in meta:
                problems.append(
                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
                )
        elif "fragment_" in filename:
            stats["fragments"] += 1
        # An article_change file matching neither name pattern is accepted
        # but not counted in any bucket.
    elif c_type == "amendment_summary":
        stats["summaries"] += 1
    else:
        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

    return problems


def validate_extraction() -> None:
    """Scan the extracted amendment JSON files and report structural problems.

    Looks for ``amendment_*`` entries inside the ``extracted_data`` directory
    located two directory levels above this file, loads every ``*.json`` file
    in each of them, and checks content, metadata and record type. A summary
    and the list of errors are printed to stdout.

    Raises:
        SystemExit: Always. The exit code is ``0`` when no errors were found
            and ``1`` otherwise.
    """
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    extracted_dir = os.path.join(root_dir, "extracted_data")

    amendment_folders = sorted(glob.glob(os.path.join(extracted_dir, "amendment_*")))

    total_files = 0
    errors: List[str] = []
    stats = {
        "articles": 0,
        "fragments": 0,
        "summaries": 0,
    }

    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

    for folder in amendment_folders:
        amendment_name = os.path.basename(folder)
        # Sorted so the error report is reproducible between runs.
        files = sorted(glob.glob(os.path.join(folder, "*.json")))

        if not files:
            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
            continue

        for file_path in files:
            total_files += 1
            filename = os.path.basename(file_path)
            label = f"{amendment_name}/{filename}"

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except OSError:
                # One unreadable file must not abort the whole scan.
                errors.append(f"[UNREADABLE] {label} could not be read.")
                continue
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Bytes that are not valid UTF-8 are as unparseable as bad JSON.
                errors.append(f"[INVALID JSON] {label} could not be parsed.")
                continue

            errors.extend(_check_record(data, label, filename, stats))

    # Report
    print("-" * 40)
    print(f"Total Files Scanned: {total_files}")
    print(f"  - Articles Found: {stats['articles']}")
    print(f"  - Fragments Found: {stats['fragments']}")
    print(f"  - Summaries Found: {stats['summaries']}")
    print("-" * 40)

    # ``raise SystemExit`` instead of the ``exit()`` helper: the helper is
    # installed by the ``site`` module and is not available in every runtime.
    if errors:
        print(f"\nFound {len(errors)} ERRORS:")
        for e in errors:
            print(e)
        print("\n[FAIL] Validation found issues.")
        raise SystemExit(1)

    print("\n[SUCCESS] All files look structurally valid.")
    raise SystemExit(0)

# Script entry point: run the validation only when executed directly, not on
# import. validate_extraction() ends the process itself, with exit status 0
# when no errors were found and 1 otherwise.
if __name__ == "__main__":
    validate_extraction()