Spaces:

Meshyboi
/

ConstitutionAgent

Running

File size: 3,055 Bytes

0cd3dc5

import os
import json
import glob
from typing import List, Dict

def _check_record(data, label: str, filename: str, stats: Dict[str, int]) -> List[str]:
    """Validate one parsed JSON document and return its error messages.

    Args:
        data: The value produced by ``json.load`` for the file.
        label: ``"<amendment folder>/<file name>"``, used as the prefix of
            every message.
        filename: Base name of the file; its ``art_`` / ``fragment_`` marker
            decides how an ``article_change`` document is counted.
        stats: Running counters (``articles``, ``fragments``, ``summaries``);
            updated in place.

    Returns:
        A list of human-readable error strings (empty when the document
        passes every check).
    """
    # Valid JSON is not necessarily an object; a list or scalar has no
    # "content"/"metadata" keys to inspect, so report it and stop here.
    if not isinstance(data, dict):
        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

    problems: List[str] = []

    # Check 1: Content. A missing key, null, or whitespace-only string all
    # count as "no content"; any other non-string value is reported separately
    # instead of crashing on .strip().
    content = data.get("content")
    if content is None or (isinstance(content, str) and not content.strip()):
        problems.append(f"[EMPTY CONTENT] {label} has no content.")
    elif not isinstance(content, str):
        problems.append(f"[INVALID CONTENT] {label} content is not a string.")

    # Check 2: Metadata existence. A null or non-object "metadata" value is
    # treated like a missing one so the checks below report it normally.
    meta = data.get("metadata")
    if not isinstance(meta, dict):
        meta = {}
    if "amendment_number" not in meta:
        problems.append(f"[MISSING META] {label} missing amendment_number.")

    # Check 3: Type stats & logic.
    c_type = meta.get("type", "unknown")
    if c_type == "article_change":
        if "art_" in filename:  # Article files must carry an article_number.
            stats["articles"] += 1
            if "article_number" not in meta:
                problems.append(
                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
                )
        elif "fragment_" in filename:
            stats["fragments"] += 1
        # An article_change file with neither marker in its name is accepted
        # but not counted in any category.
    elif c_type == "amendment_summary":
        stats["summaries"] += 1
    else:
        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

    return problems


def validate_extraction(extracted_dir=None):
    """Check every extracted amendment JSON file and print a summary report.

    Scans each ``amendment_*`` directory under ``extracted_dir``, loads every
    ``*.json`` file in it and verifies that it has non-empty ``content``, a
    ``metadata.amendment_number`` and a known ``metadata.type``. Problems are
    collected rather than raised, so one bad file never hides the others.

    Args:
        extracted_dir: Directory containing the ``amendment_*`` folders.
            Defaults to ``extracted_data`` located one level above the
            directory holding this script.

    Raises:
        SystemExit: Always. The exit code is 1 when at least one error was
            recorded and 0 otherwise.
    """
    if extracted_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        extracted_dir = os.path.join(root_dir, "extracted_data")

    # Only real directories are amendment folders; a stray file that happens
    # to be called "amendment_*" must not be reported as an empty folder.
    amendment_folders = sorted(
        path
        for path in glob.glob(os.path.join(extracted_dir, "amendment_*"))
        if os.path.isdir(path)
    )

    total_files = 0
    errors: List[str] = []
    stats = {
        "articles": 0,
        "fragments": 0,
        "summaries": 0,
    }

    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

    for folder in amendment_folders:
        amendment_name = os.path.basename(folder)
        # Sorted so the error report is stable from run to run.
        files = sorted(glob.glob(os.path.join(folder, "*.json")))

        if not files:
            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
            continue

        for file_path in files:
            total_files += 1
            filename = os.path.basename(file_path)
            label = f"{amendment_name}/{filename}"

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except json.JSONDecodeError:
                errors.append(f"[INVALID JSON] {label} could not be parsed.")
                continue
            except (OSError, UnicodeDecodeError) as exc:
                # Unreadable or non-UTF-8 files are reported like any other
                # problem instead of aborting the whole scan.
                errors.append(f"[UNREADABLE] {label} could not be read: {exc}")
                continue

            errors.extend(_check_record(data, label, filename, stats))

    # Report
    print("-" * 40)
    print(f"Total Files Scanned: {total_files}")
    print(f"  - Articles Found: {stats['articles']}")
    print(f"  - Fragments Found: {stats['fragments']}")
    print(f"  - Summaries Found: {stats['summaries']}")
    print("-" * 40)

    if errors:
        print(f"\nFound {len(errors)} ERRORS:")
        for e in errors:
            print(e)
        print("\n[FAIL] Validation found issues.")
        # SystemExit is raised directly: the exit() builtin is injected by the
        # site module and is not guaranteed to exist (e.g. under python -S).
        raise SystemExit(1)

    print("\n[SUCCESS] All files look structurally valid.")
    raise SystemExit(0)

# Script entry point: validate_extraction() prints its report and then ends
# the process with status 1 if any errors were recorded, 0 otherwise.
if __name__ == "__main__":
    validate_extraction()