import os
import sys
import json
import glob
from typing import List, Dict, Optional


def _check_document(data, label: str, filename: str, stats: Dict[str, int]) -> List[str]:
    """Validate one parsed JSON document and update the per-type counters.

    Args:
        data: The value returned by ``json.load`` for the file.
        label: ``"<amendment folder>/<file name>"``, used as the prefix of
            every error message.
        filename: Base name of the file; its ``art_`` / ``fragment_`` marker
            decides which counter an ``article_change`` document goes to.
        stats: Counter dict with ``articles``, ``fragments`` and
            ``summaries`` keys; mutated in place.

    Returns:
        The error messages found for this document (empty when it is valid).
    """
    # Every check below uses dict access, so any other top-level JSON value
    # (list, string, number, null) is reported instead of raising.
    if not isinstance(data, dict):
        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

    problems: List[str] = []

    # Check 1: Content. A null or non-string value counts as "no content"
    # rather than crashing on .strip().
    content = data.get("content", "")
    if not isinstance(content, str) or not content.strip():
        problems.append(f"[EMPTY CONTENT] {label} has no content.")

    # Check 2: Metadata existence. A non-dict "metadata" value is handled
    # exactly like an absent one.
    meta = data.get("metadata", {})
    if not isinstance(meta, dict):
        meta = {}
    if "amendment_number" not in meta:
        problems.append(f"[MISSING META] {label} missing amendment_number.")

    # Check 3: Type stats & logic.
    c_type = meta.get("type", "unknown")
    if c_type == "article_change":
        if "art_" in filename:
            # Article files must carry the number of the article they change.
            stats["articles"] += 1
            if "article_number" not in meta:
                problems.append(
                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
                )
        elif "fragment_" in filename:
            stats["fragments"] += 1
        # An article_change file with neither marker is not counted and not
        # reported (unchanged from the original behavior).
    elif c_type == "amendment_summary":
        stats["summaries"] += 1
    else:
        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

    return problems


def validate_extraction(extracted_dir: Optional[str] = None) -> None:
    """Scan every ``amendment_*`` folder and report structural problems.

    Each ``*.json`` file in each folder is parsed and checked for non-empty
    content, an ``amendment_number`` in its metadata and a known ``type``.
    A summary and the list of errors are printed to stdout.

    Args:
        extracted_dir: Directory containing the ``amendment_*`` folders.
            Defaults to ``extracted_data`` located one directory above the
            directory containing this file.

    Raises:
        SystemExit: Always. The exit code is 1 when any error was found and
            0 otherwise.
    """
    if extracted_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        extracted_dir = os.path.join(root_dir, "extracted_data")

    amendment_folders = sorted(glob.glob(os.path.join(extracted_dir, "amendment_*")))

    total_files = 0
    errors: List[str] = []
    stats = {
        "articles": 0,
        "fragments": 0,
        "summaries": 0,
    }

    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

    for folder in amendment_folders:
        amendment_name = os.path.basename(folder)
        # Sorted so the error report has the same order on every run.
        files = sorted(glob.glob(os.path.join(folder, "*.json")))

        if not files:
            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
            continue

        for file_path in files:
            total_files += 1
            filename = os.path.basename(file_path)
            label = f"{amendment_name}/{filename}"

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Bytes that are not valid UTF-8 are as unparseable as
                # malformed JSON; neither should abort the whole scan.
                errors.append(f"[INVALID JSON] {label} could not be parsed.")
                continue
            except OSError as exc:
                errors.append(f"[UNREADABLE FILE] {label} could not be read: {exc}")
                continue

            errors.extend(_check_document(data, label, filename, stats))

    # Report
    print("-" * 40)
    print(f"Total Files Scanned: {total_files}")
    print(f" - Articles Found: {stats['articles']}")
    print(f" - Fragments Found: {stats['fragments']}")
    print(f" - Summaries Found: {stats['summaries']}")
    print("-" * 40)

    if errors:
        print(f"\nFound {len(errors)} ERRORS:")
        for e in errors:
            print(e)
        print("\n[FAIL] Validation found issues.")
        # sys.exit rather than the site-provided exit(), which is missing
        # under `python -S` and in frozen builds.
        sys.exit(1)
    else:
        print("\n[SUCCESS] All files look structurally valid.")
        sys.exit(0)


if __name__ == "__main__":
    validate_extraction()