Spaces:
Running
Running
| import os | |
| import json | |
| import glob | |
| from typing import List, Dict | |
def _check_document(data, label: str, filename: str, stats: Dict[str, int]) -> List[str]:
    """Return the validation errors found in one parsed JSON document.

    Args:
        data: The value produced by ``json.load`` for the file.
        label: ``"<amendment folder>/<file name>"``, used as the prefix of
            every error message.
        filename: Base name of the file; the substrings ``"art_"`` and
            ``"fragment_"`` decide which counter an ``article_change``
            document is tallied under.
        stats: Counters keyed ``"articles"``, ``"fragments"`` and
            ``"summaries"``; updated in place.

    Returns:
        A list of human-readable error strings (empty when the document
        passes every check).
    """
    # A top-level list/string/number has no keys to inspect; report it
    # instead of failing on ``.get``.
    if not isinstance(data, dict):
        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

    problems: List[str] = []

    # Check 1: Content
    content = data.get("content", "")
    if not isinstance(content, str):
        # e.g. ``"content": null`` -- ``.strip()`` would raise on it.
        problems.append(f"[INVALID CONTENT] {label} content is not a string.")
    elif not content.strip():
        problems.append(f"[EMPTY CONTENT] {label} has no content.")

    # Check 2: Metadata existence
    meta = data.get("metadata", {})
    if not isinstance(meta, dict):
        # Nothing below can be checked without a metadata object, so stop
        # here rather than piling on follow-up errors for the same cause.
        problems.append(f"[INVALID META] {label} metadata is not an object.")
        return problems
    if "amendment_number" not in meta:
        problems.append(f"[MISSING META] {label} missing amendment_number.")

    # Check 3: Type stats & logic
    c_type = meta.get("type", "unknown")
    if c_type == "article_change":
        if "art_" in filename:  # Should have article_number
            stats["articles"] += 1
            if "article_number" not in meta:
                problems.append(
                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
                )
        elif "fragment_" in filename:
            stats["fragments"] += 1
        # An article_change file matching neither pattern is still included
        # in the total file count but is not tallied under any category.
    elif c_type == "amendment_summary":
        stats["summaries"] += 1
    else:
        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

    return problems


def validate_extraction(extracted_dir=None):
    """Scan the extracted amendment JSON files and report structural problems.

    Every ``amendment_*`` entry under the data directory is searched for
    ``*.json`` files. Each file must parse as a JSON object with non-empty
    string ``content`` and a ``metadata`` object carrying
    ``amendment_number`` and a known ``type``. A summary and the list of
    errors are printed to standard output.

    Args:
        extracted_dir: Directory containing the ``amendment_*`` folders.
            When ``None`` (the default), ``extracted_data`` located one
            directory above the directory holding this file is used.

    Raises:
        SystemExit: Always. The exit code is ``1`` when any error was
            recorded and ``0`` otherwise.
    """
    if extracted_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        extracted_dir = os.path.join(root_dir, "extracted_data")

    amendment_folders = sorted(glob.glob(os.path.join(extracted_dir, "amendment_*")))

    total_files = 0
    errors: List[str] = []
    stats: Dict[str, int] = {
        "articles": 0,
        "fragments": 0,
        "summaries": 0,
    }

    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

    for folder in amendment_folders:
        amendment_name = os.path.basename(folder)
        # Sorted so the error report comes out in the same order on every run.
        files = sorted(glob.glob(os.path.join(folder, "*.json")))

        if not files:
            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
            continue

        for file_path in files:
            total_files += 1
            filename = os.path.basename(file_path)
            label = f"{amendment_name}/{filename}"

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Bytes that are not valid UTF-8 are as unparseable as bad JSON.
                errors.append(f"[INVALID JSON] {label} could not be parsed.")
                continue
            except OSError as exc:
                # One unreadable file must not abort the whole scan.
                errors.append(f"[UNREADABLE] {label} could not be read: {exc}")
                continue

            errors.extend(_check_document(data, label, filename, stats))

    # Report
    print("-" * 40)
    print(f"Total Files Scanned: {total_files}")
    print(f" - Articles Found: {stats['articles']}")
    print(f" - Fragments Found: {stats['fragments']}")
    print(f" - Summaries Found: {stats['summaries']}")
    print("-" * 40)

    # ``raise SystemExit`` rather than the ``exit()`` helper: the helper is
    # injected by the ``site`` module and is not guaranteed to exist.
    if errors:
        print(f"\nFound {len(errors)} ERRORS:")
        for e in errors:
            print(e)
        print("\n[FAIL] Validation found issues.")
        raise SystemExit(1)

    print("\n[SUCCESS] All files look structurally valid.")
    raise SystemExit(0)
if __name__ == "__main__":
    # Run the validator only when this file is executed as a script, so
    # importing the module does not trigger a scan.
    validate_extraction()