ConstitutionAgent / validation /validate_extraction.py
Meshyboi's picture
Upload 53 files
0cd3dc5 verified
import os
import json
import glob
from typing import List, Dict
def _check_record(data, label: str, filename: str, stats: Dict[str, int]) -> List[str]:
    """Validate one parsed JSON document and return its error messages.

    Args:
        data: The value produced by ``json.load`` for a single file.
        label: ``"<amendment folder>/<file name>"``, used as the prefix of
            every error message.
        filename: Base name of the file; the ``art_`` / ``fragment_``
            substrings decide which counter an ``article_change`` feeds.
        stats: Running counters (``articles``, ``fragments``, ``summaries``);
            updated in place.

    Returns:
        A list of human-readable error strings (empty when the record is fine).
    """
    # A JSON file may legally hold a list/number/string; only objects carry
    # the "content"/"metadata" keys, so anything else is reported up front.
    if not isinstance(data, dict):
        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

    problems: List[str] = []

    # Check 1: Content
    content = data.get("content")
    if content is None or (isinstance(content, str) and not content.strip()):
        problems.append(f"[EMPTY CONTENT] {label} has no content.")
    elif not isinstance(content, str):
        problems.append(f"[INVALID CONTENT] {label} content is not a string.")

    # Check 2: Metadata existence
    meta = data.get("metadata")
    if meta is None:
        # An explicit null is treated like a missing key.
        meta = {}
    elif not isinstance(meta, dict):
        problems.append(f"[INVALID META] {label} metadata is not an object.")
        return problems
    if "amendment_number" not in meta:
        problems.append(f"[MISSING META] {label} missing amendment_number.")

    # Check 3: Type stats & Logic
    c_type = meta.get("type", "unknown")
    if c_type == "article_change":
        if "art_" in filename:  # Should have article_number
            stats["articles"] += 1
            if "article_number" not in meta:
                problems.append(
                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
                )
        elif "fragment_" in filename:
            stats["fragments"] += 1
        # NOTE: an article_change whose name has neither marker is neither
        # counted nor reported.
    elif c_type == "amendment_summary":
        stats["summaries"] += 1
    else:
        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

    return problems


def validate_extraction(extracted_dir=None):
    """Scan extracted amendment JSON files, print a report, and exit.

    Every ``amendment_*`` entry under *extracted_dir* is searched for
    ``*.json`` files. Each file must parse as a JSON object with non-empty
    string ``content`` and a ``metadata`` object holding ``amendment_number``
    and a known ``type``.

    Args:
        extracted_dir: Directory containing the ``amendment_*`` folders.
            Defaults to ``<parent of this file's directory>/extracted_data``.

    Raises:
        SystemExit: Always. Exit status is ``0`` when no problem was found
            and ``1`` otherwise.
    """
    if extracted_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        extracted_dir = os.path.join(root_dir, "extracted_data")

    # Escape the directory part so glob metacharacters in the path itself
    # ("[", "*", "?") are matched literally.
    amendment_folders = sorted(
        glob.glob(os.path.join(glob.escape(extracted_dir), "amendment_*"))
    )

    total_files = 0
    errors: List[str] = []
    stats = {
        "articles": 0,
        "fragments": 0,
        "summaries": 0,
    }

    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

    for folder in amendment_folders:
        amendment_name = os.path.basename(folder)
        # Sorted so the report order is stable between runs.
        files = sorted(glob.glob(os.path.join(glob.escape(folder), "*.json")))
        if not files:
            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
            continue

        for file_path in files:
            total_files += 1
            filename = os.path.basename(file_path)
            label = f"{amendment_name}/{filename}"
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except json.JSONDecodeError:
                errors.append(f"[INVALID JSON] {label} could not be parsed.")
                continue
            except (OSError, UnicodeDecodeError) as exc:
                # One unreadable file must not abort the whole scan.
                errors.append(f"[UNREADABLE] {label} could not be read: {exc}")
                continue

            errors.extend(_check_record(data, label, filename, stats))

    # Report
    print("-" * 40)
    print(f"Total Files Scanned: {total_files}")
    print(f" - Articles Found: {stats['articles']}")
    print(f" - Fragments Found: {stats['fragments']}")
    print(f" - Summaries Found: {stats['summaries']}")
    print("-" * 40)

    # SystemExit is raised directly: the ``exit()`` helper is injected by the
    # ``site`` module and is missing under ``python -S`` or in frozen builds.
    if errors:
        print(f"\nFound {len(errors)} ERRORS:")
        for e in errors:
            print(e)
        print("\n[FAIL] Validation found issues.")
        raise SystemExit(1)

    print("\n[SUCCESS] All files look structurally valid.")
    raise SystemExit(0)
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    validate_extraction()