Spaces:

Meshyboi
/

ConstitutionAgent

Running

App Files Files Community

ConstitutionAgent / validation /validate_extraction.py

Meshyboi

Upload 53 files

0cd3dc5 verified 21 days ago

raw

history blame contribute delete

3.06 kB

	import os
	import json
	import glob
	from typing import List, Dict

	def _check_record(data, label, filename, stats):
	    """Run the structural checks on one parsed JSON document.

	    Args:
	        data: Value returned by ``json.load`` for the file.
	        label: ``"<amendment folder>/<file name>"`` string used in messages.
	        filename: Base name of the file; decides article vs. fragment.
	        stats: Running counters (``articles``/``fragments``/``summaries``),
	            updated in place.

	    Returns:
	        A list of error strings for this document (empty when it is clean).
	    """
	    if not isinstance(data, dict):
	        # A top-level list/string/number has no ``.get``; report it instead
	        # of letting an AttributeError abort the whole scan.
	        return [f"[INVALID STRUCTURE] {label} top-level JSON value is not an object."]

	    problems = []

	    # Check 1: Content. A null or non-string value is treated like missing
	    # text, because it cannot be stripped and carries no usable content.
	    content = data.get("content", "")
	    if not isinstance(content, str) or not content.strip():
	        problems.append(f"[EMPTY CONTENT] {label} has no content.")

	    # Check 2: Metadata existence. A null/non-dict "metadata" is handled the
	    # same way as an absent one so the membership tests below cannot raise.
	    meta = data.get("metadata", {})
	    if not isinstance(meta, dict):
	        meta = {}
	    if "amendment_number" not in meta:
	        problems.append(f"[MISSING META] {label} missing amendment_number.")

	    # Check 3: Type stats & logic.
	    c_type = meta.get("type", "unknown")
	    if c_type == "article_change":
	        if "art_" in filename:
	            # Article files must say which article they change.
	            stats["articles"] += 1
	            if "article_number" not in meta:
	                problems.append(
	                    f"[MISSING ART NUM] {label} is an article file but missing article_number."
	                )
	        elif "fragment_" in filename:
	            stats["fragments"] += 1
	        # NOTE(review): an "article_change" file whose name contains neither
	        # marker is neither counted nor reported - confirm this is intended.
	    elif c_type == "amendment_summary":
	        stats["summaries"] += 1
	    else:
	        problems.append(f"[UNKNOWN TYPE] {label} has unknown type '{c_type}'.")

	    return problems


	def validate_extraction():
	    """Validate every extracted amendment JSON file and exit with a status.

	    Scans ``<root>/extracted_data/amendment_*`` (``<root>`` being the directory
	    two levels above this file), loads each ``*.json`` file found there and
	    checks that it has non-empty ``content``, an ``amendment_number`` in its
	    ``metadata``, and a recognised ``type``. A summary and every problem found
	    are printed to stdout.

	    Raises:
	        SystemExit: always. The exit code is ``1`` when any problem was
	            recorded and ``0`` otherwise.
	    """
	    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	    extracted_dir = os.path.join(root_dir, "extracted_data")

	    amendment_folders = sorted(glob.glob(os.path.join(extracted_dir, "amendment_*")))

	    total_files = 0
	    errors = []
	    stats = {
	        "articles": 0,
	        "fragments": 0,
	        "summaries": 0,
	    }

	    print(f"Scanning {len(amendment_folders)} amendment folders...\n")

	    for folder in amendment_folders:
	        amendment_name = os.path.basename(folder)
	        # glob returns entries in arbitrary order; sort for a stable report.
	        files = sorted(glob.glob(os.path.join(folder, "*.json")))

	        if not files:
	            errors.append(f"[EMPTY FOLDER] {amendment_name} has no extracted files.")
	            continue

	        for file_path in files:
	            total_files += 1
	            filename = os.path.basename(file_path)
	            label = f"{amendment_name}/{filename}"

	            try:
	                with open(file_path, "r", encoding="utf-8") as f:
	                    data = json.load(f)
	            except json.JSONDecodeError:
	                errors.append(f"[INVALID JSON] {label} could not be parsed.")
	                continue
	            except (OSError, UnicodeDecodeError) as exc:
	                # One unreadable or mis-encoded file must not stop the scan.
	                errors.append(f"[UNREADABLE] {label} could not be read: {exc}")
	                continue

	            errors.extend(_check_record(data, label, filename, stats))

	    # Report
	    print("-" * 40)
	    print(f"Total Files Scanned: {total_files}")
	    print(f" - Articles Found: {stats['articles']}")
	    print(f" - Fragments Found: {stats['fragments']}")
	    print(f" - Summaries Found: {stats['summaries']}")
	    print("-" * 40)

	    if errors:
	        print(f"\nFound {len(errors)} ERRORS:")
	        for e in errors:
	            print(e)
	        print("\n[FAIL] Validation found issues.")
	        # SystemExit is raised directly: the ``exit`` builtin is injected by
	        # the ``site`` module and is not guaranteed to exist.
	        raise SystemExit(1)

	    print("\n[SUCCESS] All files look structurally valid.")
	    raise SystemExit(0)

	if __name__ == "__main__":
	    # Run the validation only when executed as a script, not on import;
	    # the call terminates the process with the validation status code.
	    validate_extraction()