Spaces:

Vaibuzzz
/

financial-intelligence-ai

Running

App Files Files Community

financial-intelligence-ai / scripts /validate_data.py

Vaibuzzz

Upload folder using huggingface_hub

10ff0db verified 3 days ago

raw

history blame contribute delete

3.13 kB

	"""Quick validation of with_anomalies.jsonl before Kaggle upload."""
	import json
	import os

	# Load the JSONL training file: one JSON document per non-blank line.
	# The path is relative, so it is resolved against the current working directory.
	filepath = "data/training/with_anomalies.jsonl"
	docs = []
	with open(filepath, "r", encoding="utf-8") as fh:
	    for raw_line in fh:
	        record = raw_line.strip()
	        # json.loads("") raises JSONDecodeError, so a blank separator line or
	        # a trailing empty line must be skipped rather than parsed.
	        if not record:
	            continue
	        docs.append(json.loads(record))

	# Quick size summary of what was loaded.
	print(f"Total documents: {len(docs)}")
	print(f"File size: {os.path.getsize(filepath)/1024:.1f} KB")
	print()

	# Structure check: dump the shape of the first record, a raw-text excerpt and
	# an excerpt of its ground truth so the schema can be eyeballed.
	if not docs:
	    # Indexing docs[0] on an empty dataset would raise IndexError; report the
	    # situation instead of crashing.
	    print("=== FIRST DOC STRUCTURE ===")
	    print("(no documents loaded - nothing to inspect)")
	    print()
	else:
	    d = docs[0]
	    gt = d.get("ground_truth", {})

	    print("=== FIRST DOC STRUCTURE ===")
	    print(f"Top-level keys: {list(d.keys())}")
	    print(f"doc_type: {d.get('doc_type', 'MISSING')}")
	    print(f"raw_text length: {len(d.get('raw_text', ''))} chars")
	    print(f"ground_truth keys: {list(gt.keys())}")
	    print(f" common keys: {list(gt.get('common', {}).keys())}")
	    print(f" line_items count: {len(gt.get('line_items', []))}")
	    print(f" type_specific keys: {list(gt.get('type_specific', {}).keys())}")
	    print(f" flags count: {len(gt.get('flags', []))}")
	    print(f" confidence_score: {gt.get('confidence_score', 'MISSING')}")
	    print()

	    # Show a sample raw_text (first 300 chars)
	    print("=== SAMPLE RAW TEXT (first 300 chars) ===")
	    print(d.get("raw_text", "")[:300])
	    print("...")
	    print()

	    # Show sample ground truth output (pretty-printed, truncated to 600 chars)
	    print("=== SAMPLE GROUND TRUTH OUTPUT ===")
	    print(json.dumps(gt, indent=2)[:600])
	    print("...")
	    print()

	# Distribution: count documents per doc_type, how many documents carry at
	# least one anomaly flag, and how often each flag category occurs.
	types = {}
	with_flags = 0
	total_flags = 0
	flag_cats = {}
	total_docs = len(docs)

	for doc in docs:
	    dt = doc.get("doc_type", "?")
	    types[dt] = types.get(dt, 0) + 1
	    flags = doc.get("ground_truth", {}).get("flags", [])
	    if not flags:
	        continue
	    with_flags += 1
	    total_flags += len(flags)
	    for flag in flags:
	        cat = flag.get("category", "?")
	        flag_cats[cat] = flag_cats.get(cat, 0) + 1

	print("=== DISTRIBUTION ===")
	for t, c in sorted(types.items()):
	    # types is non-empty only when total_docs > 0, so this division is safe.
	    print(f" {t:<20}: {c:>4} ({100*c/total_docs:.0f}%)")
	print()
	# Guard the percentage: with zero documents the ratio is undefined and the
	# plain division would raise ZeroDivisionError.
	flagged_pct = 100 * with_flags / total_docs if total_docs else 0.0
	print(f"Docs with anomaly flags: {with_flags}/{total_docs} ({flagged_pct:.0f}%)")
	print(f"Total flags: {total_flags}")
	print("Flag categories:")
	for cat, cnt in sorted(flag_cats.items()):
	    print(f" {cat:<25}: {cnt}")
	print()

	# Validate every doc: each record needs raw_text and ground_truth; the
	# ground truth needs common/flags/confidence_score; and common needs
	# document_type/total_amount. Every missing key counts as one error.
	errors = 0

	if not docs:
	    # An empty dataset must not be reported as ready for upload.
	    print(" ERROR: no documents loaded")
	    errors += 1

	for i, doc in enumerate(docs):
	    if "raw_text" not in doc:
	        print(f" ERROR doc[{i}]: missing raw_text")
	        errors += 1
	    if "ground_truth" not in doc:
	        print(f" ERROR doc[{i}]: missing ground_truth")
	        errors += 1
	        # Nothing further can be checked without a ground truth.
	        continue

	    gt = doc["ground_truth"]
	    for key in ("common", "flags", "confidence_score"):
	        if key not in gt:
	            print(f" ERROR doc[{i}]: missing {key}")
	            errors += 1

	    # A missing "common" section also yields the two errors below, because
	    # its required keys are then looked up in an empty dict.
	    common = gt.get("common", {})
	    for key in ("document_type", "total_amount"):
	        if key not in common:
	            print(f" ERROR doc[{i}]: missing {key}")
	            errors += 1

	if errors == 0:
	    # Report the real document count instead of a hard-coded number.
	    print(f"=== VALIDATION: ✅ ALL {len(docs)} DOCS PASS ===")
	    print("File is READY for Kaggle upload!")
	else:
	    print(f"=== VALIDATION: ❌ {errors} ERRORS FOUND ===")