# financial-intelligence-ai/scripts/validate_data.py
# Vaibuzzz's picture
# Upload folder using huggingface_hub
# 10ff0db verified
"""Quick validation of with_anomalies.jsonl before Kaggle upload."""
import json
import os
from collections import Counter
# Load the JSONL training file: one JSON document per non-blank line.
filepath = "data/training/with_anomalies.jsonl"
docs = []
with open(filepath, "r", encoding="utf-8") as f:
    for lineno, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Blank or whitespace-only lines carry no document, and
            # json.loads would reject them.
            continue
        try:
            docs.append(json.loads(record))
        except json.JSONDecodeError:
            # Each line is parsed on its own, so the decoder always reports
            # "line 1"; print the real file line before re-raising.
            print(f" ERROR line {lineno}: invalid JSON in {filepath}")
            raise
print(f"Total documents: {len(docs)}")
print(f"File size: {os.path.getsize(filepath)/1024:.1f} KB")
print()
# Structure check: print the shape of the first document as a sample.
# `d` and `gt` stay bound at module level; the sample previews further down
# read them.
if not docs:
    # docs[0] would otherwise fail with a bare IndexError; say what is wrong.
    raise SystemExit(f"No documents found in {filepath}")
d = docs[0]
print("=== FIRST DOC STRUCTURE ===")
print(f"Top-level keys: {list(d.keys())}")
print(f"doc_type: {d.get('doc_type', 'MISSING')}")
print(f"raw_text length: {len(d.get('raw_text', ''))} chars")
gt = d.get("ground_truth", {})
print(f"ground_truth keys: {list(gt.keys())}")
print(f" common keys: {list(gt.get('common', {}).keys())}")
print(f" line_items count: {len(gt.get('line_items', []))}")
print(f" type_specific keys: {list(gt.get('type_specific', {}).keys())}")
print(f" flags count: {len(gt.get('flags', []))}")
print(f" confidence_score: {gt.get('confidence_score', 'MISSING')}")
print()
# Preview the start of the first document's raw text, capped at 300 chars.
raw_preview = d.get("raw_text", "")
print("=== SAMPLE RAW TEXT (first 300 chars) ===")
print(raw_preview[:300])
# The ellipsis marks the cut; end="\n\n" also emits the blank separator line.
print("...", end="\n\n")
# Preview the first document's ground truth as indented JSON, capped at
# 600 characters of the serialized text.
gt_pretty = json.dumps(gt, indent=2)
print("=== SAMPLE GROUND TRUTH OUTPUT ===")
print(gt_pretty[:600])
# The ellipsis marks the cut; end="\n\n" also emits the blank separator line.
print("...", end="\n\n")
# Distribution: count documents per doc_type and anomaly flags per category.
# Documents without a doc_type and flags without a category are tallied
# under "?".
types = Counter(doc.get("doc_type", "?") for doc in docs)
with_flags = 0
total_flags = 0
flag_cats = Counter()
for doc in docs:
    flags = doc.get("ground_truth", {}).get("flags", [])
    if not flags:
        continue
    with_flags += 1
    total_flags += len(flags)
    flag_cats.update(flag.get("category", "?") for flag in flags)
print("=== DISTRIBUTION ===")
for t, c in sorted(types.items()):
    print(f" {t:<20}: {c:>4} ({100*c/len(docs):.0f}%)")
print()
print(f"Docs with anomaly flags: {with_flags}/{len(docs)} ({100*with_flags/len(docs):.0f}%)")
print(f"Total flags: {total_flags}")
print("Flag categories:")
for cat, cnt in sorted(flag_cats.items()):
    print(f" {cat:<25}: {cnt}")
print()
# Validate every doc: each needs raw_text and a ground_truth that holds
# common (with document_type and total_amount), flags and confidence_score.
# Every missing key prints one ERROR line and adds one to `errors`.
errors = 0
for i, doc in enumerate(docs):
    if "raw_text" not in doc:
        print(f" ERROR doc[{i}]: missing raw_text")
        errors += 1
    if "ground_truth" not in doc:
        print(f" ERROR doc[{i}]: missing ground_truth")
        errors += 1
        # Nothing below can be checked without ground_truth.
        continue
    gt = doc.get("ground_truth", {})
    for key in ("common", "flags", "confidence_score"):
        if key not in gt:
            print(f" ERROR doc[{i}]: missing {key}")
            errors += 1
    # A missing `common` falls back to {} here, so its two required fields
    # are reported as missing as well.
    common = gt.get("common", {})
    for key in ("document_type", "total_amount"):
        if key not in common:
            print(f" ERROR doc[{i}]: missing {key}")
            errors += 1
if errors == 0:
    # Report the number of documents actually checked, not a fixed 150.
    print(f"=== VALIDATION: ✅ ALL {len(docs)} DOCS PASS ===")
    print("File is READY for Kaggle upload!")
else:
    print(f"=== VALIDATION: ❌ {errors} ERRORS FOUND ===")