"""Quick validation of with_anomalies.jsonl before Kaggle upload."""
import json
import os
from collections import Counter

DEFAULT_FILEPATH = "data/training/with_anomalies.jsonl"


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def load_docs(path):
    """Load every JSON record from the JSONL file at *path*.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped.

    Raises:
        ValueError: if a non-blank line is not valid JSON; the message
            carries the file path and 1-based line number.
    """
    docs = []
    with open(path, "r", encoding="utf-8") as handle:
        for lineno, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                docs.append(json.loads(line))
            except json.JSONDecodeError as err:
                # JSONDecodeError is a ValueError subclass, so callers that
                # caught the original exception type still catch this one.
                raise ValueError(f"{path}:{lineno}: invalid JSON ({err})") from err
    return docs


def print_first_doc(doc):
    """Print the structure, a raw-text sample and the ground truth of *doc*."""
    doc = _as_dict(doc)
    gt = _as_dict(doc.get("ground_truth"))
    raw_text = doc.get("raw_text", "")
    if not isinstance(raw_text, str):
        raw_text = ""

    print("=== FIRST DOC STRUCTURE ===")
    print(f"Top-level keys: {list(doc.keys())}")
    print(f"doc_type: {doc.get('doc_type', 'MISSING')}")
    print(f"raw_text length: {len(raw_text)} chars")
    print(f"ground_truth keys: {list(gt.keys())}")
    print(f" common keys: {list(_as_dict(gt.get('common')).keys())}")
    print(f" line_items count: {len(_as_list(gt.get('line_items')))}")
    print(f" type_specific keys: {list(_as_dict(gt.get('type_specific')).keys())}")
    print(f" flags count: {len(_as_list(gt.get('flags')))}")
    print(f" confidence_score: {gt.get('confidence_score', 'MISSING')}")
    print()

    # Show a sample raw_text (first 300 chars)
    print("=== SAMPLE RAW TEXT (first 300 chars) ===")
    print(raw_text[:300])
    print("...")
    print()

    # Show sample ground truth output
    print("=== SAMPLE GROUND TRUTH OUTPUT ===")
    print(json.dumps(gt, indent=2)[:600])
    print("...")
    print()


def summarize_distribution(docs):
    """Count document types and anomaly flags across *docs*.

    Returns:
        A tuple ``(types, flag_cats, with_flags, total_flags)`` where
        ``types`` and ``flag_cats`` are Counters keyed by doc type and flag
        category, ``with_flags`` is the number of documents carrying at
        least one flag and ``total_flags`` is the number of flags overall.
    """
    types = Counter()
    flag_cats = Counter()
    with_flags = 0
    total_flags = 0
    for doc in docs:
        doc = _as_dict(doc)
        # str() keeps the keys sortable even if a record has a null doc_type.
        types[str(doc.get("doc_type", "?"))] += 1
        flags = _as_list(_as_dict(doc.get("ground_truth")).get("flags"))
        if flags:
            with_flags += 1
            total_flags += len(flags)
            for flag in flags:
                flag_cats[str(_as_dict(flag).get("category", "?"))] += 1
    return types, flag_cats, with_flags, total_flags


def validate_docs(docs):
    """Check every document for the required keys.

    Returns:
        A list of error messages, one per problem found; empty when every
        document passes.
    """
    errors = []
    for i, doc in enumerate(docs):
        if not isinstance(doc, dict):
            errors.append(f" ERROR doc[{i}]: not a JSON object")
            continue
        if "raw_text" not in doc:
            errors.append(f" ERROR doc[{i}]: missing raw_text")
        if "ground_truth" not in doc:
            errors.append(f" ERROR doc[{i}]: missing ground_truth")
            continue
        gt = doc["ground_truth"]
        if not isinstance(gt, dict):
            errors.append(f" ERROR doc[{i}]: ground_truth is not an object")
            continue
        for key in ("common", "flags", "confidence_score"):
            if key not in gt:
                errors.append(f" ERROR doc[{i}]: missing {key}")
        # A missing "common" also reports its two required children as missing.
        common = gt.get("common", {})
        if not isinstance(common, dict):
            errors.append(f" ERROR doc[{i}]: common is not an object")
            continue
        for key in ("document_type", "total_amount"):
            if key not in common:
                errors.append(f" ERROR doc[{i}]: missing {key}")
    return errors


def main(path=DEFAULT_FILEPATH):
    """Print a validation report for the JSONL file at *path*.

    Returns:
        The number of validation errors found (1 if the file holds no
        documents at all), so 0 means the file passed.
    """
    docs = load_docs(path)
    print(f"Total documents: {len(docs)}")
    print(f"File size: {os.path.getsize(path)/1024:.1f} KB")
    print()

    if not docs:
        # Nothing to sample or compute percentages over.
        print("=== VALIDATION: ❌ NO DOCUMENTS FOUND ===")
        return 1

    # Structure check
    print_first_doc(docs[0])

    # Distribution
    types, flag_cats, with_flags, total_flags = summarize_distribution(docs)
    print("=== DISTRIBUTION ===")
    for t, c in sorted(types.items()):
        print(f" {t:<20}: {c:>4} ({100*c/len(docs):.0f}%)")
    print()
    print(f"Docs with anomaly flags: {with_flags}/{len(docs)} ({100*with_flags/len(docs):.0f}%)")
    print(f"Total flags: {total_flags}")
    print("Flag categories:")
    for cat, cnt in sorted(flag_cats.items()):
        print(f" {cat:<25}: {cnt}")
    print()

    # Validate every doc
    errors = validate_docs(docs)
    for message in errors:
        print(message)
    if not errors:
        # Report the real document count rather than a hard-coded number.
        print(f"=== VALIDATION: ✅ ALL {len(docs)} DOCS PASS ===")
        print("File is READY for Kaggle upload!")
    else:
        print(f"=== VALIDATION: ❌ {len(errors)} ERRORS FOUND ===")
    return len(errors)


if __name__ == "__main__":
    main()