File size: 3,131 Bytes
10ff0db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""Quick validation of with_anomalies.jsonl before Kaggle upload."""
import json
import os
from collections import Counter

# Load the JSONL dataset: one JSON document per non-blank line.
filepath = "data/training/with_anomalies.jsonl"
docs = []
with open(filepath, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not records; json.loads("") would raise on them.
            continue
        try:
            docs.append(json.loads(record))
        except json.JSONDecodeError as err:
            # Re-raise with the file line number so the bad record is easy
            # to locate; the exception type is unchanged.
            raise json.JSONDecodeError(
                f"{filepath}:{line_no}: {err.msg}", err.doc, err.pos
            ) from err

print(f"Total documents: {len(docs)}")
print(f"File size: {os.path.getsize(filepath)/1024:.1f} KB")
print()

# Structure check
# Inspect the first document. When nothing was loaded, fall back to an empty
# record so the report prints placeholders instead of raising IndexError.
if docs:
    d = docs[0]
else:
    print("WARNING: no documents loaded; structure report uses an empty placeholder")
    d = {}
print("=== FIRST DOC STRUCTURE ===")
print(f"Top-level keys: {list(d.keys())}")
print(f"doc_type: {d.get('doc_type', 'MISSING')}")
print(f"raw_text length: {len(d.get('raw_text', ''))} chars")
gt = d.get("ground_truth", {})
print(f"ground_truth keys: {list(gt.keys())}")
print(f"  common keys: {list(gt.get('common', {}).keys())}")
print(f"  line_items count: {len(gt.get('line_items', []))}")
print(f"  type_specific keys: {list(gt.get('type_specific', {}).keys())}")
print(f"  flags count: {len(gt.get('flags', []))}")
print(f"  confidence_score: {gt.get('confidence_score', 'MISSING')}")
print()

# Preview the opening 300 characters of raw_text from `d`.
print("=== SAMPLE RAW TEXT (first 300 chars) ===")
raw_preview = d.get("raw_text", "")
print(raw_preview[:300], "...", "", sep="\n")

# Preview the first 600 characters of `gt` rendered as indented JSON.
print("=== SAMPLE GROUND TRUTH OUTPUT ===")
gt_pretty = json.dumps(gt, indent=2)
print(gt_pretty[:600], "...", "", sep="\n")

# Distribution
# Tally documents per doc_type and anomaly flags per category.
types = Counter()
flag_cats = Counter()
with_flags = 0
total_flags = 0

for doc in docs:
    types[doc.get("doc_type", "?")] += 1
    flags = doc.get("ground_truth", {}).get("flags", [])
    if flags:
        with_flags += 1
        total_flags += len(flags)
        flag_cats.update(flag.get("category", "?") for flag in flags)

doc_total = len(docs)
print("=== DISTRIBUTION ===")
# This loop only runs when at least one document exists, so doc_total > 0 here.
for t, c in sorted(types.items()):
    print(f"  {t:<20}: {c:>4} ({100*c/doc_total:.0f}%)")
print()
# Guard the percentage so an empty dataset reports 0% instead of dividing by zero.
flagged_pct = 100 * with_flags / doc_total if doc_total else 0
print(f"Docs with anomaly flags: {with_flags}/{doc_total} ({flagged_pct:.0f}%)")
print(f"Total flags: {total_flags}")
print("Flag categories:")
for cat, cnt in sorted(flag_cats.items()):
    print(f"  {cat:<25}: {cnt}")
print()

# Validate every doc
# Required keys, listed in the order their errors are reported.
required_gt_keys = ("common", "flags", "confidence_score")
required_common_keys = ("document_type", "total_amount")

errors = 0
if not docs:
    # An empty dataset must not be reported as ready for upload.
    print("  ERROR: no documents found")
    errors += 1

for i, doc in enumerate(docs):
    if "raw_text" not in doc:
        print(f"  ERROR doc[{i}]: missing raw_text")
        errors += 1
    if "ground_truth" not in doc:
        print(f"  ERROR doc[{i}]: missing ground_truth")
        errors += 1
        continue
    gt = doc["ground_truth"]
    if not isinstance(gt, dict):
        # e.g. a JSON null; report it rather than crash on the key checks.
        print(f"  ERROR doc[{i}]: ground_truth is not an object")
        errors += 1
        continue
    for key in required_gt_keys:
        if key not in gt:
            print(f"  ERROR doc[{i}]: missing {key}")
            errors += 1
    # A missing "common" also yields the two nested-key errors below.
    common = gt.get("common", {})
    if not isinstance(common, dict):
        print(f"  ERROR doc[{i}]: common is not an object")
        errors += 1
        continue
    for key in required_common_keys:
        if key not in common:
            print(f"  ERROR doc[{i}]: missing {key}")
            errors += 1

if errors == 0:
    # Report the actual number of documents checked, not a fixed count.
    print(f"=== VALIDATION: ✅ ALL {len(docs)} DOCS PASS ===")
    print("File is READY for Kaggle upload!")
else:
    print(f"=== VALIDATION: ❌ {errors} ERRORS FOUND ===")