File size: 2,668 Bytes
64da886
c62a9fc
64da886
c62a9fc
 
64da886
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c62a9fc
 
 
 
64da886
c62a9fc
 
777e3bb
64da886
777e3bb
64da886
777e3bb
64da886
 
777e3bb
64da886
777e3bb
64da886
777e3bb
64da886
 
 
777e3bb
 
64da886
777e3bb
64da886
 
 
777e3bb
64da886
777e3bb
64da886
 
 
dc56b59
64da886
 
777e3bb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# run_all_samples.py
import json
import os
from pathlib import Path
from fastapi.testclient import TestClient
from app import app

# -----------------------------
# HuggingFace dataset URLs
# -----------------------------
# The sample PDFs are numbered 1..15 and share a single URL pattern, so the
# list is generated from a template instead of being spelled out by hand.
_SAMPLE_URL_TEMPLATE = (
    "https://huggingface.co/datasets/Sathvik-kota/samples"
    "/resolve/main/train_sample_{index}.pdf"
)
_SAMPLE_COUNT = 15

SAMPLE_URLS = [
    _SAMPLE_URL_TEMPLATE.format(index=sample_no)
    for sample_no in range(1, _SAMPLE_COUNT + 1)
]

# Results are written to a "results" folder next to this script; the folder
# is created at import time if it does not exist yet.
BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Test client bound to the imported FastAPI `app`; every request in this
# script is issued through it.
client = TestClient(app=app)


def _count_items(payload):
    """Return ``payload["data"]["total_item_count"]``, or ``None`` if absent.

    Tolerates payloads that are not dicts or that lack the ``data`` object,
    so a malformed response can be reported instead of raising.
    """
    if not isinstance(payload, dict):
        return None
    inner = payload.get("data")
    if not isinstance(inner, dict):
        return None
    return inner.get("total_item_count")


def main():
    """Run the extractor endpoint on every sample URL and summarise the results.

    For each URL in ``SAMPLE_URLS`` this POSTs ``{"document": url}`` to
    ``/extract-bill-data`` through the module-level ``client``, writes the
    JSON response to ``RESULTS_DIR/<file name>.json`` and records the
    reported ``data.total_item_count``.

    A failure on one sample never aborts the batch. The sample is recorded as
    ``"API_ERROR"`` (request raised or status was not 200) or
    ``"INVALID_RESPONSE"`` (body is not JSON, or has no item count) and
    processing continues with the next URL.

    Returns:
        dict: file name -> item count, or one of the error markers above.
        The same mapping is printed as the final summary.
    """
    results = {}

    print("\n🚀 Running extractor on HuggingFace dataset...\n")

    for url in SAMPLE_URLS:
        fname = url.split("/")[-1]

        print(f"🔍 Processing: {fname}")

        try:
            resp = client.post("/extract-bill-data", json={"document": url})
        except Exception as exc:  # batch boundary: report and keep going
            # The client may re-raise exceptions thrown inside the app; one
            # bad document must not lose the results of the others.
            print(f"❌ Error: {exc!r}")
            results[fname] = "API_ERROR"
            continue

        if resp.status_code != 200:
            print(f"❌ Error: {resp.status_code}")
            results[fname] = "API_ERROR"
            continue

        try:
            data = resp.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses.
            print("❌ Error: response body is not valid JSON")
            results[fname] = "INVALID_RESPONSE"
            continue

        # Persist the raw payload even if it later turns out to be
        # incomplete, so it can be inspected.
        out_file = RESULTS_DIR / f"{fname}.json"
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"   → Saved: {out_file}")

        item_count = _count_items(data)
        if item_count is None:
            print("❌ Error: response has no data.total_item_count")
            results[fname] = "INVALID_RESPONSE"
            continue

        results[fname] = item_count
        print(f"   → Items Extracted: {item_count}")

    print("\n📊 FINAL SUMMARY:")
    print(json.dumps(results, indent=2))

    return results


if __name__ == "__main__":
    main()