Datathon / run_all_samples.py
Sathvik-kota's picture
Upload folder using huggingface_hub
64da886 verified
# run_all_samples.py
import json
import os
from pathlib import Path
from fastapi.testclient import TestClient
from app import app
# -----------------------------
# HuggingFace dataset URLs
# -----------------------------
# The samples are train_sample_1.pdf .. train_sample_15.pdf, all located in
# the same dataset folder, so the list is generated instead of spelled out.
SAMPLE_URLS = [
    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/"
    f"train_sample_{sample_no}.pdf"
    for sample_no in range(1, 16)
]
# Responses are saved in a "results" folder next to this script; the folder
# is created on import if it does not exist yet.
BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Test client wrapping the imported `app`; main() sends its requests through it.
client = TestClient(app)
def main():
    """Run the bill extractor on every sample PDF and print a summary.

    For each URL in ``SAMPLE_URLS`` this posts ``{"document": url}`` to the
    ``/extract-bill-data`` endpoint through the module-level ``client``,
    writes the JSON response to ``RESULTS_DIR/<pdf name>.json`` and records
    the ``total_item_count`` found under the response's ``data`` key.  A
    failure on one sample is recorded and the loop moves on, so the final
    summary is always printed.

    Returns:
        dict: Maps each sample file name to its item count, or to one of the
        marker strings ``"API_ERROR"`` (status code other than 200),
        ``"INVALID_JSON"`` (response body could not be decoded as JSON) or
        ``"MISSING_ITEM_COUNT"`` (payload has no ``data.total_item_count``).
    """
    results = {}
    print("\n🚀 Running extractor on HuggingFace dataset...\n")

    for url in SAMPLE_URLS:
        # The last path segment of the URL is the PDF file name.
        fname = url.rsplit("/", 1)[-1]
        print(f"🔍 Processing: {fname}")

        resp = client.post("/extract-bill-data", json={"document": url})
        if resp.status_code != 200:
            print(f"❌ Error: {resp.status_code}")
            results[fname] = "API_ERROR"
            continue

        # A 200 response with a non-JSON body must not abort the whole batch.
        try:
            data = resp.json()
        except ValueError:
            print("❌ Error: response body is not valid JSON")
            results[fname] = "INVALID_JSON"
            continue

        # Persist the raw response first so it can be inspected even when the
        # expected summary field turns out to be missing.
        out_file = RESULTS_DIR / f"{fname}.json"
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f" → Saved: {out_file}")

        # KeyError: a key is absent; TypeError: the payload (or its "data"
        # entry) is not a mapping at all.
        try:
            item_count = data["data"]["total_item_count"]
        except (KeyError, TypeError):
            print("❌ Error: response has no data.total_item_count")
            results[fname] = "MISSING_ITEM_COUNT"
            continue

        results[fname] = item_count
        print(f" → Items Extracted: {item_count}")

    print("\n📊 FINAL SUMMARY:")
    print(json.dumps(results, indent=2))
    return results


if __name__ == "__main__":
    main()