# run_all_samples.py
"""Run the bill extractor against every sample PDF and save the results.

Each sample URL is POSTed to the app's ``/extract-bill-data`` endpoint through
FastAPI's ``TestClient``. Every successful response is written to
``results/<sample file name>.json`` next to this script, and a per-sample
summary is printed at the end.
"""
import json
import os
from pathlib import Path

from fastapi.testclient import TestClient

from app import app

# -----------------------------
# HuggingFace dataset URLs
# -----------------------------
SAMPLE_BASE_URL = (
    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main"
)
SAMPLE_COUNT = 15

# train_sample_1.pdf ... train_sample_15.pdf, in numeric order.
SAMPLE_URLS = [
    f"{SAMPLE_BASE_URL}/train_sample_{index}.pdf"
    for index in range(1, SAMPLE_COUNT + 1)
]

BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "results"
RESULTS_DIR.mkdir(exist_ok=True)

client = TestClient(app)


def main():
    """Process every URL in ``SAMPLE_URLS`` and print a summary.

    For each sample the summary maps the file name to one of:

    * the ``data.total_item_count`` value from the response,
    * ``"API_ERROR"`` when the endpoint did not answer with HTTP 200,
    * ``"MALFORMED_RESPONSE"`` when the JSON body lacks
      ``data.total_item_count``.

    A failing sample never aborts the run; the remaining samples are still
    processed and the summary is always printed.
    """
    results = {}

    print("\n🚀 Running extractor on HuggingFace dataset...\n")

    for url in SAMPLE_URLS:
        fname = url.split("/")[-1]
        print(f"🔍 Processing: {fname}")

        resp = client.post("/extract-bill-data", json={"document": url})

        if resp.status_code != 200:
            print(f"❌ Error: {resp.status_code}")
            results[fname] = "API_ERROR"
            continue

        data = resp.json()

        # Persist the raw payload first so it can be inspected even when it
        # does not have the expected shape.
        out_file = RESULTS_DIR / f"{fname}.json"
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

        try:
            item_count = data["data"]["total_item_count"]
        except (KeyError, TypeError):
            # An unexpected payload must not take down the whole batch.
            print("❌ Error: response is missing data.total_item_count")
            print(f"   → Saved: {out_file}")
            results[fname] = "MALFORMED_RESPONSE"
            continue

        results[fname] = item_count

        print(f"   → Saved: {out_file}")
        print(f"   → Items Extracted: {item_count}")

    print("\n📊 FINAL SUMMARY:")
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()