Spaces:
Sleeping
Sleeping
| # run_all_samples.py | |
| import json | |
| import os | |
| from pathlib import Path | |
| from fastapi.testclient import TestClient | |
| from app import app | |
# -----------------------------
# HuggingFace dataset URLs
# -----------------------------
# The sample PDFs are named train_sample_1.pdf .. train_sample_15.pdf and all
# live under the same dataset path, so the list is generated from the index.
SAMPLE_URLS = [
    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/"
    f"train_sample_{sample_number}.pdf"
    for sample_number in range(1, 16)
]
# Per-sample JSON responses are written to a "results" folder located next to
# this script; the folder is created on import if it does not exist yet.
BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Test client wrapping the application object imported from `app`; all
# requests issued by this script go through it.
client = TestClient(app)
def main() -> None:
    """Post each sample URL to ``/extract-bill-data`` and summarise the results.

    For every URL in ``SAMPLE_URLS`` the document URL is sent to the test
    client.  A 200 response is saved as indented JSON under ``RESULTS_DIR``
    (file name ``<pdf file name>.json``) and its ``data.total_item_count``
    value is recorded.  A failure on one sample is recorded as a sentinel
    string instead of aborting the run, so the final summary always covers
    every sample:

    * ``"API_ERROR"`` -- the endpoint answered with a non-200 status.
    * ``"INVALID_JSON"`` -- the 200 response body could not be decoded as JSON.
    * ``"MISSING_ITEM_COUNT"`` -- the JSON has no ``data.total_item_count``.

    The summary is printed to stdout as indented JSON; nothing is returned.
    """
    # NOTE(review): the leading glyphs in the messages below look like
    # mis-decoded emoji -- confirm against the original file before changing.
    results = {}
    print("\nπ Running extractor on HuggingFace dataset...\n")

    for url in SAMPLE_URLS:
        # The last path segment of the URL is the PDF file name.
        fname = url.split("/")[-1]
        print(f"π Processing: {fname}")

        resp = client.post("/extract-bill-data", json={"document": url})
        if resp.status_code != 200:
            print(f"β Error: {resp.status_code}")
            results[fname] = "API_ERROR"
            continue

        try:
            data = resp.json()
        except ValueError:
            # JSON decode errors subclass ValueError; a non-JSON body must
            # not abort the remaining samples.
            print("β Error: response body is not valid JSON")
            results[fname] = "INVALID_JSON"
            continue

        # Save the raw payload first so it can be inspected even when the
        # expected fields turn out to be missing.
        out_file = RESULTS_DIR / f"{fname}.json"
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f" β Saved: {out_file}")

        try:
            item_count = data["data"]["total_item_count"]
        except (KeyError, TypeError):
            # Payload does not have the expected {"data": {"total_item_count": ...}}
            # shape (missing key, or a non-mapping where a mapping is expected).
            print("β Error: response has no data.total_item_count")
            results[fname] = "MISSING_ITEM_COUNT"
            continue

        results[fname] = item_count
        print(f" β Items Extracted: {item_count}")

    print("\nπ FINAL SUMMARY:")
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()