Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Community

Sathvik-kota committed on Nov 30, 2025

Commit

64da886

verified ·

1 Parent(s): 75c6ae4

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +3 -6
run_all_samples.py +45 -125

app.py CHANGED Viewed

@@ -752,10 +752,7 @@ def health_check():
 from fastapi import BackgroundTasks
 @app.get("/run-all-samples")
-async def run_all_samples_endpoint():
     import run_all_samples
-    run_all_samples.main()  # RUN DIRECTLY, NOT BACKGROUND
-    return {
-        "status": "completed",
-        "message": "All samples processed. Check results/ folder."
-    }

 from fastapi import BackgroundTasks
 @app.get("/run-all-samples")
+async def run_all_samples():
     import run_all_samples
+    run_all_samples.main()
+    return {"status": "done", "results_ready": True}

run_all_samples.py CHANGED Viewed

@@ -1,150 +1,70 @@
-# run_all_samples.py (with logging)
-import os
 import json
 from pathlib import Path
-from datetime import datetime
 from fastapi.testclient import TestClient
-from app import app as fastapi_app  # FIXED IMPORT
-from fastapi import HTTPException
-from fastapi.responses import PlainTextResponse
-# List files in samples/
-@app.get("/debug-list-samples")
-def debug_list_samples():
-    base = Path(__file__).parent
-    samples = base / "samples"
-    if not samples.exists():
-        raise HTTPException(status_code=404, detail="samples/ folder not found")
-    items = []
-    for p in sorted(samples.rglob("*")):
-        if p.is_file():
-            items.append(str(p.relative_to(base)))
-    return {"count": len(items), "files": items}
-# List files in results/ (what we expect run_all_samples to write)
-@app.get("/debug-list-results")
-def debug_list_results():
-    base = Path(__file__).parent
-    results = base / "results"
-    if not results.exists():
-        return {"count": 0, "files": []}
-    items = []
-    for p in sorted(results.rglob("*")):
-        if p.is_file():
-            items.append(str(p.relative_to(base)))
-    return {"count": len(items), "files": items}
-# Return tail of run_all_samples.log (first/last lines) as plain text
-@app.get("/debug-get-log", response_class=PlainTextResponse)
-def debug_get_log(lines: int = 200):
-    base = Path(__file__).parent
-    log_file = base / "run_all_samples.log"
-    if not log_file.exists():
-        return "NO LOG FILE"
-    try:
-        text = log_file.read_text(encoding="utf-8", errors="ignore")
-        parts = text.splitlines()
-        # return last `lines` lines
-        out = "\n".join(parts[-lines:])
-        return out
-    except Exception as e:
-        return f"ERROR reading log: {e}"
-# ----------------------------------------------------
-# Paths
-# ----------------------------------------------------
 BASE_DIR = Path(__file__).parent
-SAMPLES_DIR = BASE_DIR / "samples"
 RESULTS_DIR = BASE_DIR / "results"
-LOG_FILE = BASE_DIR / "run_all_samples.log"
 RESULTS_DIR.mkdir(exist_ok=True)
-# ----------------------------------------------------
-# Logging helper
-# ----------------------------------------------------
-def log(msg: str):
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    with open(LOG_FILE, "a", encoding="utf-8") as f:
-        f.write(f"[{timestamp}] {msg}\n")
-# ----------------------------------------------------
-# File filter
-# ----------------------------------------------------
-def is_bill_file(path: Path):
-    return path.suffix.lower() in [".pdf", ".png", ".jpg", ".jpeg"]
-# ----------------------------------------------------
-# Main extraction function
-# ----------------------------------------------------
 def main():
-    log("========== RUN STARTED ==========")
-    log(f"Samples directory: {SAMPLES_DIR}")
-    if not SAMPLES_DIR.exists():
-        log("ERROR: samples/ folder does not exist!")
-        return
-    client = TestClient(fastapi_app)
-    summary = {}
-    for fp in sorted(SAMPLES_DIR.rglob("*")):
-        if not fp.is_file() or not is_bill_file(fp):
-            continue
-        log(f"Processing file: {fp.name}")
-        file_url = f"file://{fp.resolve()}"
-        try:
-            res = client.post(
-                "/extract-bill-data",
-                json={"document": file_url},
-                timeout=180
-            )
-        except Exception as e:
-            log(f"ERROR calling extractor: {e}")
-            summary[fp.name] = "EXTRACTOR_ERROR"
             continue
-        if res.status_code != 200:
-            log(f"ERROR API returned status {res.status_code}")
-            summary[fp.name] = "API_ERROR"
-            continue
-        data = res.json()
-        out_path = RESULTS_DIR / (fp.stem + ".json")
-        try:
-            with open(out_path, "w", encoding="utf-8") as f:
-                json.dump(data, f, indent=2, ensure_ascii=False)
-            log(f"Saved result JSON: {out_path}")
-        except Exception as e:
-            log(f"ERROR writing output JSON: {e}")
-            continue
-        try:
-            total_items = data["data"]["total_item_count"]
-        except:
-            total_items = "N/A"
-        summary[fp.name] = total_items
-        log(f"Items Extracted: {total_items}")
-    # Summary
-    log("======== SUMMARY ========")
-    for fn, items in summary.items():
-        log(f"{fn}: {items}")
-    log("========== RUN FINISHED ==========\n")
-# Run manually if executed directly
 if __name__ == "__main__":
     main()

+# run_all_samples.py
 import json
+import os
 from pathlib import Path
 from fastapi.testclient import TestClient
+from app import app
+# -----------------------------
+# HuggingFace dataset URLs
+# -----------------------------
+SAMPLE_URLS = [
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_1.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_2.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_3.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_4.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_5.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_6.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_7.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_8.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_9.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_10.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_11.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_12.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_13.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_14.pdf",
+    "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_15.pdf",
+]
 BASE_DIR = Path(__file__).parent
 RESULTS_DIR = BASE_DIR / "results"
 RESULTS_DIR.mkdir(exist_ok=True)
+client = TestClient(app)
 def main():
+    results = {}
+    print("\n🚀 Running extractor on HuggingFace dataset...\n")
+    for url in SAMPLE_URLS:
+        fname = url.split("/")[-1]
+        print(f"🔍 Processing: {fname}")
+        resp = client.post("/extract-bill-data", json={"document": url})
+        if resp.status_code != 200:
+            print(f"❌ Error: {resp.status_code}")
+            results[fname] = "API_ERROR"
             continue
+        data = resp.json()
+        out_file = RESULTS_DIR / f"{fname}.json"
+        with open(out_file, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        item_count = data["data"]["total_item_count"]
+        results[fname] = item_count
+        print(f"   → Saved: {out_file}")
+        print(f"   → Items Extracted: {item_count}")
+    print("\n📊 FINAL SUMMARY:")
+    print(json.dumps(results, indent=2))
 if __name__ == "__main__":
     main()