Sathvik-kota committed on
Commit
64da886
·
verified ·
1 Parent(s): 75c6ae4

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +3 -6
  2. run_all_samples.py +45 -125
app.py CHANGED
@@ -752,10 +752,7 @@ def health_check():
752
  from fastapi import BackgroundTasks
753
 
754
  @app.get("/run-all-samples")
755
- async def run_all_samples_endpoint():
756
  import run_all_samples
757
- run_all_samples.main() # RUN DIRECTLY, NOT BACKGROUND
758
- return {
759
- "status": "completed",
760
- "message": "All samples processed. Check results/ folder."
761
- }
 
752
  from fastapi import BackgroundTasks
753
 
754
  @app.get("/run-all-samples")
755
+ async def run_all_samples():
756
  import run_all_samples
757
+ run_all_samples.main()
758
+ return {"status": "done", "results_ready": True}
 
 
 
run_all_samples.py CHANGED
@@ -1,150 +1,70 @@
1
- # run_all_samples.py (with logging)
2
- import os
3
  import json
 
4
  from pathlib import Path
5
- from datetime import datetime
6
-
7
  from fastapi.testclient import TestClient
8
- from app import app as fastapi_app # FIXED IMPORT
9
-
10
- from fastapi import HTTPException
11
- from fastapi.responses import PlainTextResponse
12
-
13
- # List files in samples/
14
- @app.get("/debug-list-samples")
15
- def debug_list_samples():
16
- base = Path(__file__).parent
17
- samples = base / "samples"
18
- if not samples.exists():
19
- raise HTTPException(status_code=404, detail="samples/ folder not found")
20
- items = []
21
- for p in sorted(samples.rglob("*")):
22
- if p.is_file():
23
- items.append(str(p.relative_to(base)))
24
- return {"count": len(items), "files": items}
25
-
26
- # List files in results/ (what we expect run_all_samples to write)
27
- @app.get("/debug-list-results")
28
- def debug_list_results():
29
- base = Path(__file__).parent
30
- results = base / "results"
31
- if not results.exists():
32
- return {"count": 0, "files": []}
33
- items = []
34
- for p in sorted(results.rglob("*")):
35
- if p.is_file():
36
- items.append(str(p.relative_to(base)))
37
- return {"count": len(items), "files": items}
38
-
39
- # Return tail of run_all_samples.log (first/last lines) as plain text
40
- @app.get("/debug-get-log", response_class=PlainTextResponse)
41
- def debug_get_log(lines: int = 200):
42
- base = Path(__file__).parent
43
- log_file = base / "run_all_samples.log"
44
- if not log_file.exists():
45
- return "NO LOG FILE"
46
- try:
47
- text = log_file.read_text(encoding="utf-8", errors="ignore")
48
- parts = text.splitlines()
49
- # return last `lines` lines
50
- out = "\n".join(parts[-lines:])
51
- return out
52
- except Exception as e:
53
- return f"ERROR reading log: {e}"
54
-
55
-
56
- # ----------------------------------------------------
57
- # Paths
58
- # ----------------------------------------------------
59
  BASE_DIR = Path(__file__).parent
60
- SAMPLES_DIR = BASE_DIR / "samples"
61
  RESULTS_DIR = BASE_DIR / "results"
62
- LOG_FILE = BASE_DIR / "run_all_samples.log"
63
-
64
  RESULTS_DIR.mkdir(exist_ok=True)
65
 
66
- # ----------------------------------------------------
67
- # Logging helper
68
- # ----------------------------------------------------
69
- def log(msg: str):
70
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
71
- with open(LOG_FILE, "a", encoding="utf-8") as f:
72
- f.write(f"[{timestamp}] {msg}\n")
73
-
74
-
75
- # ----------------------------------------------------
76
- # File filter
77
- # ----------------------------------------------------
78
- def is_bill_file(path: Path):
79
- return path.suffix.lower() in [".pdf", ".png", ".jpg", ".jpeg"]
80
 
81
 
82
- # ----------------------------------------------------
83
- # Main extraction function
84
- # ----------------------------------------------------
85
  def main():
86
- log("========== RUN STARTED ==========")
87
- log(f"Samples directory: {SAMPLES_DIR}")
88
 
89
- if not SAMPLES_DIR.exists():
90
- log("ERROR: samples/ folder does not exist!")
91
- return
92
 
93
- client = TestClient(fastapi_app)
 
94
 
95
- summary = {}
96
 
97
- for fp in sorted(SAMPLES_DIR.rglob("*")):
98
- if not fp.is_file() or not is_bill_file(fp):
99
- continue
100
-
101
- log(f"Processing file: {fp.name}")
102
 
103
- file_url = f"file://{fp.resolve()}"
104
-
105
- try:
106
- res = client.post(
107
- "/extract-bill-data",
108
- json={"document": file_url},
109
- timeout=180
110
- )
111
- except Exception as e:
112
- log(f"ERROR calling extractor: {e}")
113
- summary[fp.name] = "EXTRACTOR_ERROR"
114
  continue
115
 
116
- if res.status_code != 200:
117
- log(f"ERROR API returned status {res.status_code}")
118
- summary[fp.name] = "API_ERROR"
119
- continue
120
-
121
- data = res.json()
122
-
123
- out_path = RESULTS_DIR / (fp.stem + ".json")
124
- try:
125
- with open(out_path, "w", encoding="utf-8") as f:
126
- json.dump(data, f, indent=2, ensure_ascii=False)
127
- log(f"Saved result JSON: {out_path}")
128
- except Exception as e:
129
- log(f"ERROR writing output JSON: {e}")
130
- continue
131
 
132
- try:
133
- total_items = data["data"]["total_item_count"]
134
- except:
135
- total_items = "N/A"
136
 
137
- summary[fp.name] = total_items
138
- log(f"Items Extracted: {total_items}")
139
 
140
- # Summary
141
- log("======== SUMMARY ========")
142
- for fn, items in summary.items():
143
- log(f"{fn}: {items}")
144
 
145
- log("========== RUN FINISHED ==========\n")
 
146
 
147
 
148
- # Run manually if executed directly
149
  if __name__ == "__main__":
150
  main()
 
1
+ # run_all_samples.py
 
2
  import json
3
+ import os
4
  from pathlib import Path
 
 
5
  from fastapi.testclient import TestClient
6
+ from app import app
7
+
8
+ # -----------------------------
9
+ # HuggingFace dataset URLs
10
+ # -----------------------------
11
+ SAMPLE_URLS = [
12
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_1.pdf",
13
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_2.pdf",
14
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_3.pdf",
15
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_4.pdf",
16
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_5.pdf",
17
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_6.pdf",
18
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_7.pdf",
19
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_8.pdf",
20
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_9.pdf",
21
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_10.pdf",
22
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_11.pdf",
23
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_12.pdf",
24
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_13.pdf",
25
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_14.pdf",
26
+ "https://huggingface.co/datasets/Sathvik-kota/samples/resolve/main/train_sample_15.pdf",
27
+ ]
28
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  BASE_DIR = Path(__file__).parent
 
30
  RESULTS_DIR = BASE_DIR / "results"
 
 
31
  RESULTS_DIR.mkdir(exist_ok=True)
32
 
33
+ client = TestClient(app)
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
 
 
 
36
  def main():
37
+ results = {}
 
38
 
39
+ print("\n🚀 Running extractor on HuggingFace dataset...\n")
 
 
40
 
41
+ for url in SAMPLE_URLS:
42
+ fname = url.split("/")[-1]
43
 
44
+ print(f"🔍 Processing: {fname}")
45
 
46
+ resp = client.post("/extract-bill-data", json={"document": url})
 
 
 
 
47
 
48
+ if resp.status_code != 200:
49
+ print(f"❌ Error: {resp.status_code}")
50
+ results[fname] = "API_ERROR"
 
 
 
 
 
 
 
 
51
  continue
52
 
53
+ data = resp.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ out_file = RESULTS_DIR / f"{fname}.json"
56
+ with open(out_file, "w", encoding="utf-8") as f:
57
+ json.dump(data, f, indent=2)
 
58
 
59
+ item_count = data["data"]["total_item_count"]
 
60
 
61
+ results[fname] = item_count
62
+ print(f" Saved: {out_file}")
63
+ print(f" → Items Extracted: {item_count}")
 
64
 
65
+ print("\n📊 FINAL SUMMARY:")
66
+ print(json.dumps(results, indent=2))
67
 
68
 
 
69
  if __name__ == "__main__":
70
  main()