Spaces:
Sleeping
Sleeping
Commit
·
7b72f28
1
Parent(s):
5c95fcf
Update utilities/extract_tables.py
Browse files
- utilities/extract_tables.py +20 -7
utilities/extract_tables.py
CHANGED
|
@@ -27,23 +27,36 @@
|
|
| 27 |
import os
|
| 28 |
import requests
|
| 29 |
|
| 30 |
-
|
|
|
|
| 31 |
|
| 32 |
def extract_tables_remote(state):
|
| 33 |
-
|
| 34 |
-
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
with open(path, "rb") as f:
|
| 40 |
-
files = {"file": (
|
| 41 |
data = {
|
| 42 |
-
"filename":
|
| 43 |
"start_page": state.get("start_page", 1),
|
| 44 |
"end_page": state.get("end_page", 1),
|
| 45 |
}
|
| 46 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
|
|
|
|
|
|
| 47 |
resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)
|
| 48 |
|
| 49 |
if resp.status_code != 200:
|
|
|
|
| 27 |
import os
|
| 28 |
import requests
|
| 29 |
|
| 30 |
+
# Hardcoded API URL - DO NOT CHANGE
|
| 31 |
+
EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables"
|
| 32 |
|
| 33 |
def extract_tables_remote(state):
|
| 34 |
+
"""
|
| 35 |
+
Extract tables from documents via API.
|
| 36 |
+
All file types are sent to the API (no local processing).
|
| 37 |
|
| 38 |
+
Args:
|
| 39 |
+
state: Dict with 'filename', 'temp_files', 'start_page', 'end_page'
|
| 40 |
|
| 41 |
+
Returns:
|
| 42 |
+
state: Dict with 'tables' key containing list of extracted tables
|
| 43 |
+
"""
|
| 44 |
+
filename = state["filename"]
|
| 45 |
+
path = state["temp_files"][filename]
|
| 46 |
+
|
| 47 |
+
if not os.path.exists(path):
|
| 48 |
+
raise RuntimeError(f"File not found: {path}")
|
| 49 |
+
|
| 50 |
with open(path, "rb") as f:
|
| 51 |
+
files = {"file": (filename, f, "application/pdf")}
|
| 52 |
data = {
|
| 53 |
+
"filename": filename,
|
| 54 |
"start_page": state.get("start_page", 1),
|
| 55 |
"end_page": state.get("end_page", 1),
|
| 56 |
}
|
| 57 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
| 58 |
+
|
| 59 |
+
# Call API and wait for response
|
| 60 |
resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)
|
| 61 |
|
| 62 |
if resp.status_code != 200:
|