Spaces:
Sleeping
Sleeping
Commit
·
5c95fcf
1
Parent(s):
4ee1ff0
Update utilities/extract_text.py
Browse files
- utilities/extract_text.py +19 -14
utilities/extract_text.py
CHANGED
|
@@ -26,31 +26,36 @@
|
|
| 26 |
import os
|
| 27 |
import requests
|
| 28 |
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
def extract_text_remote(state):
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# If the filename in state doesn't have an extension, check whether the temp_files path has one
|
| 39 |
-
if not file_extension:
|
| 40 |
-
# Try to get extension from the actual file path
|
| 41 |
-
_, file_extension = os.path.splitext(path)
|
| 42 |
-
if file_extension:
|
| 43 |
-
# Add extension to filename
|
| 44 |
-
filename = f"{filename}{file_extension}"
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
with open(path, "rb") as f:
|
| 47 |
files = {"file": (filename, f, "application/pdf")}
|
| 48 |
data = {
|
| 49 |
-
"filename": filename,
|
| 50 |
"start_page": state.get("start_page", 1),
|
| 51 |
"end_page": state.get("end_page", 1)
|
| 52 |
}
|
| 53 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
|
|
|
|
|
|
| 54 |
resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
|
| 55 |
|
| 56 |
if resp.status_code != 200:
|
|
|
|
| 26 |
import os
|
| 27 |
import requests
|
| 28 |
|
| 29 |
+
# Hardcoded API URL - DO NOT CHANGE
|
| 30 |
+
EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text"
|
| 31 |
|
| 32 |
def extract_text_remote(state):
|
| 33 |
+
"""
|
| 34 |
+
Extract text from documents via API.
|
| 35 |
+
All file types are sent to the API (no local processing).
|
| 36 |
|
| 37 |
+
Args:
|
| 38 |
+
state: Dict with 'filename', 'temp_files', 'start_page', 'end_page'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
Returns:
|
| 41 |
+
state: Dict with 'text' key containing extracted text
|
| 42 |
+
"""
|
| 43 |
+
filename = state["filename"]
|
| 44 |
+
path = state["temp_files"][filename]
|
| 45 |
+
|
| 46 |
+
if not os.path.exists(path):
|
| 47 |
+
raise RuntimeError(f"File not found: {path}")
|
| 48 |
+
|
| 49 |
with open(path, "rb") as f:
|
| 50 |
files = {"file": (filename, f, "application/pdf")}
|
| 51 |
data = {
|
| 52 |
+
"filename": filename,
|
| 53 |
"start_page": state.get("start_page", 1),
|
| 54 |
"end_page": state.get("end_page", 1)
|
| 55 |
}
|
| 56 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
| 57 |
+
|
| 58 |
+
# Call API and wait for response
|
| 59 |
resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
|
| 60 |
|
| 61 |
if resp.status_code != 200:
|