Spaces:
Sleeping
Sleeping
Commit
·
37b146d
1
Parent(s):
d68f17d
Update utilities/extract_text.py
Browse files
- utilities/extract_text.py +13 -4
utilities/extract_text.py
CHANGED
|
@@ -49,10 +49,14 @@ def extract_text_remote(state):
|
|
| 49 |
# Get file size for debugging
|
| 50 |
file_size = os.path.getsize(path)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
print(f"\n{'='*60}")
|
| 53 |
print(f"π EXTRACT TEXT API CALL")
|
| 54 |
print(f"{'='*60}")
|
| 55 |
print(f"File: {filename}")
|
|
|
|
| 56 |
print(f"Path: {path}")
|
| 57 |
print(f"Size: {file_size} bytes")
|
| 58 |
print(f"Start Page: {state.get('start_page', 1)}")
|
|
@@ -61,16 +65,21 @@ def extract_text_remote(state):
|
|
| 61 |
print(f"Auth Token: {'β Set' if os.getenv('HUGGINGFACE_API_TOKEN') else 'β Not Set'}")
|
| 62 |
|
| 63 |
with open(path, "rb") as f:
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
| 65 |
data = {
|
| 66 |
-
"filename": filename,
|
| 67 |
-
"start_page": state.get("start_page", 1),
|
| 68 |
-
"end_page": state.get("end_page", 1)
|
| 69 |
}
|
| 70 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
| 71 |
|
| 72 |
print(f"\nπ Sending request to API...")
|
|
|
|
| 73 |
print(f"Data params: {data}")
|
|
|
|
| 74 |
|
| 75 |
# Call API and wait for response
|
| 76 |
resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)
|
|
|
|
| 49 |
# Get file size for debugging
|
| 50 |
file_size = os.path.getsize(path)
|
| 51 |
|
| 52 |
+
# Extract just the filename (not full path) to match curl format
|
| 53 |
+
file_basename = os.path.basename(path)
|
| 54 |
+
|
| 55 |
print(f"\n{'='*60}")
|
| 56 |
print(f"π EXTRACT TEXT API CALL")
|
| 57 |
print(f"{'='*60}")
|
| 58 |
print(f"File: {filename}")
|
| 59 |
+
print(f"Basename: {file_basename}")
|
| 60 |
print(f"Path: {path}")
|
| 61 |
print(f"Size: {file_size} bytes")
|
| 62 |
print(f"Start Page: {state.get('start_page', 1)}")
|
|
|
|
| 65 |
print(f"Auth Token: {'β Set' if os.getenv('HUGGINGFACE_API_TOKEN') else 'β Not Set'}")
|
| 66 |
|
| 67 |
with open(path, "rb") as f:
|
| 68 |
+
# IMPORTANT: Use basename for the file tuple (matches curl format)
|
| 69 |
+
files = {"file": (file_basename, f, "application/pdf")}
|
| 70 |
+
|
| 71 |
+
# IMPORTANT: Convert page numbers to strings (matches curl -F format)
|
| 72 |
data = {
|
| 73 |
+
"filename": file_basename, # Just filename, not full path
|
| 74 |
+
"start_page": str(state.get("start_page", 1)), # String, not int
|
| 75 |
+
"end_page": str(state.get("end_page", 1)) # String, not int
|
| 76 |
}
|
| 77 |
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
|
| 78 |
|
| 79 |
print(f"\nπ Sending request to API...")
|
| 80 |
+
print(f"File tuple: ('file', ('{file_basename}', <binary>, 'application/pdf'))")
|
| 81 |
print(f"Data params: {data}")
|
| 82 |
+
print(f"Data types: start_page={type(data['start_page'])}, end_page={type(data['end_page'])}")
|
| 83 |
|
| 84 |
# Call API and wait for response
|
| 85 |
resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)
|