Spaces:

stellar413
/

masterllm

Sleeping

App Files Files Community

redhairedshanks1 committed on Dec 15, 2025

Commit

37b146d

1 Parent(s): d68f17d

Update utilities/extract_text.py

Browse files

Files changed (1) hide show

utilities/extract_text.py +13 -4

utilities/extract_text.py CHANGED Viewed

@@ -49,10 +49,14 @@ def extract_text_remote(state):
     # Get file size for debugging
     file_size = os.path.getsize(path)
     print(f"\n{'='*60}")
     print(f"📄 EXTRACT TEXT API CALL")
     print(f"{'='*60}")
     print(f"File: {filename}")
     print(f"Path: {path}")
     print(f"Size: {file_size} bytes")
     print(f"Start Page: {state.get('start_page', 1)}")
@@ -61,16 +65,21 @@ def extract_text_remote(state):
     print(f"Auth Token: {'✓ Set' if os.getenv('HUGGINGFACE_API_TOKEN') else '✗ Not Set'}")
     with open(path, "rb") as f:
-        files = {"file": (filename, f, "application/pdf")}
         data = {
-            "filename": filename,
-            "start_page": state.get("start_page", 1),
-            "end_page": state.get("end_page", 1)
         }
         headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
         print(f"\n🚀 Sending request to API...")
         print(f"Data params: {data}")
         # Call API and wait for response
         resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)

     # Get file size for debugging
     file_size = os.path.getsize(path)
+    # Extract just the filename (not full path) to match curl format
+    file_basename = os.path.basename(path)
     print(f"\n{'='*60}")
     print(f"📄 EXTRACT TEXT API CALL")
     print(f"{'='*60}")
     print(f"File: {filename}")
+    print(f"Basename: {file_basename}")
     print(f"Path: {path}")
     print(f"Size: {file_size} bytes")
     print(f"Start Page: {state.get('start_page', 1)}")
     print(f"Auth Token: {'✓ Set' if os.getenv('HUGGINGFACE_API_TOKEN') else '✗ Not Set'}")
     with open(path, "rb") as f:
+        # IMPORTANT: Use basename for the file tuple (matches curl format)
+        files = {"file": (file_basename, f, "application/pdf")}
+        # IMPORTANT: Convert page numbers to strings (matches curl -F format)
         data = {
+            "filename": file_basename,  # Just filename, not full path
+            "start_page": str(state.get("start_page", 1)),  # String, not int
+            "end_page": str(state.get("end_page", 1))  # String, not int
         }
         headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
         print(f"\n🚀 Sending request to API...")
+        print(f"File tuple: ('file', ('{file_basename}', <binary>, 'application/pdf'))")
         print(f"Data params: {data}")
+        print(f"Data types: start_page={type(data['start_page'])}, end_page={type(data['end_page'])}")
         # Call API and wait for response
         resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)