redhairedshanks1 committed on
Commit
5c95fcf
·
1 Parent(s): 4ee1ff0

Update utilities/extract_text.py

Browse files
Files changed (1) hide show
  1. utilities/extract_text.py +19 -14
utilities/extract_text.py CHANGED
@@ -26,31 +26,36 @@
26
  import os
27
  import requests
28
 
29
- EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text" # Replace with your space URL
 
30
 
31
  def extract_text_remote(state):
32
- filename = state["filename"]
33
- path = state["temp_files"][filename]
 
34
 
35
- # Get the file extension
36
- _, file_extension = os.path.splitext(filename)
37
-
38
- # If the filename in state doesn't have extension, check if it's in the temp_files path
39
- if not file_extension:
40
- # Try to get extension from the actual file path
41
- _, file_extension = os.path.splitext(path)
42
- if file_extension:
43
- # Add extension to filename
44
- filename = f"{filename}{file_extension}"
45
 
 
 
 
 
 
 
 
 
 
46
  with open(path, "rb") as f:
47
  files = {"file": (filename, f, "application/pdf")}
48
  data = {
49
- "filename": filename, # Now includes extension
50
  "start_page": state.get("start_page", 1),
51
  "end_page": state.get("end_page", 1)
52
  }
53
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
 
 
54
  resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
55
 
56
  if resp.status_code != 200:
 
26
  import os
27
  import requests
28
 
29
+ # Hardcoded API URL - DO NOT CHANGE
30
+ EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text"
31
 
32
  def extract_text_remote(state):
33
+ """
34
+ Extract text from documents via API.
35
+ All file types are sent to the API (no local processing).
36
 
37
+ Args:
38
+ state: Dict with 'filename', 'temp_files', 'start_page', 'end_page'
 
 
 
 
 
 
 
 
39
 
40
+ Returns:
41
+ state: Dict with 'text' key containing extracted text
42
+ """
43
+ filename = state["filename"]
44
+ path = state["temp_files"][filename]
45
+
46
+ if not os.path.exists(path):
47
+ raise RuntimeError(f"File not found: {path}")
48
+
49
  with open(path, "rb") as f:
50
  files = {"file": (filename, f, "application/pdf")}
51
  data = {
52
+ "filename": filename,
53
  "start_page": state.get("start_page", 1),
54
  "end_page": state.get("end_page", 1)
55
  }
56
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
57
+
58
+ # Call API and wait for response
59
  resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
60
 
61
  if resp.status_code != 200: