redhairedshanks1 committed on
Commit
7b72f28
·
1 Parent(s): 5c95fcf

Update utilities/extract_tables.py

Browse files
Files changed (1) hide show
  1. utilities/extract_tables.py +20 -7
utilities/extract_tables.py CHANGED
@@ -27,23 +27,36 @@
27
  import os
28
  import requests
29
 
30
- EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables" # Replace with your space URL
 
31
 
32
  def extract_tables_remote(state):
33
- filename = state["filename"]
34
- path = state["temp_files"][filename]
 
35
 
36
- # Extract filename with extension from the path
37
- filename_with_extension = os.path.basename(path)
38
 
 
 
 
 
 
 
 
 
 
39
  with open(path, "rb") as f:
40
- files = {"file": (filename_with_extension, f, "application/pdf")}
41
  data = {
42
- "filename": filename_with_extension, # Use filename with extension
43
  "start_page": state.get("start_page", 1),
44
  "end_page": state.get("end_page", 1),
45
  }
46
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
 
 
47
  resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)
48
 
49
  if resp.status_code != 200:
 
27
  import os
28
  import requests
29
 
30
+ # Hardcoded API URL - DO NOT CHANGE
31
+ EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables"
32
 
33
  def extract_tables_remote(state):
34
+ """
35
+ Extract tables from documents via API.
36
+ All file types are sent to the API (no local processing).
37
 
38
+ Args:
39
+ state: Dict with 'filename', 'temp_files', 'start_page', 'end_page'
40
 
41
+ Returns:
42
+ state: Dict with 'tables' key containing list of extracted tables
43
+ """
44
+ filename = state["filename"]
45
+ path = state["temp_files"][filename]
46
+
47
+ if not os.path.exists(path):
48
+ raise RuntimeError(f"File not found: {path}")
49
+
50
  with open(path, "rb") as f:
51
+ files = {"file": (filename, f, "application/pdf")}
52
  data = {
53
+ "filename": filename,
54
  "start_page": state.get("start_page", 1),
55
  "end_page": state.get("end_page", 1),
56
  }
57
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
58
+
59
+ # Call API and wait for response
60
  resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)
61
 
62
  if resp.status_code != 200: