"""Client for the remote table-extraction API."""

import os

import requests

# Hardcoded API URL - DO NOT CHANGE
EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables"

# Seconds to wait for the API before giving up (5 minutes, for large files).
_REQUEST_TIMEOUT_SECONDS = 300

_BANNER = "=" * 60


def extract_tables_remote(state):
    """Extract tables from a document by uploading it to the remote API.

    Every file is sent to the API; nothing is processed locally. ``state`` is
    mutated in place (the ``"tables"`` key is set) and also returned.

    Args:
        state: Dict with ``"filename"`` and ``"temp_files"`` (a mapping from
            filename to a path on disk), plus optional ``"start_page"`` and
            ``"end_page"`` (both default to 1).

    Returns:
        The same ``state`` dict, with ``state["tables"]`` holding the value of
        the response's ``"result"`` key (or ``"tables"`` as a fallback), or an
        empty list when neither is present.

    Raises:
        KeyError: If ``"filename"`` is missing from ``state`` or has no entry
            in ``state["temp_files"]``.
        RuntimeError: If the file does not exist, the request fails or times
            out, the API answers with a non-200 status, or the response body
            is not a JSON object.
    """
    filename = state["filename"]
    path = state["temp_files"][filename]

    if not os.path.exists(path):
        raise RuntimeError(f"File not found: {path}")

    # Send only the basename (not the full path) to match the curl format.
    file_basename = os.path.basename(path)
    file_size = os.path.getsize(path)

    print(f"\n{_BANNER}")
    print("📊 EXTRACT TABLES API CALL")
    print(_BANNER)
    print(f"File: {filename}")
    print(f"Basename: {file_basename}")
    print(f"Size: {file_size} bytes")
    print(f"API URL: {EXTRACT_TABLES_API}")

    # Page numbers are sent as strings to match what `curl -F` would send.
    data = {
        "filename": file_basename,
        "start_page": str(state.get("start_page", 1)),
        "end_page": str(state.get("end_page", 1)),
    }

    # Only attach the Authorization header when a token is configured;
    # otherwise the literal string "Bearer None" would be sent.
    token = os.getenv("HUGGINGFACE_API_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    print("🚀 Sending request...")
    print(f"Data params: {data}")

    # The file only needs to stay open for the duration of the upload.
    with open(path, "rb") as f:
        # NOTE(review): the upload is always labelled application/pdf, even
        # though any file type may be passed in - confirm the API ignores it.
        files = {"file": (file_basename, f, "application/pdf")}
        try:
            resp = requests.post(
                EXTRACT_TABLES_API,
                files=files,
                data=data,
                headers=headers,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
        except requests.exceptions.Timeout as e:
            print("❌ Request timed out after 300 seconds")
            raise RuntimeError(
                "Table extraction API timed out after 5 minutes"
            ) from e
        except requests.exceptions.RequestException as e:
            print(f"❌ Request exception: {e}")
            raise RuntimeError(f"API request failed: {e}") from e

    print("\n📄 API Response:")
    print(f"Status Code: {resp.status_code}")

    if resp.status_code != 200:
        print(f"❌ Error: {resp.text[:500]}")
        raise RuntimeError(
            f"Extract tables API failed with status {resp.status_code}: {resp.text}"
        )

    # Keep the try body minimal: only JSON decoding can fail here.
    try:
        response_json = resp.json()
    except ValueError as e:
        print(f"❌ Error parsing response: {e}")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(f"Failed to parse API response: {e}") from e

    if not isinstance(response_json, dict):
        print(f"❌ Error parsing response: expected a JSON object")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(
            "Failed to parse API response: expected a JSON object, "
            f"got {type(response_json).__name__}"
        )

    print(f"Response keys: {list(response_json.keys())}")

    # The API returns {"status": "completed", "result": [...tables...],
    # "process_id": "..."}; "tables" is kept as a fallback key. An explicit
    # null or empty value for both normalises to an empty list.
    tables = response_json.get("result") or response_json.get("tables") or []

    print("\n📊 Extraction Result:")
    print(f"API Status: {response_json.get('status', 'unknown')}")
    print(f"Process ID: {response_json.get('process_id', 'none')}")
    print(f"Tables found: {len(tables) if isinstance(tables, list) else 0}")
    print(f"{_BANNER}\n")

    state["tables"] = tables
    return state