Spaces:
Sleeping
Sleeping
| # import os | |
| # import requests | |
| # EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables" # Replace with your space URL | |
| # def extract_tables_remote(state): | |
| # filename = state["filename"] | |
| # path = state["temp_files"][filename] | |
| # with open(path, "rb") as f: | |
| # files = {"file": (filename, f, "application/pdf")} | |
| # data = { | |
| # "filename": filename, | |
| # "start_page": state.get("start_page", 1), | |
| # "end_page": state.get("end_page", 1), | |
| # } | |
| # headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"} | |
| # resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers) | |
| # if resp.status_code != 200: | |
| # raise RuntimeError(f"Extract tables API failed: {resp.text}") | |
| # js = resp.json() | |
| # state["tables"] = js.get("tables", js) | |
| # return state | |
| import os | |
| import requests | |
| # Hardcoded API URL - DO NOT CHANGE | |
| EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables" | |
def extract_tables_remote(state):
    """
    Extract tables from a document by uploading it to the remote tables API.

    The file is always sent to the API (no local processing). The extracted
    tables are written back onto ``state`` under the ``"tables"`` key.

    Args:
        state: Dict with 'filename' and 'temp_files' (mapping that filename to
            a path on disk), plus optional 'start_page' / 'end_page'. A page
            value that is missing or None defaults to 1.

    Returns:
        The same ``state`` dict, with 'tables' set to the response's 'result'
        field, falling back to its 'tables' field, then to an empty list.

    Raises:
        KeyError: If 'filename' or its entry in 'temp_files' is missing.
        RuntimeError: If the path is not an existing regular file, the request
            times out or fails, the API answers with a non-200 status, or the
            response body is not a JSON object.
    """
    filename = state["filename"]
    path = state["temp_files"][filename]

    # isfile (not exists): a directory path is rejected here with the same
    # RuntimeError instead of surfacing later as an OSError from open().
    if not os.path.isfile(path):
        raise RuntimeError(f"File not found: {path}")

    # Only the basename is sent, never the full local path (matches curl format).
    file_basename = os.path.basename(path)
    file_size = os.path.getsize(path)

    # NOTE(review): the leading glyphs in several log strings below look like
    # mis-encoded emoji; they are kept as found because the intended characters
    # cannot be confirmed from this file.
    banner = "=" * 60
    print(f"\n{banner}")
    print("π EXTRACT TABLES API CALL")
    print(banner)
    print(f"File: {filename}")
    print(f"Basename: {file_basename}")
    print(f"Size: {file_size} bytes")
    print(f"API URL: {EXTRACT_TABLES_API}")

    # Form fields are sent as strings (matches curl -F format). An explicit
    # None is treated like a missing key so the literal "None" is never sent.
    data = {"filename": file_basename}
    for page_key in ("start_page", "end_page"):
        page_value = state.get(page_key)
        data[page_key] = str(1 if page_value is None else page_value)

    # Skip the Authorization header when no token is configured; otherwise the
    # request would carry the bogus credential "Bearer None".
    token = os.getenv("HUGGINGFACE_API_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    print("π Sending request...")
    print(f"Data params: {data}")

    # Call API and wait for response - 5 minute timeout for large files.
    try:
        with open(path, "rb") as f:
            # NOTE(review): the upload is always labelled application/pdf even
            # though every file type is sent - confirm the API ignores it.
            files = {"file": (file_basename, f, "application/pdf")}
            resp = requests.post(
                EXTRACT_TABLES_API,
                files=files,
                data=data,
                headers=headers,
                timeout=300,
            )
    except requests.exceptions.Timeout as e:
        print("β Request timed out after 300 seconds")
        raise RuntimeError("Table extraction API timed out after 5 minutes") from e
    except requests.exceptions.RequestException as e:
        print(f"β Request exception: {str(e)}")
        raise RuntimeError(f"API request failed: {str(e)}") from e

    print("\nπ₯ API Response:")
    print(f"Status Code: {resp.status_code}")

    if resp.status_code != 200:
        print(f"β Error: {resp.text[:500]}")
        raise RuntimeError(
            f"Extract tables API failed with status {resp.status_code}: {resp.text}"
        )

    # Only the decode step is guarded, so unrelated bugs are not reported as
    # parse failures. JSON decode errors from requests subclass ValueError.
    try:
        response_json = resp.json()
    except ValueError as e:
        print(f"β Error parsing response: {str(e)}")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(f"Failed to parse API response: {str(e)}") from e

    if not isinstance(response_json, dict):
        problem = f"expected a JSON object, got {type(response_json).__name__}"
        print(f"β Error parsing response: {problem}")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(f"Failed to parse API response: {problem}")

    print(f"Response keys: {list(response_json.keys())}")

    # IMPORTANT: API returns {"status": "completed", "result": [...tables...], "process_id": "..."}
    # NOT {"tables": [...]}; "tables" is only a fallback, and a null/empty
    # value in both fields is normalised to an empty list.
    tables = response_json.get("result") or response_json.get("tables") or []

    print("\nπ Extraction Result:")
    print(f"API Status: {response_json.get('status', 'unknown')}")
    print(f"Process ID: {response_json.get('process_id', 'none')}")
    print(f"Tables found: {len(tables) if isinstance(tables, list) else 0}")
    print(f"{banner}\n")

    state["tables"] = tables
    return state