Spaces:

stellar413
/

masterllm

Sleeping

App Files Files Community

masterllm / utilities /extract_tables.py

redhairedshanks1

Update utilities/extract_tables.py

1482837 2 months ago

raw

history blame contribute delete

4.38 kB

	# import os
	# import requests

	# EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables" # Replace with your space URL

	# def extract_tables_remote(state):
	# filename = state["filename"]
	# path = state["temp_files"][filename]

	# with open(path, "rb") as f:
	# files = {"file": (filename, f, "application/pdf")}
	# data = {
	# "filename": filename,
	# "start_page": state.get("start_page", 1),
	# "end_page": state.get("end_page", 1),
	# }
	# headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
	# resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)

	# if resp.status_code != 200:
	# raise RuntimeError(f"Extract tables API failed: {resp.text}")

	# js = resp.json()
	# state["tables"] = js.get("tables", js)
	# return state

	import os
	import requests

	# Endpoint that extract_tables_remote() POSTs the uploaded document to.
	# Hardcoded API URL - DO NOT CHANGE
	EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables"

	def extract_tables_remote(state):
	    """
	    Extract tables from a document by uploading it to the remote tables API.

	    The file is always sent to the API; nothing is parsed locally.

	    Args:
	        state: Dict with 'filename' and 'temp_files' (a mapping from that
	            filename to a local path). Optional 'start_page' / 'end_page'
	            default to 1 and are sent as strings.

	    Returns:
	        The same ``state`` dict, with 'tables' set to the response's
	        'result' value (falling back to 'tables', then to an empty list).

	    Raises:
	        KeyError: If 'filename' or its entry in 'temp_files' is missing.
	        RuntimeError: If the path is not an existing regular file, the
	            request times out or otherwise fails, the API answers with a
	            non-200 status, or the body is not a JSON object.
	    """
	    filename = state["filename"]
	    path = state["temp_files"][filename]

	    # isfile() rather than exists(): a directory would pass an exists()
	    # check and then fail inside open() with an OSError instead of the
	    # RuntimeError this function uses for every other failure.
	    if not os.path.isfile(path):
	        raise RuntimeError(f"File not found: {path}")

	    # Send only the basename (not the full path) to match the curl format.
	    file_basename = os.path.basename(path)
	    file_size = os.path.getsize(path)

	    print(f"\n{'='*60}")
	    print("📊 EXTRACT TABLES API CALL")
	    print(f"{'='*60}")
	    print(f"File: {filename}")
	    print(f"Basename: {file_basename}")
	    print(f"Size: {file_size} bytes")
	    print(f"API URL: {EXTRACT_TABLES_API}")

	    # Page numbers go out as strings, mirroring curl -F form fields.
	    data = {
	        "filename": file_basename,
	        "start_page": str(state.get("start_page", 1)),
	        "end_page": str(state.get("end_page", 1)),
	    }

	    # Without a configured token, omit the header entirely instead of
	    # sending the literal string "Bearer None".
	    token = os.getenv("HUGGINGFACE_API_TOKEN")
	    headers = {"Authorization": f"Bearer {token}"} if token else {}

	    print("🚀 Sending request...")
	    print(f"Data params: {data}")

	    # 5 minute timeout for large files. The file handle is only needed for
	    # the upload itself, so it is closed as soon as the POST returns.
	    try:
	        with open(path, "rb") as f:
	            files = {"file": (file_basename, f, "application/pdf")}
	            resp = requests.post(
	                EXTRACT_TABLES_API,
	                files=files,
	                data=data,
	                headers=headers,
	                timeout=300,
	            )
	    except requests.exceptions.Timeout as e:
	        print("❌ Request timed out after 300 seconds")
	        raise RuntimeError("Table extraction API timed out after 5 minutes") from e
	    except requests.exceptions.RequestException as e:
	        print(f"❌ Request exception: {str(e)}")
	        raise RuntimeError(f"API request failed: {str(e)}") from e

	    print("\n📥 API Response:")
	    print(f"Status Code: {resp.status_code}")

	    if resp.status_code != 200:
	        print(f"❌ Error: {resp.text[:500]}")
	        raise RuntimeError(
	            f"Extract tables API failed with status {resp.status_code}: {resp.text}"
	        )

	    # Keep the try body to the one call that can fail on a malformed body;
	    # JSON decode errors raised by resp.json() are ValueError subclasses.
	    try:
	        response_json = resp.json()
	    except ValueError as e:
	        print(f"❌ Error parsing response: {str(e)}")
	        print(f"Raw response: {resp.text[:500]}")
	        raise RuntimeError(f"Failed to parse API response: {str(e)}") from e

	    # The lookups below need a JSON object; reject anything else with a
	    # clear message rather than an AttributeError on .keys()/.get().
	    if not isinstance(response_json, dict):
	        reason = f"expected a JSON object, got {type(response_json).__name__}"
	        print(f"❌ Error parsing response: {reason}")
	        print(f"Raw response: {resp.text[:500]}")
	        raise RuntimeError(f"Failed to parse API response: {reason}")

	    print(f"Response keys: {list(response_json.keys())}")

	    # The API returns {"status": ..., "result": [...tables...], "process_id": ...},
	    # NOT {"tables": [...]}; "tables" is kept as a fallback key.
	    tables = response_json.get("result", []) or response_json.get("tables", [])

	    print("\n📊 Extraction Result:")
	    print(f"API Status: {response_json.get('status', 'unknown')}")
	    print(f"Process ID: {response_json.get('process_id', 'none')}")
	    print(f"Tables found: {len(tables) if isinstance(tables, list) else 0}")
	    print(f"{'='*60}\n")

	    state["tables"] = tables
	    return state