Spaces:

pnstack
/

capture-page-tools

Sleeping

App Files Files Community

capture-page-tools / src /modules /apps /__init__.py

npv2k1

Enhance capture_page function with improved error handling and page load management

1caefe4 11 months ago

raw

history blame contribute delete

8.22 kB

	import gradio as gr
	import tempfile
	import os
	import time
	import traceback
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.common.exceptions import TimeoutException
	from subprocess import PIPE, STDOUT
	import psutil

	print("Gradio app loaded.")

	def capture_page(url: str, output_file: str = "screenshot.png"):
	"""
	Captures a screenshot of the given webpage.

	:param url: The URL of the webpage to capture.
	:param output_file: The filename to save the screenshot.
	"""
	options = Options()

	# Use new headless mode and basic options for Docker
	options.add_argument('--headless=new')
	options.add_argument('--no-sandbox')
	options.add_argument('--disable-dev-shm-usage')
	options.add_argument('--disable-gpu')
	options.add_argument('--disable-software-rasterizer')
	options.add_argument('--disable-extensions')
	options.add_argument('--disable-infobars')
	options.add_argument('--window-size=1920,1080')
	options.add_argument('--disable-features=NetworkService,NetworkServiceInProcess')
	options.add_argument('--disable-features=site-per-process')
	options.add_argument('--single-process')
	options.add_argument('--memory-pressure-off')
	options.add_argument('--disable-crash-reporter')
	options.add_argument('--disable-breakpad')
	options.add_argument('--ignore-certificate-errors')
	options.add_argument('--disable-setuid-sandbox')
	options.add_argument('--disable-web-security')
	options.add_argument('--shm-size=2g')

	# Set page load strategy to 'none' to avoid waiting indefinitely
	options.page_load_strategy = "none"

	# Set up Chrome service (ensure chromedriver is in your PATH)
	service = Service(
	log_output=PIPE,
	service_args=['--verbose']
	)

	driver = None
	try:
	print("Initializing Chrome...")
	driver = webdriver.Chrome(service=service, options=options)
	if not driver:
	raise Exception("Failed to initialize Chrome driver")

	print("Chrome initialized successfully")
	driver.implicitly_wait(5)

	try:
	# Set a 30-second timeout for page load
	driver.set_page_load_timeout(30)
	try:
	print(f"Navigating to URL: {url}")
	driver.get(url)
	except TimeoutException:
	print("Page load timed out. Proceeding with screenshot capture...")

	# Wait for the document ready state to be 'interactive' or 'complete'
	try:
	print("Waiting for document ready state...")
	WebDriverWait(driver, 30).until(
	lambda d: d.execute_script('return document.readyState') in ["interactive", "complete"]
	)
	except TimeoutException:
	print("Document did not reach ready state within timeout, proceeding anyway.")

	# Additional short delay to allow dynamic content to settle
	time.sleep(2)

	print("Taking screenshot...")
	driver.save_screenshot(output_file)
	print(f"Screenshot saved: {output_file}")
	return True

	except Exception as e:
	print(f"Error during page capture: {str(e)}")
	raise
	finally:
	print("Closing Chrome...")
	# Wrap cleanup in a try/except to prevent errors if the session is already closed
	if driver:
	try:
	driver.quit()
	except Exception as cleanup_error:
	print(f"Error during driver.quit(): {cleanup_error}")

	# Optionally clean up any lingering Chrome processes
	try:
	current_pid = os.getpid()
	current_process = psutil.Process(current_pid)
	for child in current_process.children(recursive=True):
	if 'chrome' in child.name().lower():
	child.terminate()
	except Exception as psutil_error:
	print(f"Error during process cleanup: {psutil_error}")

	except Exception as e:
	print(f"Error initializing Chrome: {str(e)}")
	raise Exception(f"Failed to initialize Chrome: {str(e)}")

	def capture_and_show(url: str):
	"""Capture webpage and return the image"""
	try:
	temp_dir = os.getenv('TMPDIR', '/tmp')
	try:
	os.makedirs(temp_dir, mode=0o777, exist_ok=True)
	print(f"Using temp directory: {temp_dir}")
	if not os.access(temp_dir, os.W_OK):
	print(f"Warning: Temp directory {temp_dir} is not writable")
	temp_dir = os.path.join('/tmp', f'chrome_screenshots_{os.getuid()}')
	os.makedirs(temp_dir, mode=0o777, exist_ok=True)
	print(f"Created user-specific temp directory: {temp_dir}")

	temp_path = os.path.join(temp_dir, f"screenshot_{os.urandom(8).hex()}.png")
	print(f"Temp file path: {temp_path}")

	success = capture_page(url, temp_path)
	if not success:
	print("Screenshot capture returned False")
	return None

	if not os.path.exists(temp_path):
	print("Screenshot file was not created")
	return None

	print("Screenshot captured successfully")
	return temp_path

	except OSError as e:
	print(f"OS Error: {str(e)}")
	print(f"Stack trace: {traceback.format_exc()}")
	return None

	except Exception as e:
	print(f"Error in capture_and_show: {str(e)}")
	print(f"Stack trace: {traceback.format_exc()}")
	return None

	def create_gradio_app():
	"""Create the main Gradio application with all components"""
	with gr.Blocks() as app:
	gr.Markdown("# Webpage Screenshot Capture")

	with gr.Row():
	url_input = gr.Textbox(
	label="Website URL",
	placeholder="Enter website URL (e.g., https://www.example.com)",
	scale=4
	)
	capture_btn = gr.Button("Capture", scale=1)

	with gr.Row():
	output_image = gr.Image(
	label="Captured Screenshot",
	type="filepath"
	)

	error_output = gr.Textbox(
	label="Error Message",
	visible=False,
	interactive=False
	)

	def capture_with_error(url):
	try:
	if not url:
	return None, gr.update(visible=True, value="Please enter a URL")
	if not url.startswith(('http://', 'https://')):
	return None, gr.update(visible=True, value="Please enter a valid URL starting with http:// or https://")

	result = capture_and_show(url)
	if result is None:
	return None, gr.update(visible=True, value="Failed to capture screenshot. Please check the URL and try again.")
	return result, gr.update(visible=False, value="")
	except Exception as e:
	return None, gr.update(visible=True, value=f"Error: {str(e)}")

	capture_btn.click(
	fn=capture_with_error,
	inputs=[url_input],
	outputs=[output_image, error_output]
	)

	return app

	app = create_gradio_app()

	server_port = 7860
	server_name = "0.0.0.0"

	def main():
	print("Starting Gradio server...")
	app.launch(
	server_name=server_name,
	server_port=server_port,
	share=False,
	auth=None,
	ssl_verify=False,
	show_error=True,
	favicon_path=None
	)

	main()