Spaces:

pnstack
/

capture-page-tools

Sleeping

App Files Files Community

npv2k1 commited on Feb 23, 2025

Commit

1caefe4

1 Parent(s): 6e30df1

Enhance capture_page function with improved error handling and page load management

Browse files

Files changed (1) hide show

src/modules/apps/__init__.py +63 -71

src/modules/apps/__init__.py CHANGED Viewed

@@ -1,13 +1,16 @@
 import gradio as gr
 import tempfile
 import os
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from subprocess import PIPE, STDOUT
-import traceback
 print("Gradio app loaded.")
@@ -20,67 +23,65 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
     """
     options = Options()
-    # Basic options
-    options.add_argument('--headless=new')  # New headless mode
-    options.add_argument('--no-sandbox')  # Required in Docker
-    options.add_argument('--disable-dev-shm-usage')  # Required in Docker
-    # Performance and stability options
-    options.add_argument('--disable-gpu')  # Required in Docker
     options.add_argument('--disable-software-rasterizer')
     options.add_argument('--disable-extensions')
     options.add_argument('--disable-infobars')
-    # Resource configuration
     options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-features=NetworkService,NetworkServiceInProcess')
     options.add_argument('--disable-features=site-per-process')
-    # Memory and process settings
-    options.add_argument('--single-process')  # Run in single process mode
     options.add_argument('--memory-pressure-off')
     options.add_argument('--disable-crash-reporter')
-    options.add_argument('--disable-breakpad')  # Disable crash reporting
-    # Additional stability options
     options.add_argument('--ignore-certificate-errors')
     options.add_argument('--disable-setuid-sandbox')
     options.add_argument('--disable-web-security')
-    # Set specific shared memory /dev/shm size (if needed)
-    options.add_argument('--disable-dev-shm-usage')
     options.add_argument('--shm-size=2g')
-    # Set up Chrome service with explicit path to chromedriver and logging
     service = Service(
-        # executable_path='/usr/local/bin/chromedriver',
-        log_output=PIPE,  # Redirect logs to pipe
-        service_args=['--verbose']  # Enable verbose logging
     )
     try:
         print("Initializing Chrome...")
-        driver = webdriver.Chrome(
-            service=service,
-            options=options
-        )
         print("Chrome initialized successfully")
         try:
-            print(f"Navigating to URL: {url}")
-            driver.get(url)
-            # Wait for page load
-            print("Waiting for page to load...")
-            driver.implicitly_wait(10)  # Increased wait time
-            # Additional wait for dynamic content
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
-            # WebDriverWait(driver, 10).until(
-            #     lambda d: d.execute_script('return document.readyState') == 'complete'
-            # )
             print("Taking screenshot...")
             driver.save_screenshot(output_file)
@@ -92,18 +93,22 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
             raise
         finally:
             print("Closing Chrome...")
-            try:
-                driver.close()  # Close current window
-                driver.quit()   # Quit browser completely
-                import psutil   # For process cleanup
-                current_pid = os.getpid()
-                current_process = psutil.Process(current_pid)
-                children = current_process.children(recursive=True)
-                for child in children:
-                    if 'chrome' in child.name().lower():
-                        child.terminate()
-            except Exception as cleanup_error:
-                print(f"Error during cleanup: {cleanup_error}")
     except Exception as e:
         print(f"Error initializing Chrome: {str(e)}")
@@ -112,33 +117,24 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
 def capture_and_show(url: str):
     """Capture webpage and return the image"""
     try:
-        # Get the temporary directory path (defaulting to /tmp if TMPDIR is not set)
         temp_dir = os.getenv('TMPDIR', '/tmp')
         try:
-            # Ensure temp directory exists and has correct permissions
             os.makedirs(temp_dir, mode=0o777, exist_ok=True)
             print(f"Using temp directory: {temp_dir}")
-            # Verify directory is writable
             if not os.access(temp_dir, os.W_OK):
                 print(f"Warning: Temp directory {temp_dir} is not writable")
-                # Try to create a user-specific temp directory instead
                 temp_dir = os.path.join('/tmp', f'chrome_screenshots_{os.getuid()}')
                 os.makedirs(temp_dir, mode=0o777, exist_ok=True)
                 print(f"Created user-specific temp directory: {temp_dir}")
-            # Create temporary file in the specified directory
             temp_path = os.path.join(temp_dir, f"screenshot_{os.urandom(8).hex()}.png")
             print(f"Temp file path: {temp_path}")
-            # Capture the webpage
             success = capture_page(url, temp_path)
             if not success:
                 print("Screenshot capture returned False")
                 return None
-            # Verify file was created
             if not os.path.exists(temp_path):
                 print("Screenshot file was not created")
                 return None
@@ -183,7 +179,6 @@ def create_gradio_app():
         def capture_with_error(url):
             try:
-                # Basic URL validation
                 if not url:
                     return None, gr.update(visible=True, value="Please enter a URL")
                 if not url.startswith(('http://', 'https://')):
@@ -196,7 +191,6 @@ def create_gradio_app():
             except Exception as e:
                 return None, gr.update(visible=True, value=f"Error: {str(e)}")
-        # Connect the components
         capture_btn.click(
             fn=capture_with_error,
             inputs=[url_input],
@@ -207,21 +201,19 @@ def create_gradio_app():
 app = create_gradio_app()
-# Configure server settings for Docker deployment
-server_port = 7860  # Standard Gradio port
-server_name = "0.0.0.0"  # Allow external connections
 def main():
-    """Launch the Gradio application"""
     print("Starting Gradio server...")
     app.launch(
         server_name=server_name,
         server_port=server_port,
-        share=False,  # Disable sharing as we're running in Docker
-        auth=None,    # Can be configured if authentication is needed
-        ssl_verify=False,  # Disable SSL verification for internal Docker network
         show_error=True,
         favicon_path=None
     )
-main()

 import gradio as gr
 import tempfile
 import os
+import time
+import traceback
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
 from subprocess import PIPE, STDOUT
+import psutil
 print("Gradio app loaded.")
     """
     options = Options()
+    # Use new headless mode and basic options for Docker
+    options.add_argument('--headless=new')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+    options.add_argument('--disable-gpu')
     options.add_argument('--disable-software-rasterizer')
     options.add_argument('--disable-extensions')
     options.add_argument('--disable-infobars')
     options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-features=NetworkService,NetworkServiceInProcess')
     options.add_argument('--disable-features=site-per-process')
+    options.add_argument('--single-process')
     options.add_argument('--memory-pressure-off')
     options.add_argument('--disable-crash-reporter')
+    options.add_argument('--disable-breakpad')
     options.add_argument('--ignore-certificate-errors')
     options.add_argument('--disable-setuid-sandbox')
     options.add_argument('--disable-web-security')
     options.add_argument('--shm-size=2g')
+    # Set page load strategy to 'none' to avoid waiting indefinitely
+    options.page_load_strategy = "none"
+    # Set up Chrome service (ensure chromedriver is in your PATH)
     service = Service(
+        log_output=PIPE,
+        service_args=['--verbose']
     )
+    driver = None
     try:
         print("Initializing Chrome...")
+        driver = webdriver.Chrome(service=service, options=options)
+        if not driver:
+            raise Exception("Failed to initialize Chrome driver")
         print("Chrome initialized successfully")
+        driver.implicitly_wait(5)
         try:
+            # Set a 30-second timeout for page load
+            driver.set_page_load_timeout(30)
+            try:
+                print(f"Navigating to URL: {url}")
+                driver.get(url)
+            except TimeoutException:
+                print("Page load timed out. Proceeding with screenshot capture...")
+            # Wait for the document ready state to be 'interactive' or 'complete'
+            try:
+                print("Waiting for document ready state...")
+                WebDriverWait(driver, 30).until(
+                    lambda d: d.execute_script('return document.readyState') in ["interactive", "complete"]
+                )
+            except TimeoutException:
+                print("Document did not reach ready state within timeout, proceeding anyway.")
+            # Additional short delay to allow dynamic content to settle
+            time.sleep(2)
             print("Taking screenshot...")
             driver.save_screenshot(output_file)
             raise
         finally:
             print("Closing Chrome...")
+            # Wrap cleanup in a try/except to prevent errors if the session is already closed
+            if driver:
+                try:
+                    driver.quit()
+                except Exception as cleanup_error:
+                    print(f"Error during driver.quit(): {cleanup_error}")
+                # Optionally clean up any lingering Chrome processes
+                try:
+                    current_pid = os.getpid()
+                    current_process = psutil.Process(current_pid)
+                    for child in current_process.children(recursive=True):
+                        if 'chrome' in child.name().lower():
+                            child.terminate()
+                except Exception as psutil_error:
+                    print(f"Error during process cleanup: {psutil_error}")
     except Exception as e:
         print(f"Error initializing Chrome: {str(e)}")
 def capture_and_show(url: str):
     """Capture webpage and return the image"""
     try:
         temp_dir = os.getenv('TMPDIR', '/tmp')
         try:
             os.makedirs(temp_dir, mode=0o777, exist_ok=True)
             print(f"Using temp directory: {temp_dir}")
             if not os.access(temp_dir, os.W_OK):
                 print(f"Warning: Temp directory {temp_dir} is not writable")
                 temp_dir = os.path.join('/tmp', f'chrome_screenshots_{os.getuid()}')
                 os.makedirs(temp_dir, mode=0o777, exist_ok=True)
                 print(f"Created user-specific temp directory: {temp_dir}")
             temp_path = os.path.join(temp_dir, f"screenshot_{os.urandom(8).hex()}.png")
             print(f"Temp file path: {temp_path}")
             success = capture_page(url, temp_path)
             if not success:
                 print("Screenshot capture returned False")
                 return None
             if not os.path.exists(temp_path):
                 print("Screenshot file was not created")
                 return None
         def capture_with_error(url):
             try:
                 if not url:
                     return None, gr.update(visible=True, value="Please enter a URL")
                 if not url.startswith(('http://', 'https://')):
             except Exception as e:
                 return None, gr.update(visible=True, value=f"Error: {str(e)}")
         capture_btn.click(
             fn=capture_with_error,
             inputs=[url_input],
 app = create_gradio_app()
+server_port = 7860
+server_name = "0.0.0.0"
 def main():
     print("Starting Gradio server...")
     app.launch(
         server_name=server_name,
         server_port=server_port,
+        share=False,
+        auth=None,
+        ssl_verify=False,
         show_error=True,
         favicon_path=None
     )
+main()