import asyncio
import os
import subprocess
import sys

import nest_asyncio
import streamlit as st
from crawl4ai import AsyncWebCrawler

# Patch asyncio so run_until_complete() can be called even though Streamlit
# may already be running an event loop in this thread.
nest_asyncio.apply()

st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")

# Hugging Face Spaces set SPACE_ID in the environment; use it to detect that host.
is_hf_space = os.environ.get("SPACE_ID") is not None


def install_playwright() -> bool:
    """Install Playwright plus the Chromium browser (with system deps).

    Runs both installs through the current interpreter so they land in the
    active environment. Progress and errors are reported via the Streamlit UI.

    Returns:
        True if both install steps succeeded, False otherwise.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]
        )
        st.success("Playwright and Chromium installed successfully!")
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # check_call raises CalledProcessError on a non-zero exit status and
        # OSError if the interpreter itself cannot be spawned.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False


# On Hugging Face Spaces, surface the installer up front so the user can set
# up browser dependencies before attempting a crawl.
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        if install_playwright():
            st.info("Please restart the application after installation completes.")


def check_browser_installed() -> bool:
    """Return True if Playwright imports cleanly in a fresh interpreter.

    Spawns a subprocess so a broken install cannot crash this process. Note
    this only proves the Python package imports; it does not verify that the
    Chromium binary itself is present.
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
        )
        return "OK" in result.stdout
    except OSError:
        # Was a bare `except:`; only a failure to spawn the interpreter is
        # expected here, so catch just that.
        return False


# Probe once per script run and surface the result to the user.
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")

# --- Crawl configuration -----------------------------------------------------
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox(
        "Crawl Mode",
        options=["browser", "requests"],
        index=0,
        help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content",
    )


def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run a single crawl and return ``(markdown, error)``.

    Exactly one element of the pair is non-None: the crawl result rendered as
    markdown on success, or the error message string on failure.
    """

    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            # Surface any crawler failure to the caller instead of crashing
            # the Streamlit script run.
            return None, str(e)

    # nest_asyncio.apply() (top of file) makes run_until_complete safe even
    # when an event loop is already running in this thread.
    return asyncio.get_event_loop().run_until_complete(_run())


if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode,
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a st.button nested inside another button's
                # handler never fires in Streamlit — the outer click state is
                # lost on the rerun the inner click triggers. Kept for parity
                # with the original; use st.session_state if this action must
                # actually work.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results and offer them as a markdown download.
            st.subheader("Crawl Results")
            st.markdown(result)
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown",
            )
    except Exception as e:
        # Last-resort guard so an unexpected failure shows in the UI.
        st.error(f"An error occurred: {str(e)}")

# --- Footer / help -----------------------------------------------------------
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.

**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")

with st.sidebar:
    st.header("About")
    st.write("""
    This web crawler app requires Playwright and Chromium to be installed for the browser mode.

    If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.

    If you're running locally, run:
    ```
    python -m playwright install --with-deps chromium
    ```
    """)