# Streamlit web crawler app (Hugging Face Spaces deployment).
# NOTE(review): the original file carried "Spaces: Sleeping" page residue here
# from a web scrape; replaced with a proper header so the file parses.
import streamlit as st
import asyncio
import subprocess
import sys
import os

from crawl4ai import AsyncWebCrawler
import nest_asyncio

# Allow nested event loops: Streamlit's script thread may already be running
# (or later own) a loop, and run_until_complete would otherwise fail on it.
nest_asyncio.apply()

st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")

# Hugging Face Spaces export SPACE_ID; presence of it gates the HF-only UI below.
is_hf_space = os.environ.get("SPACE_ID") is not None
def install_playwright():
    """Install the Playwright package and Chromium browser (with OS deps).

    Progress and failures are reported through Streamlit widgets rather than
    exceptions, so callers only need the boolean result.

    Returns:
        bool: True if both the pip install and the browser download succeeded,
        False on any failure.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        # Use the current interpreter so packages land in the active environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])
        st.success("Playwright and Chromium installed successfully!")
        return True
    except Exception as e:
        # Broad catch is deliberate: any failure mode is surfaced in the UI
        # instead of crashing the Streamlit script run.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False
# Display installation status section at the top for Hugging Face spaces,
# where browser binaries are not preinstalled.
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        success = install_playwright()
        if success:
            st.info("Please restart the application after installation completes.")
def check_browser_installed():
    """Probe a child interpreter to see whether Playwright is importable.

    Returns:
        bool: True when the subprocess can import ``playwright.sync_api``.

    NOTE(review): this only proves the Python package imports — it does not
    confirm a Chromium binary was actually downloaded.
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
            timeout=30,  # don't hang the UI forever if the child process stalls
        )
        return "OK" in result.stdout
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
        return False
# Display browser status so the user knows whether "browser" mode can work.
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")

# Input for URL
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

# Optional parameters, collapsed by default.
with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
                              help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run crawl4ai's AsyncWebCrawler synchronously.

    Args:
        url: Page to start crawling from.
        max_depth: Link depth to follow from the start URL.
        timeout: Per-request timeout in seconds.
        max_pages: Upper bound on pages fetched.
        mode: "browser" (Playwright) or "requests".

    Returns:
        tuple: ``(markdown_text, None)`` on success, ``(None, error_message)``
        on failure — so callers can branch without try/except.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            return None, str(e)

    # Streamlit script threads may not have a current event loop, in which
    # case get_event_loop() raises RuntimeError — create one instead of
    # crashing. nest_asyncio (applied at import time) makes
    # run_until_complete safe even if the loop is already running.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_run())
# Button to start crawling.
if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        # Warn but still attempt the crawl — the probe can false-negative.
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode,
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a st.button nested inside another button's
                # branch is effectively dead in Streamlit — the outer click
                # state is lost on the rerun triggered by this click. Consider
                # st.session_state if this action is actually needed.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)
            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown",
            )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
# Add footer with information.
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.
**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")

# Add a sidebar with information about the dependencies.
with st.sidebar:
    st.header("About")
    st.write("""
This web crawler app requires Playwright and Chromium to be installed for the browser mode.
If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
If you're running locally, run:
```
python -m playwright install --with-deps chromium
```
""")