# Crawl4ai example — app.py
# (Hugging Face Space file; originally committed by rairo, revision e1a4de5)
import streamlit as st
import asyncio
import subprocess
import sys
import os
from crawl4ai import AsyncWebCrawler
import nest_asyncio
# Patch asyncio so run_until_complete() can be called even when Streamlit's
# own event loop is already running (needed by run_async_crawler below).
nest_asyncio.apply()
st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")
# Detect whether we are running inside a Hugging Face Space: the platform
# sets the SPACE_ID environment variable for every Space container.
is_hf_space = os.environ.get("SPACE_ID") is not None
# Function to install dependencies
def install_playwright():
    """Install Playwright and a Chromium browser into the environment.

    Reports progress and errors through the Streamlit UI. Returns True on
    success, False if either installation step fails.
    """
    commands = (
        [sys.executable, "-m", "pip", "install", "playwright"],
        [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"],
    )
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        for cmd in commands:
            # check_call raises CalledProcessError on a non-zero exit status.
            subprocess.check_call(cmd)
        st.success("Playwright and Chromium installed successfully!")
        return True
    except Exception as exc:
        st.error(f"Failed to install dependencies: {str(exc)}")
        return False
# On a Hugging Face Space, surface a one-click installer at the top of the
# page, since the base image may not ship Chromium.
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        success = install_playwright()
        if success:
            # A restart is required so the freshly installed browser is picked up.
            st.info("Please restart the application after installation completes.")
# Function to check if the Playwright browser stack is importable
def check_browser_installed():
    """Return True if Playwright can be imported by a fresh interpreter.

    Probes in a throwaway subprocess so a missing or broken Playwright
    install cannot crash this process. Returns False on any failure,
    including a hung probe (bounded by the timeout).
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
            timeout=30,  # don't let a wedged interpreter hang the UI
        )
        return "OK" in result.stdout
    except Exception:
        # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any probe failure still reads as "not installed".
        return False
# Display browser status once at startup so the user knows whether the
# "browser" crawl mode is likely to work before they start a crawl.
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")
# Input for the URL to crawl
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
# Optional crawl parameters, collapsed by default
with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
                              help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
# Function to run the crawler
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Crawl *url* and return a ``(markdown, error)`` pair.

    Exactly one element is non-None: the crawl's markdown output on
    success, or the stringified exception message on failure.
    """
    async def _crawl():
        try:
            async with AsyncWebCrawler() as crawler:
                outcome = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return outcome.markdown, None
        except Exception as exc:
            return None, str(exc)

    # nest_asyncio.apply() (module top) makes run_until_complete safe even
    # when Streamlit already has an event loop running.
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(_crawl())
# Button to start crawling
if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            # run_async_crawler returns (markdown, error); exactly one is None.
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a button rendered inside this branch is re-created
                # on every Streamlit rerun, so its click state may not survive the
                # rerun triggered by "Start Crawling" — confirm it actually fires.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)
            # Option to download the crawled markdown
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown"
            )
    except Exception as e:
        # Catch-all UI boundary: surface any unexpected failure to the user.
        st.error(f"An error occurred: {str(e)}")
# Add footer with setup information for Hugging Face Spaces
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.
**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")
# Add a sidebar with information about the dependencies
with st.sidebar:
    st.header("About")
    st.write("""
This web crawler app requires Playwright and Chromium to be installed for the browser mode.
If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
If you're running locally, run:
```
python -m playwright install --with-deps chromium
```
""")