# Streamlit web crawler app (Hugging Face Spaces deployment).
# NOTE(review): the original file carried "Spaces: Sleeping" page residue here
# from a web scrape; replaced with a proper header so the file parses.
import streamlit as st
import asyncio
import subprocess
import sys
import os

from crawl4ai import AsyncWebCrawler
import nest_asyncio

# Allow nested event loops: Streamlit's script thread may already be running
# (or later own) a loop, and run_until_complete would otherwise fail on it.
nest_asyncio.apply()

st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")

# Hugging Face Spaces export SPACE_ID; presence of it gates the HF-only UI below.
is_hf_space = os.environ.get("SPACE_ID") is not None
def install_playwright():
    """Install the Playwright package and Chromium browser (with OS deps).

    Progress and failures are reported through Streamlit widgets rather than
    exceptions, so callers only need the boolean result.

    Returns:
        bool: True if both the pip install and the browser download succeeded,
        False on any failure.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        # Use the current interpreter so packages land in the active environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])
        st.success("Playwright and Chromium installed successfully!")
        return True
    except Exception as e:
        # Broad catch is deliberate: any failure mode is surfaced in the UI
        # instead of crashing the Streamlit script run.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False
# Display installation status section at the top for Hugging Face spaces,
# where browser binaries are not preinstalled.
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        success = install_playwright()
        if success:
            st.info("Please restart the application after installation completes.")
def check_browser_installed():
    """Probe a child interpreter to see whether Playwright is importable.

    Returns:
        bool: True when the subprocess can import ``playwright.sync_api``.

    NOTE(review): this only proves the Python package imports — it does not
    confirm a Chromium binary was actually downloaded.
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
            timeout=30,  # don't hang the UI forever if the child process stalls
        )
        return "OK" in result.stdout
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
        return False
# Display browser status so the user knows whether "browser" mode can work.
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")

# Input for URL
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

# Optional parameters, collapsed by default.
with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
                              help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run crawl4ai's AsyncWebCrawler synchronously.

    Args:
        url: Page to start crawling from.
        max_depth: Link depth to follow from the start URL.
        timeout: Per-request timeout in seconds.
        max_pages: Upper bound on pages fetched.
        mode: "browser" (Playwright) or "requests".

    Returns:
        tuple: ``(markdown_text, None)`` on success, ``(None, error_message)``
        on failure — so callers can branch without try/except.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            return None, str(e)

    # Streamlit script threads may not have a current event loop, in which
    # case get_event_loop() raises RuntimeError — create one instead of
    # crashing. nest_asyncio (applied at import time) makes
    # run_until_complete safe even if the loop is already running.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_run())
# Button to start crawling.
if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        # Warn but still attempt the crawl — the probe can false-negative.
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode,
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a st.button nested inside another button's
                # branch is effectively dead in Streamlit — the outer click
                # state is lost on the rerun triggered by this click. Consider
                # st.session_state if this action is actually needed.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)
            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown",
            )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
# Add footer with information.
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.
**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")

# Add a sidebar with information about the dependencies.
with st.sidebar:
    st.header("About")
    st.write("""
This web crawler app requires Playwright and Chromium to be installed for the browser mode.
If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
If you're running locally, run:
```
python -m playwright install --with-deps chromium
```
""")