Spaces:
Sleeping
Sleeping
File size: 5,423 Bytes
25996b1 605e112 e1a4de5 605e112 fdadbb0 605e112 45e9c8a e5adc42 45e9c8a 25996b1 e1a4de5 45e9c8a 25996b1 45e9c8a e1a4de5 25996b1 45e9c8a e1a4de5 fdadbb0 e1a4de5 fdadbb0 45e9c8a e1a4de5 fdadbb0 e1a4de5 45e9c8a e1a4de5 fdadbb0 e1a4de5 fdadbb0 dff737d 45e9c8a e1a4de5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import streamlit as st
import asyncio
import subprocess
import sys
import os
from crawl4ai import AsyncWebCrawler
import nest_asyncio
# Apply nest_asyncio to allow nested event loops
# (Streamlit may already be running inside an event loop; nest_asyncio lets
# run_until_complete() re-enter it instead of raising RuntimeError.)
nest_asyncio.apply()

st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")

# Check if we're on Hugging Face space
# (SPACE_ID is presumably set by the Spaces runtime — TODO confirm for current runtime)
is_hf_space = os.environ.get("SPACE_ID") is not None
# Function to install dependencies
def install_playwright():
    """Install Playwright and the Chromium browser (with OS deps) at runtime.

    Installs into the interpreter currently running Streamlit, reporting
    progress and failures through the Streamlit UI.

    Returns:
        bool: True if both the pip install and the browser download succeeded,
        False on any failure.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        # sys.executable ensures we install into the same interpreter/venv
        # that is running this app, not whatever `python` is on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])
        st.success("Playwright and Chromium installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        # One of the commands exited non-zero; report which one for debugging.
        st.error(f"Failed to install dependencies: command {e.cmd} exited with code {e.returncode}")
        return False
    except Exception as e:
        # Last-resort catch so an unexpected failure surfaces in the UI
        # instead of crashing the whole Streamlit script.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False
# Display installation status section at the top for Hugging Face spaces
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        success = install_playwright()
        if success:
            # Playwright needs a fresh process to pick up newly installed
            # browser binaries, hence the restart advice.
            st.info("Please restart the application after installation completes.")
# Function to check if playwright browser is available
def check_browser_installed():
    """Probe whether Playwright's sync API is importable in a child process.

    The import runs in a subprocess so a broken Playwright install cannot
    take down this Streamlit process.

    Returns:
        bool: True if the probe printed "OK"; False on import failure,
        subprocess failure, or timeout.
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
            timeout=30,  # don't hang the UI if the child process stalls
        )
        return "OK" in result.stdout
    except (subprocess.SubprocessError, OSError):
        # Replaces a bare `except:` (which also swallowed SystemExit and
        # KeyboardInterrupt). Any probe failure just means "not usable".
        return False
# Display browser status
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")

# Input for URL
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

# Optional parameters
# These widget values are re-read on every Streamlit rerun and passed to
# run_async_crawler() when the "Start Crawling" button fires.
with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
                              help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
# Function to run the crawler
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run crawl4ai's AsyncWebCrawler synchronously and return (markdown, error).

    Args:
        url: Page to crawl.
        max_depth: Link-following depth forwarded to the crawler.
        timeout: Per-crawl timeout in seconds.
        max_pages: Upper bound on pages visited.
        mode: "browser" (Playwright) or "requests" — forwarded as-is;
            TODO confirm crawl4ai's arun() accepts these keyword arguments.

    Returns:
        tuple: (markdown_text, None) on success, (None, error_message) on failure.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            # Report the failure to the caller instead of crashing the UI.
            return None, str(e)

    # nest_asyncio.apply() (done at import time) lets us re-enter a running
    # loop. But asyncio.get_event_loop() raises RuntimeError in threads that
    # have no current loop (e.g. Streamlit script-runner threads on newer
    # Python), so fall back to creating and installing a fresh loop.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_run())
# Button to start crawling
if st.button("Start Crawling"):
    # Soft warning only — the user may still want to try browser mode.
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a st.button nested inside another button's
                # branch rarely fires in Streamlit — the rerun triggered by
                # clicking it resets the outer button to False, so this
                # handler is likely unreachable. Consider st.session_state.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)
            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown"
            )
    except Exception as e:
        # Catch-all boundary so an unexpected failure is shown in the UI.
        st.error(f"An error occurred: {str(e)}")
# Add footer with information
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.
**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")
# Add a sidebar with information about the dependencies
with st.sidebar:
    st.header("About")
    st.write("""
This web crawler app requires Playwright and Chromium to be installed for the browser mode.
If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
If you're running locally, run:
```
python -m playwright install --with-deps chromium
```
""")