Craw4ai-example

Sleeping

App Files Files Community

rairo committed on Apr 7, 2025

Commit

e1a4de5

verified ·

1 Parent(s): fdadbb0

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -24

app.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import streamlit as st
 import asyncio
 from crawl4ai import AsyncWebCrawler
 import nest_asyncio
@@ -10,6 +13,49 @@ st.set_page_config(page_title="Web Crawler App", layout="wide")
 st.title("Web Crawler App")
 # Input for URL
 url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
@@ -18,48 +64,86 @@ with st.expander("Advanced Options"):
     max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
     timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
     max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
 # Function to run the crawler
-def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10):
     async def _run():
-        async with AsyncWebCrawler() as crawler:
-            result = await crawler.arun(
-                url=url,
-                max_depth=max_depth,
-                timeout=timeout,
-                max_pages=max_pages
-            )
-            return result.markdown
     # Use the current event loop with nest_asyncio applied
     return asyncio.get_event_loop().run_until_complete(_run())
 # Button to start crawling
 if st.button("Start Crawling"):
     try:
         with st.spinner("Crawling in progress..."):
-            result = run_async_crawler(
                 url=url,
                 max_depth=max_depth,
                 timeout=timeout,
-                max_pages=max_pages
             )
-            # Display the results
-            st.subheader("Crawl Results")
-            st.markdown(result)
-            # Option to download results
-            st.download_button(
-                label="Download Results",
-                data=result,
-                file_name="crawl_results.md",
-                mime="text/markdown"
-            )
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
-        st.error("If you're seeing browser launch errors, make sure you have the required dependencies installed.")
 # Add footer with information
 st.markdown("---")
-st.info("This app uses the crawl4ai library to extract content from web pages. The crawler may require additional dependencies if it's using a headless browser.")

 import streamlit as st
 import asyncio
+import subprocess
+import sys
+import os
 from crawl4ai import AsyncWebCrawler
 import nest_asyncio
 st.title("Web Crawler App")
+# Check if we're on Hugging Face space
+is_hf_space = os.environ.get("SPACE_ID") is not None
+# Function to install dependencies
def install_playwright():
    """Install the Playwright package and a Chromium build via subprocess.

    Two commands are run in order with the current interpreter
    (``sys.executable``): ``pip install playwright`` and then
    ``playwright install --with-deps chromium``. Progress and the final
    outcome are reported through Streamlit status messages.

    Returns:
        bool: True when both commands exit with status 0; False when any
        step raises (the exception text is shown with ``st.error``).
    """
    install_steps = (
        # The package must exist before ``python -m playwright`` can run.
        [sys.executable, "-m", "pip", "install", "playwright"],
        # NOTE(review): ``--with-deps`` also pulls OS-level packages, which
        # presumably needs elevated privileges on Linux — confirm for the
        # target host.
        [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"],
    )
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        for step in install_steps:
            # check_call raises CalledProcessError on a non-zero exit status.
            subprocess.check_call(step)
        st.success("Playwright and Chromium installed successfully!")
        return True
    except Exception as exc:
        st.error(f"Failed to install dependencies: {str(exc)}")
        return False
+# Display installation status section at the top for Hugging Face spaces
+if is_hf_space:
+    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
+    if st.button("Install Browser Dependencies"):
+        success = install_playwright()
+        if success:
+            st.info("Please restart the application after installation completes.")
+# Function to check if playwright browser is available
def check_browser_installed():
    """Report whether the Playwright Python package can be imported.

    A child interpreter (``sys.executable``) is started to run
    ``from playwright.sync_api import sync_playwright`` and print ``OK``.
    The import happens in a subprocess, so a failing import cannot affect
    the current process.

    Note that this only proves the *package* is importable; it does not
    launch a browser or verify that a Chromium binary has been downloaded.

    Returns:
        bool: True if the child process printed ``OK``; False if the import
        failed or the child interpreter could not be started.
    """
    probe = "from playwright.sync_api import sync_playwright; print('OK')"
    try:
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True,
            text=True,
        )
    except Exception:
        # Best-effort check: any ordinary failure to spawn the interpreter
        # means "not installed". Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    return "OK" in result.stdout
+# Display browser status
+browser_installed = check_browser_installed()
+if browser_installed:
+    st.success("Browser dependencies are installed.")
+else:
+    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")
 # Input for URL
 url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
     max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
     timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
     max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
+    crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
+                             help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
 # Function to run the crawler
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run one crawl synchronously and return its markdown or an error.

    Opens an ``AsyncWebCrawler`` context, awaits ``crawler.arun`` and drives
    the coroutine to completion on an asyncio event loop, so the function
    can be called from synchronous code.

    Args:
        url: Address passed to ``crawler.arun`` as ``url``.
        max_depth: Forwarded to ``crawler.arun`` as ``max_depth``.
        timeout: Forwarded to ``crawler.arun`` as ``timeout``.
        max_pages: Forwarded to ``crawler.arun`` as ``max_pages``.
        mode: Forwarded to ``crawler.arun`` as ``mode``.
            NOTE(review): whether ``arun`` honours ``max_depth``,
            ``timeout``, ``max_pages`` and ``mode`` depends on the installed
            crawl4ai version — confirm against its API.

    Returns:
        tuple: ``(markdown, None)`` on success, or ``(None, message)`` where
        ``message`` is ``str(exception)`` for any exception raised while
        creating or running the crawler.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            # Hand the failure back as data so the caller can display it.
            return None, str(e)

    # Prefer the current event loop (re-entrant when nest_asyncio is applied).
    # get_event_loop() raises RuntimeError in threads that have no loop set,
    # and a closed loop cannot run anything, so fall back to a fresh loop.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_run())
 # Button to start crawling
 if st.button("Start Crawling"):
+    if not browser_installed and crawl_mode == "browser":
+        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
     try:
         with st.spinner("Crawling in progress..."):
+            result, error = run_async_crawler(
                 url=url,
                 max_depth=max_depth,
                 timeout=timeout,
+                max_pages=max_pages,
+                mode=crawl_mode
             )
+            if error:
+                st.error(f"Crawling failed: {error}")
+                if "browser" in error.lower():
+                    st.error("This appears to be a browser-related error.")
+                    if st.button("Attempt to install browser dependencies"):
+                        install_playwright()
+            else:
+                # Display the results
+                st.subheader("Crawl Results")
+                st.markdown(result)
+                # Option to download results
+                st.download_button(
+                    label="Download Results",
+                    data=result,
+                    file_name="crawl_results.md",
+                    mime="text/markdown"
+                )
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
 # Add footer with information
 st.markdown("---")
+st.info("""
+This app uses the crawl4ai library to extract content from web pages.
+**Hugging Face Space Setup Instructions:**
+1. After launching the space, click "Install Browser Dependencies"
+2. Restart the space after installation completes
+3. You should now be able to crawl websites with the browser mode
+""")
+# Add a sidebar with information about the dependencies
+with st.sidebar:
+    st.header("About")
+    st.write("""
+    This web crawler app requires Playwright and Chromium to be installed for the browser mode.
+    If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
+    If you're running locally, run:
+    ```
+    python -m playwright install --with-deps chromium
+    ```
+    """)