File size: 5,423 Bytes
25996b1
605e112
e1a4de5
 
 
605e112
fdadbb0
 
 
 
605e112
45e9c8a
e5adc42
45e9c8a
25996b1
e1a4de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45e9c8a
 
25996b1
45e9c8a
 
 
 
 
e1a4de5
 
25996b1
45e9c8a
e1a4de5
fdadbb0
e1a4de5
 
 
 
 
 
 
 
 
 
 
 
fdadbb0
 
 
45e9c8a
 
 
e1a4de5
 
 
fdadbb0
 
e1a4de5
45e9c8a
 
 
e1a4de5
 
fdadbb0
 
e1a4de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdadbb0
 
dff737d
45e9c8a
 
e1a4de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import streamlit as st
import asyncio
import subprocess
import sys
import os
from crawl4ai import AsyncWebCrawler
import nest_asyncio

# Patch asyncio through nest_asyncio so that nested event loops are permitted
nest_asyncio.apply()

# Page setup (title + wide layout) followed by the on-page heading
st.set_page_config(layout="wide", page_title="Web Crawler App")
st.title("Web Crawler App")

# The presence of a SPACE_ID environment variable is treated as
# "running inside a Hugging Face Space"
is_hf_space = "SPACE_ID" in os.environ

# Function to install dependencies
def install_playwright():
    """Install the Playwright package and its Chromium browser.

    Two commands are run with the current interpreter (``sys.executable``):
    ``pip install playwright`` and then
    ``playwright install --with-deps chromium``. Progress and outcome are
    reported to the user through Streamlit status messages.

    Returns:
        True when both commands finished without raising, False when any
        exception occurred (the exception text is shown via ``st.error``).
    """
    pip_command = [sys.executable, "-m", "pip", "install", "playwright"]
    browser_command = [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]

    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        # check_call raises on a non-zero exit status, which aborts the sequence
        for command in (pip_command, browser_command):
            subprocess.check_call(command)
        st.success("Playwright and Chromium installed successfully!")
    except Exception as exc:
        st.error(f"Failed to install dependencies: {exc}")
        return False
    return True

# On Hugging Face Spaces, show an installer shortcut near the top of the page
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")

    # Short-circuit: the installer is only invoked on the rerun caused by a click,
    # and the restart hint is only shown when the installer reports success
    if st.button("Install Browser Dependencies") and install_playwright():
        st.info("Please restart the application after installation completes.")

# Function to check if playwright browser is available
def check_browser_installed():
    """Report whether the Playwright Python package can be imported.

    The import is attempted in a child interpreter (``sys.executable -c ...``)
    rather than in this process.

    Note that the probe only imports ``playwright.sync_api``; it does not
    check that a Chromium binary has actually been downloaded. A ``True``
    result therefore means "package present", not "browser launchable".

    Returns:
        True if the child process printed ``OK``; False if the import failed
        there or the child process could not be run at all.
    """
    probe = "from playwright.sync_api import sync_playwright; print('OK')"
    try:
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True,
            text=True,
        )
    except Exception:
        # Best-effort probe: any failure to run the child counts as "not
        # installed". Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return False
    return "OK" in result.stdout

# Probe for Playwright on every script run and tell the user what was found
browser_installed = check_browser_installed()
if not browser_installed:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")
else:
    st.success("Browser dependencies are installed.")

# Address of the page to crawl, pre-filled with a sample URL
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

# Crawl settings, grouped inside a collapsible expander
with st.expander("Advanced Options"):
    max_depth = st.slider(
        "Max Crawl Depth",
        min_value=1,
        max_value=5,
        value=1,
    )
    timeout = st.slider(
        "Timeout (seconds)",
        min_value=10,
        max_value=120,
        value=30,
    )
    max_pages = st.number_input(
        "Max Pages to Crawl",
        min_value=1,
        max_value=100,
        value=10,
    )
    crawl_mode = st.selectbox(
        "Crawl Mode",
        options=["browser", "requests"],
        index=0,
        help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content",
    )

# Function to run the crawler
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Crawl ``url`` with crawl4ai and return the outcome synchronously.

    The crawl itself is a coroutine run inside an ``AsyncWebCrawler`` context;
    this wrapper drives it to completion on an event loop so that it can be
    called from ordinary (non-async) script code.

    Args:
        url: Address handed to ``crawler.arun``.
        max_depth: Forwarded unchanged to ``crawler.arun``.
        timeout: Forwarded unchanged to ``crawler.arun``.
        max_pages: Forwarded unchanged to ``crawler.arun``.
        mode: Forwarded unchanged to ``crawler.arun``.
            NOTE(review): whether ``arun`` honours these keyword names is not
            visible from this file - confirm against the installed crawl4ai
            version.

    Returns:
        A ``(markdown, error)`` tuple: ``(result.markdown, None)`` when the
        crawl completed, or ``(None, str(exception))`` when anything raised
        while crawling.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode
                )
                return result.markdown, None
        except Exception as e:
            return None, str(e)

    # Prefer the thread's current loop (this is the path taken when
    # nest_asyncio has been applied). get_event_loop() raises RuntimeError in
    # a thread that has no current loop, so that case is handled explicitly
    # instead of relying on implicit loop creation.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None

    if loop is not None and not loop.is_closed():
        return loop.run_until_complete(_run())

    # No usable loop in this thread: run on a private one and clean it up so
    # repeated calls from short-lived threads do not leak event loops.
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(_run())
    finally:
        asyncio.set_event_loop(None)
        loop.close()

# Button to start crawling.
#
# Streamlit reruns the whole script on every widget interaction, and a button
# only evaluates to True on the single rerun caused by its own click. The
# crawl outcome is therefore stored in st.session_state and rendered outside
# the "Start Crawling" branch. Rendering it inside that branch would mean the
# follow-up install button could never fire (its click reruns the script with
# "Start Crawling" False) and the results would disappear as soon as the
# download button is clicked.
if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")

    # Drop the previous outcome first so a failing run never shows stale results
    if "crawl_outcome" in st.session_state:
        del st.session_state["crawl_outcome"]

    try:
        with st.spinner("Crawling in progress..."):
            # run_async_crawler returns a (markdown, error) tuple
            st.session_state["crawl_outcome"] = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode
            )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")

# Render the most recent crawl outcome, if any, on every rerun
if "crawl_outcome" in st.session_state:
    result, error = st.session_state["crawl_outcome"]
    try:
        if error:
            st.error(f"Crawling failed: {error}")

            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # Reachable now: this button is no longer nested under the
                # "Start Crawling" click branch
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)

            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown"
            )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")

# Footer: a horizontal rule followed by setup notes for Hugging Face Spaces
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.

**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")

# Sidebar: dependency notes, written through the st.sidebar object
# (equivalent to placing the calls inside a `with st.sidebar:` block)
st.sidebar.header("About")
st.sidebar.write("""
    This web crawler app requires Playwright and Chromium to be installed for the browser mode.
    
    If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
    
    If you're running locally, run:
    ```
    python -m playwright install --with-deps chromium
    ```
    """)