rairo committed on
Commit
e1a4de5
·
verified ·
1 Parent(s): fdadbb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -24
app.py CHANGED
@@ -1,5 +1,8 @@
1
  import streamlit as st
2
  import asyncio
 
 
 
3
  from crawl4ai import AsyncWebCrawler
4
  import nest_asyncio
5
 
@@ -10,6 +13,49 @@ st.set_page_config(page_title="Web Crawler App", layout="wide")
10
 
11
  st.title("Web Crawler App")
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Input for URL
14
  url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
15
 
@@ -18,48 +64,86 @@ with st.expander("Advanced Options"):
18
  max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
19
  timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
20
  max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
 
 
21
 
22
  # Function to run the crawler
23
- def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10):
24
  async def _run():
25
- async with AsyncWebCrawler() as crawler:
26
- result = await crawler.arun(
27
- url=url,
28
- max_depth=max_depth,
29
- timeout=timeout,
30
- max_pages=max_pages
31
- )
32
- return result.markdown
 
 
 
 
33
 
34
  # Use the current event loop with nest_asyncio applied
35
  return asyncio.get_event_loop().run_until_complete(_run())
36
 
37
  # Button to start crawling
38
  if st.button("Start Crawling"):
 
 
 
39
  try:
40
  with st.spinner("Crawling in progress..."):
41
- result = run_async_crawler(
42
  url=url,
43
  max_depth=max_depth,
44
  timeout=timeout,
45
- max_pages=max_pages
 
46
  )
47
 
48
- # Display the results
49
- st.subheader("Crawl Results")
50
- st.markdown(result)
51
-
52
- # Option to download results
53
- st.download_button(
54
- label="Download Results",
55
- data=result,
56
- file_name="crawl_results.md",
57
- mime="text/markdown"
58
- )
 
 
 
 
 
 
 
 
59
  except Exception as e:
60
  st.error(f"An error occurred: {str(e)}")
61
- st.error("If you're seeing browser launch errors, make sure you have the required dependencies installed.")
62
 
63
  # Add footer with information
64
  st.markdown("---")
65
- st.info("This app uses the crawl4ai library to extract content from web pages. The crawler may require additional dependencies if it's using a headless browser.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import asyncio
3
+ import subprocess
4
+ import sys
5
+ import os
6
  from crawl4ai import AsyncWebCrawler
7
  import nest_asyncio
8
 
 
13
 
14
  st.title("Web Crawler App")
15
 
16
+ # Check if we're on Hugging Face space
17
+ is_hf_space = os.environ.get("SPACE_ID") is not None
18
+
19
+ # Function to install dependencies
20
def install_playwright():
    """Install Playwright and its Chromium browser into the current env.

    Runs two subcommands in the running interpreter: ``pip install playwright``
    followed by ``playwright install --with-deps chromium``. Progress and
    outcome are reported through Streamlit status widgets.

    Returns:
        bool: True if both install steps exited successfully, False otherwise.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        # Install the Python package first, then download the Chromium binary
        # plus its OS-level dependencies.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])
        st.success("Playwright and Chromium installed successfully!")
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: a step exited non-zero; OSError: the interpreter
        # could not be spawned.  These are the realistic failure modes here —
        # a blanket `except Exception` would hide unrelated programming errors.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False
30
+
31
+ # Display installation status section at the top for Hugging Face spaces
32
+ if is_hf_space:
33
+ st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
34
+
35
+ if st.button("Install Browser Dependencies"):
36
+ success = install_playwright()
37
+ if success:
38
+ st.info("Please restart the application after installation completes.")
39
+
40
+ # Function to check if playwright browser is available
41
def check_browser_installed():
    """Return True if the ``playwright`` package imports in a fresh interpreter.

    Probes in a subprocess so a broken installation cannot affect this
    process.  Note this only verifies the package is importable; it does not
    prove a Chromium binary has actually been downloaded.

    Returns:
        bool: True when the probe printed "OK", False on any failure.
    """
    try:
        probe = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
        )
        return "OK" in probe.stdout
    except (subprocess.SubprocessError, OSError):
        # The original bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; only subprocess-launch failures should degrade to False.
        return False
51
+
52
+ # Display browser status
53
+ browser_installed = check_browser_installed()
54
+ if browser_installed:
55
+ st.success("Browser dependencies are installed.")
56
+ else:
57
+ st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")
58
+
59
  # Input for URL
60
  url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
61
 
 
64
  max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
65
  timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
66
  max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
67
+ crawl_mode = st.selectbox("Crawl Mode", options=["browser", "requests"], index=0,
68
+ help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content")
69
 
70
  # Function to run the crawler
71
def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run the async crawler synchronously and return ``(markdown, error)``.

    Exactly one element of the returned pair is non-None: the crawled
    markdown string on success, or the stringified error on failure.
    Keyword arguments are forwarded to ``AsyncWebCrawler.arun``.

    Args:
        url: Page to crawl.
        max_depth: Link-follow depth passed to the crawler.
        timeout: Per-crawl timeout in seconds.
        max_pages: Upper bound on pages visited.
        mode: "browser" (Playwright) or "requests" — TODO confirm the
            crawl4ai version in use accepts a ``mode`` kwarg.
    """
    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            # Crawl failures are returned as data rather than raised so the
            # Streamlit UI can render them without a traceback.
            return None, str(e)

    # nest_asyncio (applied at import time) allows re-entering a loop that
    # Streamlit may already be running.  On Python 3.10+ get_event_loop()
    # raises RuntimeError when no loop exists in this thread, so create one
    # on demand instead of crashing.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_run())
88
 
89
# Button to start crawling: runs the crawler and renders results or errors.
if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")

    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode,
            )

        if error:
            st.error(f"Crawling failed: {error}")

            if "browser" in error.lower():
                # BUG FIX: the previous nested st.button("Attempt to install
                # browser dependencies") could never fire — clicking it reruns
                # the script, the outer "Start Crawling" button reads False on
                # that rerun, and the nested widget disappears before its True
                # state is ever observed.  Direct the user to the persistent
                # top-level installer instead.
                st.error(
                    "This appears to be a browser-related error. "
                    "Use the 'Install Browser Dependencies' button at the top "
                    "of the page (or run the install command shown in the "
                    "sidebar), then try again."
                )
        else:
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)

            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown",
            )
    except Exception as e:
        # Last-resort guard so an unexpected failure surfaces in the UI
        # instead of killing the Streamlit script run.
        st.error(f"An error occurred: {str(e)}")
 
125
 
126
  # Add footer with information
127
  st.markdown("---")
128
+ st.info("""
129
+ This app uses the crawl4ai library to extract content from web pages.
130
+
131
+ **Hugging Face Space Setup Instructions:**
132
+ 1. After launching the space, click "Install Browser Dependencies"
133
+ 2. Restart the space after installation completes
134
+ 3. You should now be able to crawl websites with the browser mode
135
+ """)
136
+
137
+ # Add a sidebar with information about the dependencies
138
+ with st.sidebar:
139
+ st.header("About")
140
+ st.write("""
141
+ This web crawler app requires Playwright and Chromium to be installed for the browser mode.
142
+
143
+ If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.
144
+
145
+ If you're running locally, run:
146
+ ```
147
+ python -m playwright install --with-deps chromium
148
+ ```
149
+ """)