# app.py — Streamlit web-crawler demo
# (Hugging Face Space "stepai", author SurajJha21, commit 4fc8401)
# Standard library
import json
from collections import deque
from urllib.parse import urljoin, urlparse

# Third-party
import requests
import streamlit as st
from bs4 import BeautifulSoup
def crawl(base_url, depth):
    """Breadth-first crawl of pages within *base_url*'s domain.

    Parameters
    ----------
    base_url : str
        Starting URL; only links whose netloc matches it are followed.
    depth : int
        Maximum link depth to follow (0 = fetch only the base page).

    Returns
    -------
    list[dict]
        One ``{'url': ..., 'content': ...}`` entry per successfully
        fetched page, in BFS order.
    """
    visited = set()
    # deque gives O(1) popleft; list.pop(0) shifts the whole list, O(n).
    queue = deque([(base_url, 0)])
    results = []
    base_netloc = urlparse(base_url).netloc
    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth:
            continue
        if current_url in visited:
            continue
        visited.add(current_url)
        try:
            # Timeout keeps one slow/unresponsive server from hanging the UI.
            response = requests.get(current_url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            results.append({'url': current_url, 'content': soup.get_text()})
            # Enqueue same-domain links for the next depth level.
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                # Skip already-visited URLs; remaining duplicates in the
                # queue are filtered by the visited check on dequeue.
                if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                    queue.append((full_url, current_depth + 1))
        except Exception as e:
            # Best-effort crawl: log and skip pages that fail to fetch/parse.
            print(f"Failed to fetch {current_url}: {e}")
    return results
# --- Streamlit UI -----------------------------------------------------------
st.title("Custom Web Crawler Demo")
depth = st.slider("Depth", min_value=1, max_value=5, value=2)
base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")

if st.button("Crawl"):
    with st.spinner('Crawling...'):
        data = crawl(base_url, depth)
    st.write(f"Found {len(data)} pages")
    # Save results to disk. Explicit UTF-8 avoids the platform-default
    # encoding failing on non-ASCII page text; ensure_ascii=False keeps
    # that text human-readable in the JSON file.
    with open('crawled_data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    st.write(data)
    # Show the first page's content as a quick demo of the crawl output.
    if data:
        st.write("First page content:")
        st.write(data[0]['content'])