# Custom Web Crawler Demo — Streamlit app (scraped from a Hugging Face Space page)
import json
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup
| # Function to crawl a web page | |
def crawl(base_url, depth):
    """Breadth-first crawl of pages reachable from *base_url*.

    Only links whose netloc matches base_url's netloc are followed,
    so the crawl stays within one domain.

    Args:
        base_url: Absolute starting URL (scheme + host required).
        depth: Maximum link depth to follow; 0 fetches only base_url.

    Returns:
        List of ``{'url': ..., 'content': ...}`` dicts, one per page
        successfully fetched, in breadth-first order.
    """
    visited = set()
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue
    queue = deque([(base_url, 0)])
    results = []
    base_netloc = urlparse(base_url).netloc
    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue
        visited.add(current_url)
        try:
            # timeout prevents the crawler from hanging forever on a dead host
            response = requests.get(current_url, timeout=10)
            # treat HTTP errors (404, 500, ...) as failures instead of
            # storing the error page's text as crawled content
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            results.append({'url': current_url, 'content': soup.get_text()})
            # Enqueue same-domain links found on this page.
            for link in soup.find_all('a', href=True):
                # strip the fragment so page#a and page#b aren't fetched twice
                full_url = urljoin(current_url, link['href']).split('#')[0]
                if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                    queue.append((full_url, current_depth + 1))
        except Exception as e:
            # best-effort crawl: log and continue with the remaining queue
            print(f"Failed to fetch {current_url}: {e}")
    return results
# --- Streamlit UI ---------------------------------------------------------
st.title("Custom Web Crawler Demo")
depth = st.slider("Depth", min_value=1, max_value=5, value=2)
base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")

if st.button("Crawl"):
    if not urlparse(base_url).scheme:
        # Guard: without a scheme every netloc comparison in crawl() fails
        # silently and the crawl returns zero pages.
        st.error("Please enter an absolute URL including http:// or https://")
    else:
        with st.spinner('Crawling...'):
            data = crawl(base_url, depth)
        st.write(f"Found {len(data)} pages")
        # Save the results; explicit UTF-8 avoids platform-default encodings,
        # ensure_ascii=False keeps non-ASCII page text human-readable.
        with open('crawled_data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        st.write(data)
        # Display the first page's content for demo purposes
        if data:
            st.write("First page content:")
            st.write(data[0]['content'])