"""Simple same-domain breadth-first web crawler with a Streamlit front end."""

import json
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup


def crawl(base_url, depth, timeout=10):
    """Breadth-first crawl starting at *base_url*, staying on the same host.

    Args:
        base_url: URL where the crawl starts; its netloc defines the
            allowed domain — links pointing elsewhere are ignored.
        depth: maximum link distance from *base_url* to follow.
        timeout: per-request timeout in seconds (new, defaults to 10 so
            a hung server cannot stall the crawl indefinitely).

    Returns:
        A list of ``{'url': ..., 'content': ...}`` dicts, one per page
        successfully fetched, where ``content`` is the page's visible text.
        Pages that fail to fetch (network error or HTTP error status) are
        logged and skipped.
    """
    visited = set()
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue.
    queue = deque([(base_url, 0)])
    results = []
    base_netloc = urlparse(base_url).netloc

    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue
        visited.add(current_url)

        # Keep the try body minimal: only the network call can reasonably
        # fail here, and only network failures should be swallowed.
        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()  # don't store 404/500 error pages
        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        results.append({'url': current_url, 'content': soup.get_text()})

        # Enqueue in-domain links one level deeper.
        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                queue.append((full_url, current_depth + 1))

    return results


# --- Streamlit application (runs top-level, as Streamlit requires) ---
st.title("Custom Web Crawler Demo")
depth = st.slider("Depth", min_value=1, max_value=5, value=2)
base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")

if st.button("Crawl"):
    with st.spinner('Crawling...'):
        data = crawl(base_url, depth)
    st.write(f"Found {len(data)} pages")

    # Persist results; explicit encoding + ensure_ascii=False keeps
    # non-ASCII page text human-readable in the JSON file.
    with open('crawled_data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    st.write(data)

    # Display the first page's content for demo purposes.
    if data:
        st.write("First page content:")
        st.write(data[0]['content'])