# Custom Web Crawler Demo — Streamlit app (scraped from a Hugging Face Space page)
import json
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup
| # Function to crawl a web page | |
def crawl(base_url, depth):
    """Breadth-first crawl of pages reachable from *base_url*.

    Only links whose netloc matches base_url's netloc are followed,
    so the crawl stays within one domain.

    Args:
        base_url: Absolute starting URL (scheme + host required).
        depth: Maximum link depth to follow; 0 fetches only base_url.

    Returns:
        List of ``{'url': ..., 'content': ...}`` dicts, one per page
        successfully fetched, in breadth-first order.
    """
    visited = set()
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue
    queue = deque([(base_url, 0)])
    results = []
    base_netloc = urlparse(base_url).netloc
    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue
        visited.add(current_url)
        try:
            # timeout prevents the crawler from hanging forever on a dead host
            response = requests.get(current_url, timeout=10)
            # treat HTTP errors (404, 500, ...) as failures instead of
            # storing the error page's text as crawled content
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            results.append({'url': current_url, 'content': soup.get_text()})
            # Enqueue same-domain links found on this page.
            for link in soup.find_all('a', href=True):
                # strip the fragment so page#a and page#b aren't fetched twice
                full_url = urljoin(current_url, link['href']).split('#')[0]
                if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                    queue.append((full_url, current_depth + 1))
        except Exception as e:
            # best-effort crawl: log and continue with the remaining queue
            print(f"Failed to fetch {current_url}: {e}")
    return results
# --- Streamlit UI ---------------------------------------------------------
st.title("Custom Web Crawler Demo")
depth = st.slider("Depth", min_value=1, max_value=5, value=2)
base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")

if st.button("Crawl"):
    if not urlparse(base_url).scheme:
        # Guard: without a scheme every netloc comparison in crawl() fails
        # silently and the crawl returns zero pages.
        st.error("Please enter an absolute URL including http:// or https://")
    else:
        with st.spinner('Crawling...'):
            data = crawl(base_url, depth)
        st.write(f"Found {len(data)} pages")
        # Save the results; explicit UTF-8 avoids platform-default encodings,
        # ensure_ascii=False keeps non-ASCII page text human-readable.
        with open('crawled_data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        st.write(data)
        # Display the first page's content for demo purposes
        if data:
            st.write("First page content:")
            st.write(data[0]['content'])