File size: 1,956 Bytes
4fc8401
 
 
 
7e93243
 
4fc8401
 
 
 
 
 
7e93243
4fc8401
 
 
 
7e93243
4fc8401
 
 
 
 
 
 
 
d5d16b3
4fc8401
1feb939
4fc8401
 
 
 
 
 
 
 
7e93243
4fc8401
 
1feb939
4fc8401
7e93243
4fc8401
 
a625ebe
4fc8401
 
a625ebe
4fc8401
 
 
 
7e93243
4fc8401
 
 
7e93243
4fc8401
7e93243
4fc8401
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Function to crawl a web page
def crawl(base_url, depth):
    """Breadth-first crawl starting at *base_url*, restricted to its domain.

    Parameters:
        base_url (str): starting URL; only links whose netloc matches this
            URL's netloc are followed.
        depth (int): maximum link distance from base_url to crawl.

    Returns:
        list[dict]: one {'url': ..., 'content': ...} entry per successfully
        fetched page, where 'content' is the page's extracted text.
    """
    visited = set()
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue.
    queue = deque([(base_url, 0)])
    results = []
    base_netloc = urlparse(base_url).netloc

    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue

        visited.add(current_url)
        try:
            # Timeout keeps one unresponsive host from hanging the whole crawl.
            response = requests.get(current_url, timeout=10)
            # Skip HTTP error pages (404/500) instead of storing their bodies.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            results.append({'url': current_url, 'content': soup.get_text()})

            # Only enqueue children that can still be within the depth limit.
            if current_depth < depth:
                for link in soup.find_all('a', href=True):
                    # Strip the fragment so page#a and page#b aren't fetched twice.
                    full_url = urljoin(current_url, link['href']).split('#', 1)[0]
                    # Stay within the base domain; skip already-visited pages.
                    if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                        queue.append((full_url, current_depth + 1))

        except requests.RequestException as e:
            # Best-effort crawl: report the failure and keep going.
            print(f"Failed to fetch {current_url}: {e}")

    return results

# Streamlit application: collect crawl parameters, run the crawl on demand,
# persist the results to JSON, and preview them in the page.
st.title("Custom Web Crawler Demo")

depth = st.slider("Depth", min_value=1, max_value=5, value=2)
base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")

if st.button("Crawl"):
    with st.spinner('Crawling...'):
        data = crawl(base_url, depth)
        st.write(f"Found {len(data)} pages")

        # Persist the results. Explicit UTF-8 avoids UnicodeEncodeError on
        # platforms whose locale encoding can't represent scraped text, and
        # ensure_ascii=False stores that text readably instead of as \uXXXX.
        with open('crawled_data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        st.write(data)

        # Display the first page's content for demo purposes
        if data:
            st.write("First page content:")
            st.write(data[0]['content'])