Spaces:

bonrix
/

WebsiteCrawler

Runtime error

App Files Files Community

WebsiteCrawler / app.py

bonrix

Create app.py

7c58167 almost 3 years ago

raw

history blame

4.95 kB

	import requests
	from bs4 import BeautifulSoup
	import xml.etree.ElementTree as ET
	import xml.dom.minidom
	import re
	import gradio as gr
	from urllib.parse import urlparse, urljoin


	def crawl_website(url, timeout=10):
	    """Crawl the ``.html``/``.htm`` pages reachable from *url* and write ``sitemap.xml``.

	    Starting at *url*, every page that answers with HTTP 200 is recorded and
	    its ``<a href>`` links are followed when they stay on the same host as the
	    page they were found on and their path ends in ``.html`` or ``.htm``.
	    Progress lines are collected in ``crawl_website.progress_textbox``, which
	    is reset on every call.

	    Args:
	        url: Address of the first page to fetch.
	        timeout: Seconds handed to ``requests.get`` for each page, so that a
	            stalled server cannot block the crawl forever.

	    Returns:
	        The path of the sitemap file that was written (``"sitemap.xml"``).
	    """
	    visited_urls = set()
	    unique_urls = set()

	    # Progress log shared with the other functions of this file
	    crawl_website.progress_textbox = []

	    # An explicit stack instead of recursion: a long chain of pages can no
	    # longer exhaust the interpreter's recursion limit.
	    pending = [url]
	    while pending:
	        current = pending.pop()
	        if current in visited_urls:
	            continue
	        visited_urls.add(current)

	        try:
	            response = requests.get(current, timeout=timeout)
	        except requests.exceptions.RequestException:
	            # Unreadable URL: skip it and carry on with the rest
	            continue
	        if response.status_code != 200:
	            continue

	        crawl_website.progress_textbox.append(f"Crawling: {current}")
	        unique_urls.add(current)

	        soup = BeautifulSoup(response.content, 'html.parser')
	        # Pushed in reverse so pages are popped in document order, which
	        # keeps the depth-first visiting order of a recursive crawl.
	        pending.extend(reversed(_same_host_html_links(soup, current)))

	    # Drop an "http://" URL when the same address was also found as "https://".
	    # Sorting makes the sitemap reproducible from one run to the next.
	    final_urls = sorted(
	        found
	        for found in unique_urls
	        if not (
	            found.startswith("http://")
	            and "https://" + found[len("http://"):] in unique_urls
	        )
	    )

	    with open("sitemap.xml", "w") as file:
	        file.write(_render_sitemap(final_urls))

	    return "sitemap.xml"


	def _same_host_html_links(soup, page_url):
	    """Return, in document order, the same-host .html/.htm links found in *soup*."""
	    page_netloc = urlparse(page_url).netloc
	    targets = []
	    for anchor in soup.find_all('a'):
	        href = anchor.get('href')
	        if not href or href.startswith('#'):
	            continue
	        try:
	            target = urlparse(urljoin(page_url, href))
	        except ValueError:
	            # Malformed href (for example a broken IPv6 literal): ignore it
	            continue
	        if target.netloc != page_netloc:
	            continue
	        if not target.path.endswith(('.html', '.htm')):
	            continue
	        # Without the fragment, "page.html#a" and "page.html#b" are one page
	        targets.append(target._replace(fragment="").geturl())
	    return targets


	def _render_sitemap(urls):
	    """Return *urls* as a pretty-printed sitemap XML string without blank lines."""
	    urlset = ET.Element("urlset")
	    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
	    for page_url in urls:
	        url_elem = ET.SubElement(urlset, "url")
	        ET.SubElement(url_elem, "loc").text = page_url

	    pretty = xml.dom.minidom.parseString(ET.tostring(urlset)).toprettyxml(indent=" ")
	    return "\n".join(line for line in pretty.split("\n") if line.strip())

	def extract_text_from_sitemap(sitemap_file, timeout=10):
	    """Download every ``.html``/``.htm`` page listed in a sitemap and return its text.

	    Each ``<loc>`` entry of *sitemap_file* whose URL ends in ``.html`` or
	    ``.htm`` (case-insensitive) is fetched, and the page's visible text is
	    appended after the page URL. All runs of whitespace in the result are
	    collapsed to single spaces. Progress lines are appended to
	    ``crawl_website.progress_textbox``.

	    Args:
	        sitemap_file: Path of the sitemap XML file to read.
	        timeout: Seconds handed to ``requests.get`` for each page.

	    Returns:
	        One string holding the URL and text of every page that could be
	        fetched; pages whose request fails are skipped.
	    """
	    with open(sitemap_file, 'r') as file:
	        sitemap_content = file.read()

	    sitemap_soup = BeautifulSoup(sitemap_content, 'xml')
	    urls = [loc.text for loc in sitemap_soup.find_all('loc')]

	    # The log is normally created by crawl_website(); create it here when
	    # this function is used on its own so appending cannot fail.
	    progress = getattr(crawl_website, "progress_textbox", None)
	    if progress is None:
	        progress = crawl_website.progress_textbox = []

	    sections = []
	    for url in urls:
	        if not url.lower().endswith(('.html', '.htm')):
	            continue

	        progress.append(f"Extracting text: {url}")

	        try:
	            response = requests.get(url, timeout=timeout)
	        except requests.exceptions.RequestException:
	            # One unreachable page must not abort the whole extraction
	            progress.append(f"Skipped (request failed): {url}")
	            continue

	        page_soup = BeautifulSoup(response.text, 'html.parser')
	        text = page_soup.get_text(separator=' ')
	        sections.append(f"\n{url}\n{text}\n\n")

	    # Collapse every run of whitespace (newlines included) to one space
	    return re.sub(r'\s+', ' ', "".join(sections))

	def gradio_interface(url):
	    """Crawl *url*, save the extracted text to a file, and report the progress.

	    Returns:
	        A tuple of the progress log (one line per step, newline-joined) and
	        the path of the text file that was written.
	    """
	    extracted_text = extract_text_from_sitemap(crawl_website(url))

	    text_file_path = 'extracted_text.txt'
	    with open(text_file_path, 'w', encoding='utf-8') as out:
	        out.write(extracted_text)

	    progress_log = "\n".join(crawl_website.progress_textbox)
	    return progress_log, text_file_path


	# Settings of the Gradio UI: one text input (the URL), and a text output
	# (progress log) plus a file output (extracted text).
	_interface_options = {
	    "fn": gradio_interface,
	    "inputs": "text",
	    "outputs": ["text", "file"],
	    "title": "Website Crawler",
	    "description": "Enter a website URL to crawl and extract text from web pages.",
	}

	# Build the interface and launch it with share=True
	with gr.Interface(**_interface_options) as iface:
	    iface.launch(share=True)