Spaces:

bonrix
/

WebsiteCrawler

Runtime error

App Files Files Community

WebsiteCrawler / app.py

bonrix

Update app.py

79f2b70 over 2 years ago

raw

history blame contribute delete

7.93 kB

	import difflib
	import html
	import re
	import xml.dom.minidom
	import xml.etree.ElementTree as ET
	from urllib.parse import urlparse, urljoin

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup


	def crawl_website(url, timeout=10):
	    """Crawl same-host ``.html``/``.htm`` pages from *url* and write ``sitemap.xml``.

	    The start URL is always fetched. From every page that answers with HTTP
	    200, ``<a href>`` links are followed when they stay on the same network
	    location and their path ends in ``.html`` or ``.htm`` (case-insensitive).
	    Pages that cannot be fetched or answer with another status are skipped.

	    Progress lines are stored on ``crawl_website.progress_textbox`` (a list
	    that is reset on every call); other functions in this file append to it.

	    Args:
	        url: Address where crawling starts.
	        timeout: Seconds to wait for each HTTP request before giving up.

	    Returns:
	        The path of the written sitemap file (``"sitemap.xml"``).
	    """
	    # Reset the shared progress log for this run.
	    crawl_website.progress_textbox = []

	    visited_urls = set()
	    unique_urls = set()

	    # An explicit stack replaces recursion so that long link chains cannot
	    # exhaust Python's recursion limit. Links are pushed in reverse order so
	    # pages are still visited depth-first in document order.
	    pending = [url]
	    while pending:
	        current = pending.pop()
	        if current in visited_urls:
	            continue
	        visited_urls.add(current)

	        try:
	            response = requests.get(current, timeout=timeout)
	        except requests.exceptions.RequestException:
	            # Unreachable page: skip it and keep crawling.
	            continue
	        if response.status_code != 200:
	            continue

	        crawl_website.progress_textbox.append(f"Crawling: {current}")
	        unique_urls.add(current)

	        current_host = urlparse(current).netloc
	        soup = BeautifulSoup(response.content, 'html.parser')
	        page_links = []
	        for link in soup.find_all('a'):
	            href = link.get('href')
	            if not href or href.startswith('#'):
	                continue
	            target = urlparse(urljoin(current, href))
	            if target.netloc != current_host:
	                continue
	            if not target.path.lower().endswith(('.html', '.htm')):
	                continue
	            # Drop the fragment so "page.html#a" and "page.html#b" count as
	            # the same page instead of being fetched and listed twice.
	            page_links.append(target._replace(fragment="").geturl())
	        pending.extend(reversed(page_links))

	    # Sorted so the sitemap content does not depend on set iteration order.
	    final_urls = sorted(_drop_http_duplicates(unique_urls))

	    sitemap_file = "sitemap.xml"
	    # NOTE: opened with the platform default encoding, matching how the
	    # sitemap readers in this file open it.
	    with open(sitemap_file, "w") as file:
	        file.write(_build_sitemap_xml(final_urls))

	    return sitemap_file


	def _drop_http_duplicates(urls):
	    """Return *urls* minus each ``http://`` entry whose ``https://`` twin is present."""
	    kept = set()
	    for page_url in urls:
	        if page_url.startswith("http://"):
	            if "https://" + page_url[len("http://"):] in urls:
	                continue
	        kept.add(page_url)
	    return kept


	def _build_sitemap_xml(urls):
	    """Return a pretty-printed sitemap document listing *urls*, without blank lines."""
	    urlset = ET.Element("urlset")
	    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
	    for page_url in urls:
	        url_elem = ET.SubElement(urlset, "url")
	        ET.SubElement(url_elem, "loc").text = page_url

	    pretty = xml.dom.minidom.parseString(ET.tostring(urlset)).toprettyxml(indent=" ")
	    return "\n".join(line for line in pretty.split("\n") if line.strip())


	def extract_text_from_sitemap(sitemap_file, timeout=10):
	    """Download every ``.html``/``.htm`` URL in a sitemap and return its text.

	    Each ``<loc>`` entry whose URL ends in ``.html`` or ``.htm`` is fetched,
	    its visible text is taken with BeautifulSoup, and the pieces are joined
	    as ``URL`` followed by the page text. All runs of whitespace in the
	    result are collapsed to a single space.

	    A line is appended to ``crawl_website.progress_textbox`` for every URL
	    that is processed; URLs that cannot be fetched are reported there and
	    skipped instead of aborting the whole extraction.

	    Args:
	        sitemap_file: Path of the sitemap XML file to read.
	        timeout: Seconds to wait for each HTTP request before giving up.

	    Returns:
	        The combined, whitespace-normalised text of all fetched pages.
	    """
	    with open(sitemap_file, 'r') as file:
	        sitemap_content = file.read()

	    sitemap_soup = BeautifulSoup(sitemap_content, 'xml')
	    urls = [loc.text for loc in sitemap_soup.find_all('loc')]

	    # The progress list is normally created by crawl_website(); create it
	    # here when this function is used on its own.
	    progress = getattr(crawl_website, "progress_textbox", None)
	    if progress is None:
	        progress = crawl_website.progress_textbox = []

	    chunks = []
	    for url in urls:
	        if not url.lower().endswith(('.html', '.htm')):
	            continue

	        progress.append(f"Extracting text: {url}")
	        try:
	            response = requests.get(url, timeout=timeout)
	        except requests.exceptions.RequestException:
	            progress.append(f"Failed to fetch: {url}")
	            continue

	        page_soup = BeautifulSoup(response.text, 'html.parser')
	        text = page_soup.get_text(separator=' ')
	        chunks.append(f"\n{url}\n{text}\n\n")

	    # Collapse every run of whitespace (including newlines) to one space.
	    return re.sub(r'\s+', ' ', "".join(chunks))


	def gradio_interface(url):
	    """Crawl *url*, dump the extracted page text to a file, and report progress.

	    Returns a tuple ``(progress, path)``: the progress lines collected on
	    ``crawl_website.progress_textbox`` joined with newlines, and the path of
	    the UTF-8 text file that holds the extracted text.
	    """
	    sitemap_path = crawl_website(url)
	    page_text = extract_text_from_sitemap(sitemap_path)

	    # Persist the text so it can be offered as a download.
	    output_path = 'extracted_text.txt'
	    with open(output_path, 'w', encoding='utf-8') as handle:
	        handle.write(page_text)

	    progress_log = "\n".join(crawl_website.progress_textbox)
	    return progress_log, output_path



	def extract_text_from_url1(url, timeout=10):
	    """Fetch *url* and return its visible text wrapped in a small HTML fragment.

	    The fragment has the form ``<p><b>URL</b></p>`` followed by
	    ``<p>TEXT</p>``. Both the URL and the text are HTML-escaped, because the
	    text returned by BeautifulSoup can contain ``<``, ``>`` and ``&`` and
	    would otherwise be read as markup in the generated page.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        The HTML fragment as a string.

	    Raises:
	        requests.exceptions.RequestException: If the page cannot be fetched.
	    """
	    response = requests.get(url, timeout=timeout)
	    soup = BeautifulSoup(response.text, 'html.parser')
	    text = soup.get_text(separator=' ').strip()
	    safe_url = html.escape(url)
	    safe_text = html.escape(text, quote=False)
	    return f"<p><b>{safe_url}</b></p>\n<p>{safe_text}</p>\n"


	def extract_text_from_sitemap1(sitemap_file, similarity_threshold=0.95):
	    """Build one HTML string from the ``.html``/``.htm`` pages of a sitemap.

	    Every matching ``<loc>`` URL is turned into an HTML fragment with
	    ``extract_text_from_url1``. A fragment is dropped when it is a
	    near-duplicate of a fragment that was already kept. The comparison is
	    done page by page: comparing against the whole concatenated output, as
	    the previous version did, pushes the ratio far below the threshold as
	    soon as more than one page has been kept, so later duplicates were never
	    detected.

	    Pages that cannot be fetched are skipped.

	    Args:
	        sitemap_file: Path of the sitemap XML file to read.
	        similarity_threshold: ``difflib`` ratio at or above which two
	            fragments count as duplicates.

	    Returns:
	        The kept fragments joined together, with every run of whitespace
	        collapsed to a single space.
	    """
	    with open(sitemap_file, 'r') as file:
	        sitemap_content = file.read()

	    soup = BeautifulSoup(sitemap_content, 'xml')
	    urls = [loc.text for loc in soup.find_all('loc')]

	    kept_fragments = []
	    processed_urls = set()
	    for url in urls:
	        if not url.lower().endswith(('.html', '.htm')) or url in processed_urls:
	            continue
	        processed_urls.add(url)

	        try:
	            fragment = extract_text_from_url1(url)
	        except requests.exceptions.RequestException:
	            # One unreachable page should not abort the whole export.
	            continue

	        if not _is_near_duplicate(fragment, kept_fragments, similarity_threshold):
	            kept_fragments.append(fragment)

	    # Collapse every run of whitespace (including newlines) to one space.
	    return re.sub(r'\s+', ' ', "".join(kept_fragments))


	def _is_near_duplicate(candidate, kept_texts, threshold):
	    """Return True if *candidate* matches any of *kept_texts* with ratio >= *threshold*."""
	    # SequenceMatcher caches its analysis of the second sequence, so the
	    # candidate is set once there and only the first sequence is swapped.
	    matcher = difflib.SequenceMatcher(None, "", candidate)
	    for kept in kept_texts:
	        matcher.set_seq1(kept)
	        # The two quick ratios are cheap upper bounds on ratio(); checking
	        # them first avoids the expensive comparison for clearly different pages.
	        if (matcher.real_quick_ratio() >= threshold
	                and matcher.quick_ratio() >= threshold
	                and matcher.ratio() >= threshold):
	            return True
	    return False

	def generate_text_file1(url):
	    """Crawl *url* and save the extracted page fragments as one HTML file.

	    The fragments produced by ``extract_text_from_sitemap1`` are wrapped in
	    ``<html><body>…</body></html>`` and written as UTF-8. Returns the path of
	    the written file.
	    """
	    html_path = 'extracted_text.html'
	    body = extract_text_from_sitemap1(crawl_website(url))
	    document = f"<html><body>{body}</body></html>"

	    with open(html_path, 'w', encoding='utf-8') as out:
	        out.write(document)

	    return html_path

	# Click handler that produces the downloadable HTML file.
	def gradio_interface1(sitemap_file):
	    """Return the path of the HTML file generated for the given input.

	    Despite its name, ``sitemap_file`` is forwarded unchanged to
	    ``generate_text_file1``, which treats it as the URL to crawl.
	    """
	    return generate_text_file1(sitemap_file)



	# Two-tab UI: the first tab produces a plain-text export plus a progress log,
	# the second produces an HTML export.
	# Components are created with gr.Textbox / gr.File. The gr.inputs.* and
	# gr.outputs.* namespaces used previously belong to the legacy Interface API
	# and are not available in current Gradio releases, so the app failed at
	# start-up.
	with gr.Blocks() as demo:
	    gr.Markdown("A website URL is entered into a web crawling tool, which navigates through the site's pages and extracts text content from each page. This process enables users to gather information from multiple web pages quickly and efficiently, facilitating data analysis, research, or content extraction for various purposes.First website crawler generates the Text file and another website crawler is generates a HTML File, Once the server responds, the crawling tool fetches the HTML content of the webpage. The HTML is then parsed to extract the structured information present in the page's elements, such as headings, paragraphs, links, images, etc.Overall, web crawling is a valuable and respectful relationship between web crawlers and website owners.")
	    with gr.Tab("Website Crawler(To generate a text file)"):
	        text_input1 = gr.Textbox()
	        progress_output = gr.Textbox(label="Progress")
	        file_output1 = gr.File(label="Download Text")
	        button1 = gr.Button("Website Crawler")

	    with gr.Tab("Website Crawler(To generate a HTML file)"):
	        text_input2 = gr.Textbox()
	        file_output2 = gr.File(label="Download HTML File")
	        button2 = gr.Button("Website Crawler")

	    def crawl_and_extract_text(url):
	        """Run the text export and return ``(progress log, file path)``."""
	        progress, file_path = gradio_interface(url)
	        return progress, file_path

	    # Event listeners are registered inside the Blocks context.
	    button1.click(crawl_and_extract_text, inputs=text_input1, outputs=[progress_output, file_output1])
	    button2.click(gradio_interface1, inputs=text_input2, outputs=file_output2)

	demo.launch()