# Source note (was GitHub UI residue pasted into the file):
#   bonrix's picture
#   Update app.py
#   7b786e3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr
def crawl_website_from_sitemap(sitemap_url):
    """Crawl every page listed in a sitemap and export SEO data to Excel.

    Fetches the sitemap XML, visits each listed URL plus any same-domain
    links ending in ``.html``/``.htm``, and collects page URL, <title>,
    meta keywords and meta description. The result is written to
    ``<domain>.xlsx``; titles that are near-duplicates of another page's
    title (SequenceMatcher ratio > 0.8) are highlighted in yellow.

    Parameters:
        sitemap_url: URL of a sitemap.org-format XML sitemap.

    Returns:
        Path to the generated .xlsx file, or the string
        "Error fetching sitemap" when the sitemap cannot be retrieved.
    """
    visited_urls = set()
    data = {
        'URLs': [],
        'Title': [],
        'Keywords': [],
        'Description': []
    }

    def crawl(start_url):
        # Iterative DFS (explicit stack) instead of recursion: large sites
        # would otherwise risk RecursionError.
        stack = [start_url]
        while stack:
            url = stack.pop()
            if url in visited_urls:
                continue
            visited_urls.add(url)
            parsed_url = urlparse(url)
            try:
                # Timeout so one unresponsive host cannot stall the crawl.
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                # Unreachable/unreadable URL: skip it, keep crawling.
                continue
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            # soup.title.string can be None even when a <title> tag exists
            # (e.g. empty or nested markup) — normalize to ''.
            title = (soup.title.string or '') if soup.title else ''
            meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = meta_keywords.get('content', '') if meta_keywords else ''
            meta_description = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_description.get('content', '') if meta_description else ''
            data['URLs'].append(url)
            data['Title'].append(title)
            data['Keywords'].append(meta_keywords)
            data['Description'].append(meta_description)
            # Queue same-domain .html/.htm links for visiting.
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href or href.startswith('#'):
                    continue
                absolute_url = urljoin(url, href)
                parsed_absolute = urlparse(absolute_url)
                if (parsed_absolute.netloc == parsed_url.netloc
                        and parsed_absolute.path.endswith(('.html', '.htm'))):
                    stack.append(absolute_url)

    # Fetch and parse the sitemap.
    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.exceptions.RequestException:
        return "Error fetching sitemap"
    if response.status_code != 200:
        return "Error fetching sitemap"
    root = ET.fromstring(response.content)
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    for url_element in root.iter(ns + 'url'):
        loc_element = url_element.find(ns + 'loc')
        if loc_element is not None and loc_element.text:
            crawl(loc_element.text)

    # Build the workbook from the collected data.
    df = pd.DataFrame(data)
    wb = Workbook()
    ws = wb.active
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    # BUG FIX: the original tested `cell.value in titles` where `titles` was
    # a pandas Series — `in` on a Series checks the INDEX (0..n-1), not the
    # values, so the intended highlighting never happened. Instead, compute
    # the set of titles that are near-duplicates of another title (the
    # similarity check the original computed per page but never used) and
    # highlight exactly those cells.
    titles = data['Title']
    flagged = set()
    for i, a in enumerate(titles):
        for b in titles[i + 1:]:
            if SequenceMatcher(None, a, b).ratio() > 0.8:
                flagged.add(a)
                flagged.add(b)
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for row in ws.iter_rows(min_row=2, min_col=2, max_col=2):
        if row[0].value in flagged:
            row[0].fill = fill

    # Save as <domain>.xlsx and return the path.
    domain_name = urlparse(sitemap_url).netloc
    file_path = f'{domain_name}.xlsx'
    wb.save(file_path)
    return file_path
# Build the Gradio UI: one textbox (sitemap URL) in, the generated
# Excel workbook out.
iface = gr.Interface(
    fn=crawl_website_from_sitemap,
    inputs="text",
    outputs="file",
    title="Sitemap to SEO Tracking Excel",
    description="To collect SEO data (Page URL, Title, Description, Keywords) from a sitemap URL, use a Python script with BeautifulSoup and pandas. The script crawls the sitemap, extracts data from each page, and exports the results to an Excel file for analysis.",
    # BUG FIX: allow_flagging expects "never"/"auto"/"manual"; the boolean
    # False is a deprecated legacy form.
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
)
# Launch the Gradio interface
iface.launch()