# Source note (was GitHub UI residue pasted into the file):
#   bonrix's picture
#   Update app.py
#   7b786e3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr
def crawl_website_from_sitemap(sitemap_url):
    """Crawl every page listed in a sitemap and export SEO data to Excel.

    Fetches the sitemap XML, visits each listed URL plus any same-domain
    links ending in ``.html``/``.htm``, and collects page URL, <title>,
    meta keywords and meta description. The result is written to
    ``<domain>.xlsx``; titles that are near-duplicates of another page's
    title (SequenceMatcher ratio > 0.8) are highlighted in yellow.

    Parameters:
        sitemap_url: URL of a sitemap.org-format XML sitemap.

    Returns:
        Path to the generated .xlsx file, or the string
        "Error fetching sitemap" when the sitemap cannot be retrieved.
    """
    visited_urls = set()
    data = {
        'URLs': [],
        'Title': [],
        'Keywords': [],
        'Description': []
    }

    def crawl(start_url):
        # Iterative DFS (explicit stack) instead of recursion: large sites
        # would otherwise risk RecursionError.
        stack = [start_url]
        while stack:
            url = stack.pop()
            if url in visited_urls:
                continue
            visited_urls.add(url)
            parsed_url = urlparse(url)
            try:
                # Timeout so one unresponsive host cannot stall the crawl.
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                # Unreachable/unreadable URL: skip it, keep crawling.
                continue
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            # soup.title.string can be None even when a <title> tag exists
            # (e.g. empty or nested markup) — normalize to ''.
            title = (soup.title.string or '') if soup.title else ''
            meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = meta_keywords.get('content', '') if meta_keywords else ''
            meta_description = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_description.get('content', '') if meta_description else ''
            data['URLs'].append(url)
            data['Title'].append(title)
            data['Keywords'].append(meta_keywords)
            data['Description'].append(meta_description)
            # Queue same-domain .html/.htm links for visiting.
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href or href.startswith('#'):
                    continue
                absolute_url = urljoin(url, href)
                parsed_absolute = urlparse(absolute_url)
                if (parsed_absolute.netloc == parsed_url.netloc
                        and parsed_absolute.path.endswith(('.html', '.htm'))):
                    stack.append(absolute_url)

    # Fetch and parse the sitemap.
    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.exceptions.RequestException:
        return "Error fetching sitemap"
    if response.status_code != 200:
        return "Error fetching sitemap"
    root = ET.fromstring(response.content)
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    for url_element in root.iter(ns + 'url'):
        loc_element = url_element.find(ns + 'loc')
        if loc_element is not None and loc_element.text:
            crawl(loc_element.text)

    # Build the workbook from the collected data.
    df = pd.DataFrame(data)
    wb = Workbook()
    ws = wb.active
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    # BUG FIX: the original tested `cell.value in titles` where `titles` was
    # a pandas Series — `in` on a Series checks the INDEX (0..n-1), not the
    # values, so the intended highlighting never happened. Instead, compute
    # the set of titles that are near-duplicates of another title (the
    # similarity check the original computed per page but never used) and
    # highlight exactly those cells.
    titles = data['Title']
    flagged = set()
    for i, a in enumerate(titles):
        for b in titles[i + 1:]:
            if SequenceMatcher(None, a, b).ratio() > 0.8:
                flagged.add(a)
                flagged.add(b)
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for row in ws.iter_rows(min_row=2, min_col=2, max_col=2):
        if row[0].value in flagged:
            row[0].fill = fill

    # Save as <domain>.xlsx and return the path.
    domain_name = urlparse(sitemap_url).netloc
    file_path = f'{domain_name}.xlsx'
    wb.save(file_path)
    return file_path
# Build the Gradio UI: one textbox (sitemap URL) in, the generated
# Excel workbook out.
iface = gr.Interface(
    fn=crawl_website_from_sitemap,
    inputs="text",
    outputs="file",
    title="Sitemap to SEO Tracking Excel",
    description="To collect SEO data (Page URL, Title, Description, Keywords) from a sitemap URL, use a Python script with BeautifulSoup and pandas. The script crawls the sitemap, extracts data from each page, and exports the results to an Excel file for analysis.",
    # BUG FIX: allow_flagging expects "never"/"auto"/"manual"; the boolean
    # False is a deprecated legacy form.
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
)
# Launch the Gradio interface
iface.launch()