import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr

# Sitemap XML namespace used by the <urlset>/<url>/<loc> elements.
_SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
# Per-request timeout (seconds) so one hung server cannot stall the crawl.
_REQUEST_TIMEOUT = 10
# Two titles count as "similar" above this SequenceMatcher ratio.
_SIMILARITY_THRESHOLD = 0.8


def _similar_title_rows(titles, threshold=_SIMILARITY_THRESHOLD):
    """Return the set of 0-based indices of titles that near-duplicate another.

    A title is flagged when its SequenceMatcher ratio against any *other*
    title in ``titles`` exceeds ``threshold``.
    """
    flagged = set()
    for i, first in enumerate(titles):
        for j in range(i + 1, len(titles)):
            if SequenceMatcher(None, first, titles[j]).ratio() > threshold:
                flagged.add(i)
                flagged.add(j)
    return flagged


def crawl_website_from_sitemap(sitemap_url):
    """Crawl pages reachable from *sitemap_url* and export SEO data to Excel.

    Fetches the sitemap, visits each listed URL (plus same-domain
    ``.html``/``.htm`` links discovered on those pages), collects the page
    title and the ``keywords``/``description`` meta tags, and writes
    ``<domain>.xlsx`` with near-duplicate titles highlighted in yellow.

    Returns the Excel file path on success, or the string
    ``"Error fetching sitemap"`` when the sitemap cannot be retrieved.
    """
    visited_urls = set()
    data = {'URLs': [], 'Title': [], 'Keywords': [], 'Description': []}

    def crawl(url):
        # Best-effort page visit: skip duplicates, swallow network errors.
        if url in visited_urls:
            return
        visited_urls.add(url)

        parsed_url = urlparse(url)
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            return  # unreachable page; keep crawling the rest
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.title.string if soup.title else ''

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        meta_keywords = (meta_keywords['content']
                         if meta_keywords and 'content' in meta_keywords.attrs
                         else '')
        meta_description = soup.find('meta', attrs={'name': 'description'})
        meta_description = (meta_description['content']
                            if meta_description and 'content' in meta_description.attrs
                            else '')

        data['URLs'].append(url)
        data['Title'].append(title)
        data['Keywords'].append(meta_keywords)
        data['Description'].append(meta_description)

        # Follow same-domain links that look like HTML pages.
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or href.startswith('#'):
                continue
            absolute_url = urljoin(url, href)
            parsed_absolute_url = urlparse(absolute_url)
            if (parsed_absolute_url.netloc == parsed_url.netloc
                    and parsed_absolute_url.path.endswith(('.html', '.htm'))):
                # crawl() handles its own request errors; no try needed here.
                crawl(absolute_url)

    # Fetch and parse the sitemap.
    try:
        response = requests.get(sitemap_url, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        return "Error fetching sitemap"
    if response.status_code != 200:
        return "Error fetching sitemap"

    # NOTE(review): ET.fromstring parses untrusted remote XML; consider
    # defusedxml if hostile input is a concern.
    root = ET.fromstring(response.content)
    for url_element in root.iter(_SITEMAP_NS + 'url'):
        loc_element = url_element.find(_SITEMAP_NS + 'loc')
        if loc_element is not None:
            crawl(loc_element.text)

    df = pd.DataFrame(data)
    domain_name = urlparse(sitemap_url).netloc

    # Write the collected rows to a workbook.
    wb = Workbook()
    ws = wb.active
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    # BUG FIX: the original tested ``cell.value in df['Title']``, which checks
    # the Series *index* (0, 1, 2, ...) rather than the title strings, so no
    # cell was ever highlighted. Highlight titles that near-duplicate another
    # (SequenceMatcher ratio > 0.8, the threshold the original computed into
    # an unused variable).
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for row_idx in _similar_title_rows(list(df['Title'])):
        # +2 accounts for the header row and openpyxl's 1-based rows;
        # column 2 is "Title".
        ws.cell(row=row_idx + 2, column=2).fill = fill

    file_path = f'{domain_name}.xlsx'
    wb.save(file_path)
    return file_path


# Gradio UI: takes a sitemap URL, returns the generated Excel file.
iface = gr.Interface(
    fn=crawl_website_from_sitemap,
    inputs="text",
    outputs="file",
    title="Sitemap to SEO Tracking Excel",
    description=(
        "To collect SEO data (Page URL, Title, Description, Keywords) from a "
        "sitemap URL, use a Python script with BeautifulSoup and pandas. The "
        "script crawls the sitemap, extracts data from each page, and exports "
        "the results to an Excel file for analysis."
    ),
    allow_flagging=False,
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

if __name__ == "__main__":
    iface.launch()