# NOTE: the lines below ("Spaces:", "Runtime error") were status text copied
# from the Hugging Face Spaces hosting page, not part of the program. They are
# kept as comments so the file remains valid Python.
# Spaces: Runtime error / Runtime error
# Standard library
from difflib import SequenceMatcher
from urllib.parse import urlparse, urljoin
from xml.etree import ElementTree as ET

# Third-party
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
def crawl_website_from_sitemap(sitemap_url):
    """Crawl every page reachable from a sitemap and export SEO data to Excel.

    Fetches the sitemap XML at *sitemap_url*, then crawls each listed page,
    following same-domain links whose path ends in ``.html``/``.htm``. For
    every successfully fetched page it records the URL, <title>, meta
    keywords and meta description. Titles that are near-duplicates of
    another collected title (SequenceMatcher ratio > 0.8, which includes
    exact duplicates) are highlighted in yellow in the output workbook.

    Args:
        sitemap_url: URL of a sitemaps.org-format XML sitemap.

    Returns:
        The path of the written ``<domain>.xlsx`` workbook, or the string
        ``"Error fetching sitemap"`` when the sitemap cannot be retrieved.
    """
    visited_urls = set()
    data = {
        'URLs': [],
        'Title': [],
        'Keywords': [],
        'Description': []
    }

    def crawl(start_url):
        # Iterative traversal with an explicit stack: the original recursed
        # once per page, which can exhaust Python's recursion limit on sites
        # with long link chains.
        pending = [start_url]
        while pending:
            url = pending.pop()
            if url in visited_urls:
                continue
            visited_urls.add(url)
            parsed_url = urlparse(url)
            try:
                # Timeout so a single unresponsive host cannot hang the crawl.
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                continue  # skip unreadable URLs
            if response.status_code != 200:
                continue  # skip unsuccessful requests
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.title.string if soup.title else ''
            # Meta tags: Tag.get returns '' when the attribute is absent.
            meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = meta_keywords.get('content', '') if meta_keywords else ''
            meta_description = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_description.get('content', '') if meta_description else ''
            data['URLs'].append(url)
            data['Title'].append(title)
            data['Keywords'].append(meta_keywords)
            data['Description'].append(meta_description)
            # Queue same-domain .html/.htm links (image and external URLs are
            # excluded by the netloc and suffix checks).
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href or href.startswith('#'):
                    continue
                absolute_url = urljoin(url, href)
                parsed_absolute_url = urlparse(absolute_url)
                if (parsed_absolute_url.netloc == parsed_url.netloc
                        and parsed_absolute_url.path.endswith(('.html', '.htm'))):
                    pending.append(absolute_url)

    # Fetch and parse the sitemap XML.
    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.exceptions.RequestException:
        return "Error fetching sitemap"
    if response.status_code != 200:
        return "Error fetching sitemap"
    root = ET.fromstring(response.content)

    # Crawl every <loc> URL listed in the sitemap.
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    for url_element in root.iter(ns + 'url'):
        loc_element = url_element.find(ns + 'loc')
        if loc_element is not None and loc_element.text:
            crawl(loc_element.text.strip())

    df = pd.DataFrame(data)
    domain_name = urlparse(sitemap_url).netloc

    # Write the DataFrame to a worksheet.
    wb = Workbook()
    ws = wb.active
    for r in dataframe_to_rows(df, index=False, header=True):
        ws.append(r)

    # Highlight titles that are near-duplicates of another collected title.
    # NOTE(review): the original tested `cell.value in titles` against a
    # pandas Series, which checks the *index* (0, 1, 2, ...) rather than the
    # title values, so nothing was ever highlighted; it also computed a
    # per-page `similar_titles` list and discarded it. This implements the
    # evident intent: flag any title within ratio > 0.8 of another.
    titles = list(df['Title'])
    similar = set()
    for i, a in enumerate(titles):
        for b in titles[i + 1:]:
            if SequenceMatcher(None, a, b).ratio() > 0.8:
                similar.add(a)
                similar.add(b)
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for row in ws.iter_rows(min_row=2, min_col=2, max_col=2):  # column B = Title
        cell = row[0]
        if cell.value in similar:
            cell.fill = fill

    # Save the workbook as <domain>.xlsx and return its path.
    file_path = f'{domain_name}.xlsx'
    wb.save(file_path)
    return file_path
| # Create a Gradio interface | |
# Wire the crawler into a Gradio app: one text input (the sitemap URL) that
# produces a downloadable Excel file.
demo = gr.Interface(
    fn=crawl_website_from_sitemap,
    inputs="text",
    outputs="file",
    title="Sitemap to SEO Tracking Excel",
    description=(
        "To collect SEO data (Page URL, Title, Description, Keywords) from a "
        "sitemap URL, use a Python script with BeautifulSoup and pandas. The "
        "script crawls the sitemap, extracts data from each page, and exports "
        "the results to an Excel file for analysis."
    ),
    allow_flagging=False,
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

# Start the web server for the interface.
demo.launch()