# Spaces: (Hugging Face Space — status page reported "Runtime error")
# Runtime error
| # import requests | |
| # from bs4 import BeautifulSoup | |
| # import xml.etree.ElementTree as ET | |
| # import openpyxl | |
| # import gradio as gr | |
| # def fetch_page_info(url): | |
| # response = requests.get(url) | |
| # if response.status_code == 200: | |
| # soup = BeautifulSoup(response.text, 'html.parser') | |
| # title = soup.find('title').get_text() if soup.find('title') else 'No title found' | |
| # keywords = soup.find('meta', {'name': 'keywords'}) | |
| # keywords = keywords.get('content') if keywords else 'No keywords found' | |
| # description = soup.find('meta', {'name': 'description'}) | |
| # description = description.get('content') if description else 'No description found' | |
| # return title, keywords, description | |
| # return None, None, None | |
| # def main_page(sitemap_url): | |
| # excel_file = None | |
| # if sitemap_url: | |
| # response = requests.get(sitemap_url) | |
| # if response.status_code == 200: | |
| # root = ET.fromstring(response.content) | |
| # title_to_urls = {} # Dictionary to store URLs grouped by title | |
| # for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url/{http://www.sitemaps.org/schemas/sitemap/0.9}loc"): | |
| # url = url_element.text | |
| # title, _, _ = fetch_page_info(url) # Fetch only title for comparison | |
| # if title in title_to_urls: | |
| # title_to_urls[title].append(url) | |
| # else: | |
| # title_to_urls[title] = [url] | |
| # workbook = openpyxl.Workbook() | |
| # sheet = workbook.active | |
| # sheet.append(["URL", "Title", "Keywords", "Description"]) | |
| # for title, urls in title_to_urls.items(): | |
| # if len(urls) > 1: # Only consider titles with multiple URLs | |
| # for url in urls: | |
| # fetched_title, keywords, description = fetch_page_info(url) | |
| # sheet.append([url, fetched_title, keywords, description]) | |
| # excel_file = "duplicate_titles.xlsx" | |
| # workbook.save(excel_file) | |
| # return excel_file | |
| # iface = gr.Interface( | |
| # fn=main_page, | |
| # inputs=[gr.inputs.Textbox(placeholder="Enter sitemap URL here")], | |
| # outputs="file", | |
| # live=True, | |
| # title="Duplicate Titles Finder and Excel Exporter", | |
| # description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.", | |
| # examples=[["http://www.embedded-innovations.com/sitemap.xml"]] | |
| # ) | |
| # if __name__ == "__main__": | |
| # iface.launch() | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urlparse, urljoin | |
| import pandas as pd | |
| from difflib import SequenceMatcher | |
| from xml.etree import ElementTree as ET | |
| import openpyxl | |
| from openpyxl import Workbook | |
| from openpyxl.styles import PatternFill | |
| from openpyxl.utils.dataframe import dataframe_to_rows | |
| import gradio as gr | |
def fetch_and_save_to_excel(sitemap_url):
    """Crawl a sitemap, find pages sharing a <title>, and export them to Excel.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemap.xml document using the sitemaps.org 0.9 schema.

    Returns
    -------
    str or None
        Path of the generated .xlsx file listing every URL whose title is
        shared by at least one other URL, or ``None`` when the input is
        empty or the sitemap itself cannot be fetched.
    """
    SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    def fetch_page_info(url):
        """Return (title, keywords, description) for *url*, or (None, None, None) on failure."""
        try:
            # timeout so one unresponsive page cannot hang the whole crawl
            response = requests.get(url, timeout=15)
        except requests.RequestException:
            return None, None, None
        if response.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'No title found'
        keywords = soup.find('meta', {'name': 'keywords'})
        keywords = keywords.get('content') if keywords else 'No keywords found'
        description = soup.find('meta', {'name': 'description'})
        description = description.get('content') if description else 'No description found'
        return title, keywords, description

    if not sitemap_url:
        return None

    try:
        response = requests.get(sitemap_url, timeout=15)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)

    # Fetch each page ONCE and cache the full (title, keywords, description)
    # tuple; the previous implementation re-downloaded every duplicate URL a
    # second time just to re-read data it had already seen.
    page_info = {}          # url -> (title, keywords, description)
    title_to_urls = {}      # title -> [urls sharing that title]
    for url_element in root.findall(f".//{SITEMAP_NS}url/{SITEMAP_NS}loc"):
        url = url_element.text
        info = fetch_page_info(url)
        page_info[url] = info
        title = info[0]
        if title is None:
            # unreachable/failed pages must not be grouped as "duplicates"
            continue
        title_to_urls.setdefault(title, []).append(url)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])
    for title, urls in title_to_urls.items():
        if len(urls) > 1:  # only titles shared by more than one URL
            for url in urls:
                fetched_title, keywords, description = page_info[url]
                sheet.append([url, fetched_title, keywords, description])

    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file
# Create the Gradio interface. Note: `allow_flagging=False` is not a valid
# value in Gradio 3+ (the parameter takes "never"/"auto"/"manual") and is a
# likely cause of the Space's startup "Runtime error" — use the string form.
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs=gr.Textbox(placeholder="Enter sitemap URL here", label="Sitemap URL"),
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

# Launch only when executed as a script so the module stays importable.
if __name__ == "__main__":
    iface.launch()