# app.py — "Update app.py", commit 19b9793 (header residue from the hosting page, kept as a comment)
# import requests
# from bs4 import BeautifulSoup
# import xml.etree.ElementTree as ET
# import openpyxl
# import gradio as gr
# def fetch_page_info(url):
# response = requests.get(url)
# if response.status_code == 200:
# soup = BeautifulSoup(response.text, 'html.parser')
# title = soup.find('title').get_text() if soup.find('title') else 'No title found'
# keywords = soup.find('meta', {'name': 'keywords'})
# keywords = keywords.get('content') if keywords else 'No keywords found'
# description = soup.find('meta', {'name': 'description'})
# description = description.get('content') if description else 'No description found'
# return title, keywords, description
# return None, None, None
# def main_page(sitemap_url):
# excel_file = None
# if sitemap_url:
# response = requests.get(sitemap_url)
# if response.status_code == 200:
# root = ET.fromstring(response.content)
# title_to_urls = {} # Dictionary to store URLs grouped by title
# for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url/{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
# url = url_element.text
# title, _, _ = fetch_page_info(url) # Fetch only title for comparison
# if title in title_to_urls:
# title_to_urls[title].append(url)
# else:
# title_to_urls[title] = [url]
# workbook = openpyxl.Workbook()
# sheet = workbook.active
# sheet.append(["URL", "Title", "Keywords", "Description"])
# for title, urls in title_to_urls.items():
# if len(urls) > 1: # Only consider titles with multiple URLs
# for url in urls:
# fetched_title, keywords, description = fetch_page_info(url)
# sheet.append([url, fetched_title, keywords, description])
# excel_file = "duplicate_titles.xlsx"
# workbook.save(excel_file)
# return excel_file
# iface = gr.Interface(
# fn=main_page,
# inputs=[gr.inputs.Textbox(placeholder="Enter sitemap URL here")],
# outputs="file",
# live=True,
# title="Duplicate Titles Finder and Excel Exporter",
# description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
# examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
# )
# if __name__ == "__main__":
# iface.launch()
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
import openpyxl
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr
def fetch_and_save_to_excel(sitemap_url):
    """Find sitemap pages that share a duplicate <title> and export them to Excel.

    Downloads the sitemap at *sitemap_url*, fetches every listed page once,
    groups URLs by their page title, and writes the rows of any title that
    appears on more than one URL to ``duplicate_titles.xlsx``.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemaps.org-format XML sitemap.

    Returns
    -------
    str | None
        Path of the saved Excel file, or ``None`` when the input is empty,
        the sitemap cannot be fetched, or its content is not valid XML.
    """
    # sitemaps.org XML namespace used by <url>/<loc> elements.
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    def fetch_page_info(url):
        """Return (title, keywords, description) for *url*; (None, None, None) on failure."""
        try:
            # Bounded timeout so one slow page cannot hang the whole run.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return None, None, None
        if response.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'No title found'
        keywords = soup.find('meta', {'name': 'keywords'})
        keywords = keywords.get('content') if keywords else 'No keywords found'
        description = soup.find('meta', {'name': 'description'})
        description = description.get('content') if description else 'No description found'
        return title, keywords, description

    if not sitemap_url:
        return None
    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.RequestException:
        # Network failure fetching the sitemap itself.
        return None
    if response.status_code != 200:
        return None
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        # Response was not a valid XML sitemap.
        return None

    # Fetch each page exactly ONCE and remember the full info; the previous
    # version re-fetched every duplicate URL a second time when writing rows.
    page_info = {}      # url -> (title, keywords, description)
    title_to_urls = {}  # title -> list of URLs sharing that title
    for url_element in root.findall(".//" + ns + "url/" + ns + "loc"):
        url = url_element.text
        info = fetch_page_info(url)
        page_info[url] = info
        # Pages that failed to fetch have title None and are grouped together;
        # skip them so failures are not reported as "duplicates" of each other.
        if info[0] is not None:
            title_to_urls.setdefault(info[0], []).append(url)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])
    for title, urls in title_to_urls.items():
        if len(urls) > 1:  # only titles shared by multiple URLs are duplicates
            for url in urls:
                fetched_title, keywords, description = page_info[url]
                sheet.append([url, fetched_title, keywords, description])
    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file
# Create a Gradio interface: a single text input (the sitemap URL) wired
# straight to fetch_and_save_to_excel; the returned xlsx path is offered
# as a downloadable file.
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    # NOTE(review): newer Gradio releases expect allow_flagging="never"
    # rather than a boolean — confirm against the pinned gradio version.
    allow_flagging=False,
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
)
# Launch the Gradio interface (blocking call; serves the web UI).
iface.launch()