# app.py — "Update app.py", commit 19b9793 (header residue from the hosting page, kept as a comment)
# import requests
# from bs4 import BeautifulSoup
# import xml.etree.ElementTree as ET
# import openpyxl
# import gradio as gr
# def fetch_page_info(url):
# response = requests.get(url)
# if response.status_code == 200:
# soup = BeautifulSoup(response.text, 'html.parser')
# title = soup.find('title').get_text() if soup.find('title') else 'No title found'
# keywords = soup.find('meta', {'name': 'keywords'})
# keywords = keywords.get('content') if keywords else 'No keywords found'
# description = soup.find('meta', {'name': 'description'})
# description = description.get('content') if description else 'No description found'
# return title, keywords, description
# return None, None, None
# def main_page(sitemap_url):
# excel_file = None
# if sitemap_url:
# response = requests.get(sitemap_url)
# if response.status_code == 200:
# root = ET.fromstring(response.content)
# title_to_urls = {} # Dictionary to store URLs grouped by title
# for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url/{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
# url = url_element.text
# title, _, _ = fetch_page_info(url) # Fetch only title for comparison
# if title in title_to_urls:
# title_to_urls[title].append(url)
# else:
# title_to_urls[title] = [url]
# workbook = openpyxl.Workbook()
# sheet = workbook.active
# sheet.append(["URL", "Title", "Keywords", "Description"])
# for title, urls in title_to_urls.items():
# if len(urls) > 1: # Only consider titles with multiple URLs
# for url in urls:
# fetched_title, keywords, description = fetch_page_info(url)
# sheet.append([url, fetched_title, keywords, description])
# excel_file = "duplicate_titles.xlsx"
# workbook.save(excel_file)
# return excel_file
# iface = gr.Interface(
# fn=main_page,
# inputs=[gr.inputs.Textbox(placeholder="Enter sitemap URL here")],
# outputs="file",
# live=True,
# title="Duplicate Titles Finder and Excel Exporter",
# description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
# examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
# )
# if __name__ == "__main__":
# iface.launch()
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
import openpyxl
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr
def fetch_and_save_to_excel(sitemap_url):
    """Find sitemap pages that share a duplicate <title> and export them to Excel.

    Downloads the sitemap at *sitemap_url*, fetches every listed page once,
    groups URLs by their page title, and writes the rows of any title that
    appears on more than one URL to ``duplicate_titles.xlsx``.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemaps.org-format XML sitemap.

    Returns
    -------
    str | None
        Path of the saved Excel file, or ``None`` when the input is empty,
        the sitemap cannot be fetched, or its content is not valid XML.
    """
    # sitemaps.org XML namespace used by <url>/<loc> elements.
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    def fetch_page_info(url):
        """Return (title, keywords, description) for *url*; (None, None, None) on failure."""
        try:
            # Bounded timeout so one slow page cannot hang the whole run.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return None, None, None
        if response.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'No title found'
        keywords = soup.find('meta', {'name': 'keywords'})
        keywords = keywords.get('content') if keywords else 'No keywords found'
        description = soup.find('meta', {'name': 'description'})
        description = description.get('content') if description else 'No description found'
        return title, keywords, description

    if not sitemap_url:
        return None
    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.RequestException:
        # Network failure fetching the sitemap itself.
        return None
    if response.status_code != 200:
        return None
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        # Response was not a valid XML sitemap.
        return None

    # Fetch each page exactly ONCE and remember the full info; the previous
    # version re-fetched every duplicate URL a second time when writing rows.
    page_info = {}      # url -> (title, keywords, description)
    title_to_urls = {}  # title -> list of URLs sharing that title
    for url_element in root.findall(".//" + ns + "url/" + ns + "loc"):
        url = url_element.text
        info = fetch_page_info(url)
        page_info[url] = info
        # Pages that failed to fetch have title None and are grouped together;
        # skip them so failures are not reported as "duplicates" of each other.
        if info[0] is not None:
            title_to_urls.setdefault(info[0], []).append(url)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])
    for title, urls in title_to_urls.items():
        if len(urls) > 1:  # only titles shared by multiple URLs are duplicates
            for url in urls:
                fetched_title, keywords, description = page_info[url]
                sheet.append([url, fetched_title, keywords, description])
    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file
# Create a Gradio interface: a single text input (the sitemap URL) wired
# straight to fetch_and_save_to_excel; the returned xlsx path is offered
# as a downloadable file.
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    # NOTE(review): newer Gradio releases expect allow_flagging="never"
    # rather than a boolean — confirm against the pinned gradio version.
    allow_flagging=False,
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
)
# Launch the Gradio interface (blocking call; serves the web UI).
iface.launch()