import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

visited_urls = set()
unique_urls = set()
def create_sitemap_from_url(home_page_url):
    # Reset module-level state so repeated Gradio submissions start fresh
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Extract the domain so the crawl stays on the same site
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Only parse pages that were fetched successfully
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            unique_urls.add(url)
            # Extract and visit every link on the page
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Resolve relative links against the current page
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)
                    # Follow only same-domain links that are not image or binary files
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        crawl_website(absolute_url)

    # Crawl the site starting from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also collected
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://") and "https://" + url[len("http://"):] in unique_urls:
            continue
        final_urls.add(url)
    return final_urls
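
# Note: the recursive crawl above can hit Python's recursion limit on large
# sites. Below is a minimal iterative sketch of the same traversal; it is an
# alternative the app does not use, and the helper name is hypothetical.
def crawl_website_iteratively(home_page_url):
    from collections import deque
    domain = urlparse(home_page_url).netloc
    seen, found = set(), set()
    queue = deque([home_page_url])
    while queue:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        if response.status_code != 200:
            continue
        found.add(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and not href.startswith('#'):
                absolute_url = urljoin(url, href)
                parsed = urlparse(absolute_url)
                # Apply the same same-domain and file-extension rules as above
                if parsed.netloc == domain and not parsed.path.endswith(
                        ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                    queue.append(absolute_url)
    return found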
def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a page's title, meta keywords, and meta description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary mapping each title to the URLs that share it
        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison
            title_to_urls.setdefault(title, []).append(url)

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        # Write out only titles shared by more than one URL
        for title, duplicate_urls in title_to_urls.items():
            if len(duplicate_urls) > 1:
                for url in duplicate_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file
    return None
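
# The original import list included difflib.SequenceMatcher, suggesting fuzzy
# title matching was intended. A minimal sketch of that idea (a hypothetical
# helper, not wired into the app): treat two titles as duplicates when their
# similarity ratio crosses a threshold.
def titles_are_similar(title_a, title_b, threshold=0.9):
    from difflib import SequenceMatcher
    if not title_a or not title_b:
        return False
    # ratio() returns a similarity score between 0.0 and 1.0
    return SequenceMatcher(None, title_a, title_b).ratio() >= threshold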
# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
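
# For headless use (an assumption, not part of the original Space), the
# exporter can also be called directly; this writes duplicate_titles.xlsx to
# the working directory:
#   fetch_and_save_to_excel("http://www.embedded-innovations.com/")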