import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

visited_urls = set()
unique_urls = set()
def create_sitemap_from_url(home_page_url):
    # Reset module-level state so repeated Gradio submissions start fresh
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Extract the domain so the crawl stays on the same site
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Only parse pages that were fetched successfully
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            unique_urls.add(url)
            # Extract and visit every link on the page
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Resolve relative links against the current page
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)
                    # Follow only same-domain links that are not image or binary files
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        crawl_website(absolute_url)

    # Crawl the site starting from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also collected
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://") and "https://" + url[len("http://"):] in unique_urls:
            continue
        final_urls.add(url)
    return final_urls
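
# Note: the recursive crawl above can hit Python's recursion limit on large
# sites. Below is a minimal iterative sketch of the same traversal; it is an
# alternative the app does not use, and the helper name is hypothetical.
def crawl_website_iteratively(home_page_url):
    from collections import deque
    domain = urlparse(home_page_url).netloc
    seen, found = set(), set()
    queue = deque([home_page_url])
    while queue:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        if response.status_code != 200:
            continue
        found.add(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and not href.startswith('#'):
                absolute_url = urljoin(url, href)
                parsed = urlparse(absolute_url)
                # Apply the same same-domain and file-extension rules as above
                if parsed.netloc == domain and not parsed.path.endswith(
                        ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                    queue.append(absolute_url)
    return found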
def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a page's title, meta keywords, and meta description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary mapping each title to the URLs that share it
        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison
            title_to_urls.setdefault(title, []).append(url)

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        # Write out only titles shared by more than one URL
        for title, duplicate_urls in title_to_urls.items():
            if len(duplicate_urls) > 1:
                for url in duplicate_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file
    return None
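
# The original import list included difflib.SequenceMatcher, suggesting fuzzy
# title matching was intended. A minimal sketch of that idea (a hypothetical
# helper, not wired into the app): treat two titles as duplicates when their
# similarity ratio crosses a threshold.
def titles_are_similar(title_a, title_b, threshold=0.9):
    from difflib import SequenceMatcher
    if not title_a or not title_b:
        return False
    # ratio() returns a similarity score between 0.0 and 1.0
    return SequenceMatcher(None, title_a, title_b).ratio() >= threshold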
# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
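
# For headless use (an assumption, not part of the original Space), the
# exporter can also be called directly; this writes duplicate_titles.xlsx to
# the working directory:
#   fetch_and_save_to_excel("http://www.embedded-innovations.com/")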