Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -1,142 +1,6 @@
 
 
 
-# import requests
-# from bs4 import BeautifulSoup
-# from urllib.parse import urlparse, urljoin
-# import pandas as pd
-# from difflib import SequenceMatcher
-# from xml.etree import ElementTree as ET
-# import openpyxl
-# from openpyxl import Workbook
-# from openpyxl.styles import PatternFill
-# from openpyxl.utils.dataframe import dataframe_to_rows
-# import gradio as gr
-
-# visited_urls = set()
-# unique_urls = set()
-
-# def create_sitemap_from_url(home_page_url):
-#     def crawl_website(url):
-#         # Check if URL has already been visited
-#         if url in visited_urls:
-#             return
-
-#         # Add URL to visited set
-#         visited_urls.add(url)
-
-#         # Extract domain from the given URL
-#         parsed_url = urlparse(url)
-#         base_url = parsed_url.scheme + "://" + parsed_url.netloc
-
-#         # Make a GET request to the URL
-#         try:
-#             response = requests.get(url)
-#         except requests.exceptions.RequestException:
-#             # Handle unreadable URLs
-#             return
-
-#         # Check if the request was successful
-#         if response.status_code == 200:
-#             # Parse the HTML content using BeautifulSoup
-#             soup = BeautifulSoup(response.content, 'html.parser')
-
-#             # Add the URL to the set of unique URLs
-#             unique_urls.add(url)
-
-#             # Extract all the links on the page
-#             links = soup.find_all('a')
-
-#             # Visit each link
-#             for link in links:
-#                 href = link.get('href')
-#                 if href and not href.startswith('#'):
-#                     # Construct the absolute URL by joining the base URL and the relative URL
-#                     absolute_url = urljoin(url, href)
-#                     parsed_absolute_url = urlparse(absolute_url)
-
-#                     # Check if the URL points to a webpage (excluding image URLs)
-#                     if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
-#                         try:
-#                             # Visit the absolute URL
-#                             crawl_website(absolute_url)
-#                         except requests.exceptions.RequestException:
-#                             # Handle unreadable URLs
-#                             continue
-#         else:
-#             # Handle unsuccessful requests
-#             return
-
-#     # Call the crawl_website function with the desired URL
-#     crawl_website(home_page_url)
-
-#     # Remove "http://" URLs that have matching content after "http://" in "https://" URLs
-#     final_urls = set()
-#     for url in unique_urls:
-#         if url.startswith("http://"):
-#             remaining_url = url[len("http://"):]
-#             if "https://" + remaining_url in unique_urls:
-#                 continue
-#         final_urls.add(url)
-
-#     return final_urls
-
-# def fetch_and_save_to_excel(home_page_url):
-#     def fetch_page_info(url):
-#         response = requests.get(url)
-#         if response.status_code == 200:
-#             soup = BeautifulSoup(response.text, 'html.parser')
-#             title = soup.find('title').get_text() if soup.find('title') else 'No title found'
-#             keywords = soup.find('meta', {'name': 'keywords'})
-#             keywords = keywords.get('content') if keywords else 'No keywords found'
-#             description = soup.find('meta', {'name': 'description'})
-#             description = description.get('content') if description else 'No description found'
-#             return title, keywords, description
-#         return None, None, None
-
-#     urls = create_sitemap_from_url(home_page_url)
-#     if urls:
-#         title_to_urls = {}  # Dictionary to store URLs grouped by title
-
-#         for url in urls:
-#             title, _, _ = fetch_page_info(url)  # Fetch only title for comparison
-
-#             if title in title_to_urls:
-#                 title_to_urls[title].append(url)
-#             else:
-#                 title_to_urls[title] = [url]
-
-#         workbook = openpyxl.Workbook()
-#         sheet = workbook.active
-#         sheet.append(["URL", "Title", "Keywords", "Description"])
-
-#         for title, urls in title_to_urls.items():
-#             if len(urls) > 1:  # Only consider titles with multiple URLs
-#                 for url in urls:
-#                     fetched_title, keywords, description = fetch_page_info(url)
-#                     sheet.append([url, fetched_title, keywords, description])
-
-#         excel_file = "duplicate_titles.xlsx"
-#         workbook.save(excel_file)
-#         return excel_file
-
-#     return None
-
-# # Create a Gradio interface
-# iface = gr.Interface(
-#     fn=fetch_and_save_to_excel,
-#     inputs="text",
-#     outputs="file",
-#     title="Duplicate Titles Finder and Excel Exporter",
-#     description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
-#     allow_flagging=False,
-#     examples=[["http://www.embedded-innovations.com/"]]
-# )
-
-# # Launch the Gradio interface
-# iface.launch()
-
-
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
@@ -149,8 +13,75 @@ from openpyxl.styles import PatternFill
 from openpyxl.utils.dataframe import dataframe_to_rows
 import gradio as gr
 
-
-
+visited_urls = set()
+unique_urls = set()
+
+def create_sitemap_from_url(home_page_url):
+    def crawl_website(url):
+        # Check if URL has already been visited
+        if url in visited_urls:
+            return
+
+        # Add URL to visited set
+        visited_urls.add(url)
+
+        # Extract domain from the given URL
+        parsed_url = urlparse(url)
+        base_url = parsed_url.scheme + "://" + parsed_url.netloc
+
+        # Make a GET request to the URL
+        try:
+            response = requests.get(url)
+        except requests.exceptions.RequestException:
+            # Handle unreadable URLs
+            return
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Parse the HTML content using BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Add the URL to the set of unique URLs
+            unique_urls.add(url)
+
+            # Extract all the links on the page
+            links = soup.find_all('a')
+
+            # Visit each link
+            for link in links:
+                href = link.get('href')
+                if href and not href.startswith('#'):
+                    # Construct the absolute URL by joining the base URL and the relative URL
+                    absolute_url = urljoin(url, href)
+                    parsed_absolute_url = urlparse(absolute_url)
+
+                    # Check if the URL points to a webpage (excluding image URLs)
+                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
+                        try:
+                            # Visit the absolute URL
+                            crawl_website(absolute_url)
+                        except requests.exceptions.RequestException:
+                            # Handle unreadable URLs
+                            continue
+        else:
+            # Handle unsuccessful requests
+            return
+
+    # Call the crawl_website function with the desired URL
+    crawl_website(home_page_url)
+
+    # Remove "http://" URLs that have matching content after "http://" in "https://" URLs
+    final_urls = set()
+    for url in unique_urls:
+        if url.startswith("http://"):
+            remaining_url = url[len("http://"):]
+            if "https://" + remaining_url in unique_urls:
+                continue
+        final_urls.add(url)
+
+    return final_urls
+
+def fetch_and_save_to_excel(home_page_url):
     def fetch_page_info(url):
         response = requests.get(url)
         if response.status_code == 200:
@@ -163,50 +94,47 @@ def fetch_and_save_to_excel(sitemap_url):
             return title, keywords, description
         return None, None, None
 
-
-
-
-    root = ET.fromstring(response.content)
-
-    title_to_urls = {}  # Dictionary to store URLs grouped by title
+    urls = create_sitemap_from_url(home_page_url)
+    if urls:
+        title_to_urls = {}  # Dictionary to store URLs grouped by title
 
-
-
-        title, _, _ = fetch_page_info(url)  # Fetch only title for comparison
+        for url in urls:
+            title, _, _ = fetch_page_info(url)  # Fetch only title for comparison
 
-
-
-
-
+            if title in title_to_urls:
+                title_to_urls[title].append(url)
+            else:
+                title_to_urls[title] = [url]
 
-
-
-
+        workbook = openpyxl.Workbook()
+        sheet = workbook.active
+        sheet.append(["URL", "Title", "Keywords", "Description"])
 
-
-
-
-
-
+        for title, urls in title_to_urls.items():
+            if len(urls) > 1:  # Only consider titles with multiple URLs
+                for url in urls:
+                    fetched_title, keywords, description = fetch_page_info(url)
+                    sheet.append([url, fetched_title, keywords, description])
 
-
-
-
+        excel_file = "duplicate_titles.xlsx"
+        workbook.save(excel_file)
+        return excel_file
 
     return None
 
-
 # Create a Gradio interface
 iface = gr.Interface(
     fn=fetch_and_save_to_excel,
     inputs="text",
     outputs="file",
     title="Duplicate Titles Finder and Excel Exporter",
-    description="Enter a
+    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
     allow_flagging=False,
-    examples=[["http://www.embedded-innovations.com/
+    examples=[["http://www.embedded-innovations.com/"]]
 )
 
 # Launch the Gradio interface
 iface.launch()
 
+
+
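A note on the new crawler: create_sitemap_from_url visits pages by calling crawl_website recursively, so a site with very long link chains could in principle hit Python's default recursion limit. Below is a hedged sketch of an equivalent iterative traversal with an explicit stack; it is only an alternative shape, not what this commit implements, and the names crawl_site and SKIP_SUFFIXES are invented for the sketch.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Same file-extension filter app.py uses to avoid following images and binaries.
SKIP_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')

def crawl_site(home_page_url):
    visited, unique_pages = set(), set()
    stack = [home_page_url]
    domain = urlparse(home_page_url).netloc
    while stack:
        url = stack.pop()
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue  # skip unreadable URLs, as the recursive version does
        if response.status_code != 200:
            continue
        unique_pages.add(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or href.startswith('#'):
                continue
            absolute_url = urljoin(url, href)
            parsed = urlparse(absolute_url)
            # Stay on the same domain and skip non-page resources.
            if parsed.netloc == domain and not parsed.path.endswith(SKIP_SUFFIXES):
                stack.append(absolute_url)
    return unique_pages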
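The last step of create_sitemap_from_url prefers https:// pages: an http:// URL is dropped whenever the same address was also reached over https://. A tiny self-contained illustration of that filter (toy data, not part of app.py):

unique_urls = {
    "http://example.com/about",
    "https://example.com/about",
    "http://example.com/contact",
}

final_urls = set()
for url in unique_urls:
    if url.startswith("http://") and "https://" + url[len("http://"):] in unique_urls:
        continue  # an https:// twin exists, so keep only the https:// entry
    final_urls.add(url)

print(sorted(final_urls))  # ['http://example.com/contact', 'https://example.com/about']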
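fetch_and_save_to_excel groups the crawled URLs in a dictionary keyed by page title and writes rows only for titles that map to more than one URL. A minimal standalone sketch of that grouping step on toy data (it uses dict.setdefault instead of the explicit if/else in app.py):

pages = [
    ("https://example.com/", "Home"),
    ("https://example.com/index.html", "Home"),
    ("https://example.com/contact", "Contact"),
]

title_to_urls = {}
for url, title in pages:
    title_to_urls.setdefault(title, []).append(url)

# Only titles shared by several URLs end up in duplicate_titles.xlsx.
duplicates = {title: urls for title, urls in title_to_urls.items() if len(urls) > 1}
print(duplicates)  # {'Home': ['https://example.com/', 'https://example.com/index.html']}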