Spaces:

bonrix
/

duplicate_titles_finder

Runtime error

App Files Files Community

bonrix committed on Aug 10, 2023

Commit

adf1eae

1 Parent(s): ae52287

Create app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import requests
+from bs4 import BeautifulSoup
+import xml.etree.ElementTree as ET
+import openpyxl
+import gradio as gr
def _meta_content(soup, name, fallback):
    """Return the ``content`` of ``<meta name=NAME>``, or *fallback* if absent."""
    tag = soup.find('meta', {'name': name})
    if tag is None:
        return fallback
    # A <meta> tag may exist without a ``content`` attribute; fall back then too.
    return tag.get('content', fallback)


def fetch_page_info(url, timeout=10):
    """Download a page and return its ``(title, keywords, description)``.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A 3-tuple of strings. A missing ``<title>``, keywords meta tag or
        description meta tag is replaced by a ``'No ... found'`` placeholder.
        ``(None, None, None)`` is returned when the page cannot be retrieved
        (network error, timeout, or any status code other than 200).
    """
    try:
        # requests has no default timeout; without one a single unresponsive
        # host would stall the whole crawl indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat an unreachable page like a non-200 response so that one bad
        # URL does not abort the caller's loop.
        return None, None, None
    if response.status_code != 200:
        return None, None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        title = 'No title found'
    else:
        # Strip surrounding whitespace so titles that differ only in
        # formatting still compare equal.
        title = title_tag.get_text().strip()
    keywords = _meta_content(soup, 'keywords', 'No keywords found')
    description = _meta_content(soup, 'description', 'No description found')
    return title, keywords, description
def main_page(sitemap_url):
    """Crawl a sitemap and export the pages that share a title to Excel.

    Every ``<url><loc>`` entry of the sitemap is fetched once via
    ``fetch_page_info``. Pages are grouped by title, and each group with more
    than one page is written to the workbook (one row per page).

    Args:
        sitemap_url: URL of an XML sitemap using the sitemaps.org 0.9 schema.

    Returns:
        The path of the saved workbook (``"duplicate_titles.xlsx"``), or
        ``None`` when no URL was given or the sitemap request did not return
        HTTP 200.

    Raises:
        requests.RequestException: The sitemap itself could not be downloaded.
        xml.etree.ElementTree.ParseError: The response is not well-formed XML.
    """
    if not sitemap_url:
        return None

    # Explicit timeout: requests would otherwise wait forever on a dead host.
    response = requests.get(sitemap_url, timeout=30)
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    pages_by_title = {}  # title -> list of (url, title, keywords, description)
    seen_urls = set()
    for loc in root.iterfind(f".//{ns}url/{ns}loc"):
        # <loc> text is often padded with whitespace/newlines and may be empty.
        url = (loc.text or "").strip()
        if not url or url in seen_urls:
            # A URL listed twice in the sitemap is not a duplicate *title*.
            continue
        seen_urls.add(url)

        title, keywords, description = fetch_page_info(url)
        if title is None:
            # Fetch failed; grouping such pages under ``None`` would report
            # every unreachable page as a duplicate of the others.
            continue
        # Keep the full row now so duplicates need not be downloaded again.
        pages_by_title.setdefault(title, []).append(
            [url, title, keywords, description]
        )

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])
    for rows in pages_by_title.values():
        if len(rows) > 1:  # Only titles shared by several URLs are reported
            for row in rows:
                sheet.append(row)

    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file
# Gradio UI: a single textbox takes the sitemap URL and the generated Excel
# workbook is offered back as a downloadable file.
iface = gr.Interface(
    fn=main_page,
    # The ``gr.inputs`` namespace was deprecated in Gradio 3 and removed in
    # Gradio 4; the top-level ``gr.Textbox`` component works on both.
    inputs=[gr.Textbox(placeholder="Enter sitemap URL here")],
    outputs="file",
    # ``live=True`` is deliberately not set: it would re-run the full crawl on
    # every keystroke, using partially typed URLs that make requests raise.
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

if __name__ == "__main__":
    iface.launch()