bonrix committed
Commit 6bb850b · 1 Parent(s): 7b74f94

Update app.py

Files changed (1)
  1. app.py +92 -164
app.py CHANGED
@@ -1,142 +1,6 @@
 
 
 
- # import requests
- # from bs4 import BeautifulSoup
- # from urllib.parse import urlparse, urljoin
- # import pandas as pd
- # from difflib import SequenceMatcher
- # from xml.etree import ElementTree as ET
- # import openpyxl
- # from openpyxl import Workbook
- # from openpyxl.styles import PatternFill
- # from openpyxl.utils.dataframe import dataframe_to_rows
- # import gradio as gr
-
- # visited_urls = set()
- # unique_urls = set()
-
- # def create_sitemap_from_url(home_page_url):
- #     def crawl_website(url):
- #         # Check if URL has already been visited
- #         if url in visited_urls:
- #             return
-
- #         # Add URL to visited set
- #         visited_urls.add(url)
-
- #         # Extract domain from the given URL
- #         parsed_url = urlparse(url)
- #         base_url = parsed_url.scheme + "://" + parsed_url.netloc
-
- #         # Make a GET request to the URL
- #         try:
- #             response = requests.get(url)
- #         except requests.exceptions.RequestException:
- #             # Handle unreadable URLs
- #             return
-
- #         # Check if the request was successful
- #         if response.status_code == 200:
- #             # Parse the HTML content using BeautifulSoup
- #             soup = BeautifulSoup(response.content, 'html.parser')
-
- #             # Add the URL to the set of unique URLs
- #             unique_urls.add(url)
-
- #             # Extract all the links on the page
- #             links = soup.find_all('a')
-
- #             # Visit each link
- #             for link in links:
- #                 href = link.get('href')
- #                 if href and not href.startswith('#'):
- #                     # Construct the absolute URL by joining the base URL and the relative URL
- #                     absolute_url = urljoin(url, href)
- #                     parsed_absolute_url = urlparse(absolute_url)
-
- #                     # Check if the URL points to a webpage (excluding image URLs)
- #                     if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
- #                         try:
- #                             # Visit the absolute URL
- #                             crawl_website(absolute_url)
- #                         except requests.exceptions.RequestException:
- #                             # Handle unreadable URLs
- #                             continue
- #         else:
- #             # Handle unsuccessful requests
- #             return
-
- #     # Call the crawl_website function with the desired URL
- #     crawl_website(home_page_url)
-
- #     # Remove "http://" URLs that have matching content after "http://" in "https://" URLs
- #     final_urls = set()
- #     for url in unique_urls:
- #         if url.startswith("http://"):
- #             remaining_url = url[len("http://"):]
- #             if "https://" + remaining_url in unique_urls:
- #                 continue
- #         final_urls.add(url)
-
- #     return final_urls
-
- # def fetch_and_save_to_excel(home_page_url):
- #     def fetch_page_info(url):
- #         response = requests.get(url)
- #         if response.status_code == 200:
- #             soup = BeautifulSoup(response.text, 'html.parser')
- #             title = soup.find('title').get_text() if soup.find('title') else 'No title found'
- #             keywords = soup.find('meta', {'name': 'keywords'})
- #             keywords = keywords.get('content') if keywords else 'No keywords found'
- #             description = soup.find('meta', {'name': 'description'})
- #             description = description.get('content') if description else 'No description found'
- #             return title, keywords, description
- #         return None, None, None
-
- #     urls = create_sitemap_from_url(home_page_url)
- #     if urls:
- #         title_to_urls = {} # Dictionary to store URLs grouped by title
-
- #         for url in urls:
- #             title, _, _ = fetch_page_info(url) # Fetch only title for comparison
-
- #             if title in title_to_urls:
- #                 title_to_urls[title].append(url)
- #             else:
- #                 title_to_urls[title] = [url]
-
- #         workbook = openpyxl.Workbook()
- #         sheet = workbook.active
- #         sheet.append(["URL", "Title", "Keywords", "Description"])
-
- #         for title, urls in title_to_urls.items():
- #             if len(urls) > 1: # Only consider titles with multiple URLs
- #                 for url in urls:
- #                     fetched_title, keywords, description = fetch_page_info(url)
- #                     sheet.append([url, fetched_title, keywords, description])
-
- #         excel_file = "duplicate_titles.xlsx"
- #         workbook.save(excel_file)
- #         return excel_file
-
- #     return None
-
- # # Create a Gradio interface
- # iface = gr.Interface(
- #     fn=fetch_and_save_to_excel,
- #     inputs="text",
- #     outputs="file",
- #     title="Duplicate Titles Finder and Excel Exporter",
- #     description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
- #     allow_flagging=False,
- #     examples=[["http://www.embedded-innovations.com/"]]
- # )
-
- # # Launch the Gradio interface
- # iface.launch()
-
-
  import requests
  from bs4 import BeautifulSoup
  from urllib.parse import urlparse, urljoin
@@ -149,8 +13,75 @@ from openpyxl.styles import PatternFill
  from openpyxl.utils.dataframe import dataframe_to_rows
  import gradio as gr
 
-
- def fetch_and_save_to_excel(sitemap_url):
+ visited_urls = set()
+ unique_urls = set()
+
+ def create_sitemap_from_url(home_page_url):
+     def crawl_website(url):
+         # Check if URL has already been visited
+         if url in visited_urls:
+             return
+
+         # Add URL to visited set
+         visited_urls.add(url)
+
+         # Extract domain from the given URL
+         parsed_url = urlparse(url)
+         base_url = parsed_url.scheme + "://" + parsed_url.netloc
+
+         # Make a GET request to the URL
+         try:
+             response = requests.get(url)
+         except requests.exceptions.RequestException:
+             # Handle unreadable URLs
+             return
+
+         # Check if the request was successful
+         if response.status_code == 200:
+             # Parse the HTML content using BeautifulSoup
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Add the URL to the set of unique URLs
+             unique_urls.add(url)
+
+             # Extract all the links on the page
+             links = soup.find_all('a')
+
+             # Visit each link
+             for link in links:
+                 href = link.get('href')
+                 if href and not href.startswith('#'):
+                     # Construct the absolute URL by joining the base URL and the relative URL
+                     absolute_url = urljoin(url, href)
+                     parsed_absolute_url = urlparse(absolute_url)
+
+                     # Check if the URL points to a webpage (excluding image URLs)
+                     if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
+                         try:
+                             # Visit the absolute URL
+                             crawl_website(absolute_url)
+                         except requests.exceptions.RequestException:
+                             # Handle unreadable URLs
+                             continue
+         else:
+             # Handle unsuccessful requests
+             return
+
+     # Call the crawl_website function with the desired URL
+     crawl_website(home_page_url)
+
+     # Remove "http://" URLs that have matching content after "http://" in "https://" URLs
+     final_urls = set()
+     for url in unique_urls:
+         if url.startswith("http://"):
+             remaining_url = url[len("http://"):]
+             if "https://" + remaining_url in unique_urls:
+                 continue
+         final_urls.add(url)
+
+     return final_urls
+
+ def fetch_and_save_to_excel(home_page_url):
      def fetch_page_info(url):
          response = requests.get(url)
          if response.status_code == 200:
@@ -163,50 +94,47 @@ def fetch_and_save_to_excel(sitemap_url):
              return title, keywords, description
          return None, None, None
 
-     if sitemap_url:
-         response = requests.get(sitemap_url)
-         if response.status_code == 200:
-             root = ET.fromstring(response.content)
-
-             title_to_urls = {} # Dictionary to store URLs grouped by title
+     urls = create_sitemap_from_url(home_page_url)
+     if urls:
+         title_to_urls = {} # Dictionary to store URLs grouped by title
 
-             for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url/{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
-                 url = url_element.text
-                 title, _, _ = fetch_page_info(url) # Fetch only title for comparison
+         for url in urls:
+             title, _, _ = fetch_page_info(url) # Fetch only title for comparison
 
-                 if title in title_to_urls:
-                     title_to_urls[title].append(url)
-                 else:
-                     title_to_urls[title] = [url]
+             if title in title_to_urls:
+                 title_to_urls[title].append(url)
+             else:
+                 title_to_urls[title] = [url]
 
-             workbook = openpyxl.Workbook()
-             sheet = workbook.active
-             sheet.append(["URL", "Title", "Keywords", "Description"])
+         workbook = openpyxl.Workbook()
+         sheet = workbook.active
+         sheet.append(["URL", "Title", "Keywords", "Description"])
 
-             for title, urls in title_to_urls.items():
-                 if len(urls) > 1: # Only consider titles with multiple URLs
-                     for url in urls:
-                         fetched_title, keywords, description = fetch_page_info(url)
-                         sheet.append([url, fetched_title, keywords, description])
+         for title, urls in title_to_urls.items():
+             if len(urls) > 1: # Only consider titles with multiple URLs
+                 for url in urls:
+                     fetched_title, keywords, description = fetch_page_info(url)
+                     sheet.append([url, fetched_title, keywords, description])
 
-             excel_file = "duplicate_titles.xlsx"
-             workbook.save(excel_file)
-             return excel_file
+         excel_file = "duplicate_titles.xlsx"
+         workbook.save(excel_file)
+         return excel_file
 
      return None
 
-
  # Create a Gradio interface
  iface = gr.Interface(
      fn=fetch_and_save_to_excel,
      inputs="text",
      outputs="file",
      title="Duplicate Titles Finder and Excel Exporter",
-     description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
+     description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
      allow_flagging=False,
-     examples=[["http://www.embedded-innovations.com/sitemap.xml"]]
+     examples=[["http://www.embedded-innovations.com/"]]
  )
 
  # Launch the Gradio interface
  iface.launch()
 
+
+
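With this change, app.py no longer parses a sitemap.xml: create_sitemap_from_url() crawls same-domain links outward from the homepage, and fetch_and_save_to_excel() groups the collected pages by their <title> text and writes any title shared by more than one URL to duplicate_titles.xlsx. Below is a minimal sketch of exercising the new entry point outside the Gradio UI; it assumes the updated function definitions from app.py are already available in the session (importing app.py directly would also execute iface.launch()), and the example domain is simply the one used in the interface's examples list.

# Hypothetical smoke test for the homepage-based flow (not part of the commit).
# Assumes the updated functions from app.py are already defined in this session.
report = fetch_and_save_to_excel("http://www.embedded-innovations.com/")

if report:
    # Path to the generated workbook, e.g. "duplicate_titles.xlsx"; rows are only
    # written for titles that appear on more than one crawled URL.
    print("Report saved to", report)
else:
    # fetch_and_save_to_excel() returns None when the crawl collected no URLs.
    print("No pages were collected from the site.")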