bonrix committed
Commit 7c58167 · 1 Parent(s): 629a517

Create app.py

Files changed (1)
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import xml.dom.minidom
import re
import gradio as gr
from urllib.parse import urlparse, urljoin


def crawl_website(url):
    visited_urls = set()
    unique_urls = set()

    def crawl(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Keep the parsed URL so links can be restricted to the same domain
        parsed_url = urlparse(url)

        # Make a GET request to the URL; skip URLs that cannot be fetched
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return

        # Only process successful responses
        if response.status_code == 200:
            # Record the currently crawling URL in the progress log
            crawl_website.progress_textbox.append(f"Crawling: {url}")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract and visit every link on the page
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Build the absolute URL from the page URL and the (possibly relative) href
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Follow only same-domain links that point at ".html" or ".htm" pages
                    if (parsed_absolute_url.netloc == parsed_url.netloc
                            and parsed_absolute_url.path.endswith(('.html', '.htm'))):
                        # crawl() handles its own request errors, so no try/except is needed here
                        crawl(absolute_url)

    # Use a function attribute as a shared progress log, then start crawling
    crawl_website.progress_textbox = []
    crawl(url)

    # Drop "http://" URLs that also appear as "https://" URLs
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    # Build the XML sitemap
    urlset = ET.Element("urlset")
    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

    for url in final_urls:
        url_elem = ET.SubElement(urlset, "url")
        loc_elem = ET.SubElement(url_elem, "loc")
        loc_elem.text = url

    # Pretty-print the tree, then strip the empty lines minidom leaves behind
    xml_str = xml.dom.minidom.parseString(ET.tostring(urlset)).toprettyxml(indent=" ")
    xml_str = "\n".join(line for line in xml_str.split("\n") if line.strip())

    # Write the XML string to a file
    with open("sitemap.xml", "w") as file:
        file.write(xml_str)

    return "sitemap.xml"


def extract_text_from_sitemap(sitemap_file):
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    # Pull every <loc> entry out of the sitemap
    soup = BeautifulSoup(sitemap_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]

    extracted_text = ""
    for url in urls:
        if url.lower().endswith(('.html', '.htm')):
            # Record the currently extracting URL in the progress log
            crawl_website.progress_textbox.append(f"Extracting text: {url}")

            # Skip pages that cannot be fetched instead of aborting the whole run
            try:
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ')
            extracted_text += f"\n{url}\n{text}\n\n"

    # Collapse runs of whitespace into single spaces
    extracted_text = re.sub(r'\s+', ' ', extracted_text)

    return extracted_text


def gradio_interface(url):
    sitemap_file = crawl_website(url)
    extracted_text = extract_text_from_sitemap(sitemap_file)

    # Save the extracted text so it can be offered as a downloadable file
    text_file_path = 'extracted_text.txt'
    with open(text_file_path, 'w', encoding='utf-8') as file:
        file.write(extracted_text)

    return "\n".join(crawl_website.progress_textbox), text_file_path


iface = gr.Interface(fn=gradio_interface, inputs="text", outputs=["text", "file"],
                     title="Website Crawler",
                     description="Enter a website URL to crawl and extract text from web pages.")
iface.launch(share=True)
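For reference, the sitemap the crawler writes follows the sitemaps.org 0.9 schema declared above, pretty-printed by minidom with a one-space indent. With a couple of hypothetical crawled pages (example.com is a placeholder, not from the commit), sitemap.xml would look roughly like:

<?xml version="1.0" ?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <url>
  <loc>https://example.com/index.html</loc>
 </url>
 <url>
  <loc>https://example.com/about.html</loc>
 </url>
</urlset>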
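The two helpers can also be exercised without the Gradio UI. A minimal sketch, assuming the functions above are already defined in the current session and using a hypothetical https://example.com site that serves .html pages:

# Crawl the site and write sitemap.xml (hypothetical start URL)
sitemap_file = crawl_website("https://example.com/index.html")

# Fetch and flatten the text of every page listed in the sitemap
extracted_text = extract_text_from_sitemap(sitemap_file)

# The progress log accumulates on the function attribute set by crawl_website
print("\n".join(crawl_website.progress_textbox))

Note that extract_text_from_sitemap must run after crawl_website, since it appends to the progress_textbox attribute that crawl_website initializes.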