bonrix committed on
Commit
fdb1fc2
·
1 Parent(s): 32c0a08

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -8
app.py CHANGED
@@ -5,6 +5,7 @@ import xml.dom.minidom
5
  import re
6
  import gradio as gr
7
  from urllib.parse import urlparse, urljoin
 
8
 
9
 
10
  def crawl_website(url):
@@ -140,11 +141,74 @@ def gradio_interface(url):
140
  return "\n".join(crawl_website.progress_textbox), text_file_path
141
 
142
 
143
- iface = gr.Interface(
144
- fn=gradio_interface,
145
- inputs="text",
146
- outputs=["text", "file"],
147
- title="Website Crawler",
148
- description="Enter a website URL to crawl and extract text from web pages."
149
- )
150
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import re
6
  import gradio as gr
7
  from urllib.parse import urlparse, urljoin
8
+ import difflib
9
 
10
 
11
  def crawl_website(url):
 
141
  return "\n".join(crawl_website.progress_textbox), text_file_path
142
 
143
 
144
+
145
+ def extract_text_from_url1(url):
146
+ response = requests.get(url)
147
+ soup = BeautifulSoup(response.text, 'html.parser')
148
+ text = soup.get_text(separator=' ')
149
+ return f"<p><b>{url}</b></p>\n<p>{text.strip()}</p>\n"
150
+
151
+
152
def extract_text_from_sitemap1(sitemap_file):
    """Extract text from every HTML page listed in a sitemap XML file.

    Pages that are near-duplicates (>= 95% similar to a page already kept)
    are skipped.  Returns the concatenated HTML snippets with runs of
    whitespace collapsed to single spaces.
    """
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    soup = BeautifulSoup(sitemap_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]

    extracted_text = ""
    processed_urls = set()
    kept_texts = []  # one entry per kept page, for duplicate detection
    for url in urls:
        if url.lower().endswith(('.html', '.htm')) and url not in processed_urls:
            text = extract_text_from_url1(url)
            # BUG FIX: the original compared each new page against the whole
            # accumulated text, so the similarity ratio shrank as the corpus
            # grew and the < 0.95 duplicate check became a no-op.  Compare
            # against each previously kept page instead.
            is_duplicate = any(
                difflib.SequenceMatcher(None, kept, text).ratio() >= 0.95
                for kept in kept_texts
            )
            if not is_duplicate:
                extracted_text += text
                kept_texts.append(text)
            # Always record the URL so a duplicate is never re-fetched
            # (the original only recorded kept pages).
            processed_urls.add(url)

    # Collapse all whitespace runs into single spaces.
    extracted_text = re.sub(r'\s+', ' ', extracted_text)

    return extracted_text
176
+
177
def generate_text_file1(url):
    """Crawl *url*, extract its sitemap text, and write it to an HTML file.

    Returns the path of the generated file ('extracted_text.html').
    """
    sitemap_path = crawl_website(url)
    body = extract_text_from_sitemap1(sitemap_path)

    output_path = 'extracted_text.html'
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(f"<html><body>{body}</body></html>")

    return output_path
186
+
187
# Define the Gradio interface
def gradio_interface1(sitemap_file):
    """Gradio callback: generate and return the extracted-text HTML file path."""
    # NOTE(review): despite the parameter name, the value appears to be a
    # website URL — generate_text_file1 passes it to crawl_website().
    # Confirm against the UI wiring before renaming.
    return generate_text_file1(sitemap_file)
191
+
192
+
193
+
194
# Build the two-tab Gradio UI.
# FIX: the gr.inputs / gr.outputs namespaces were deprecated and removed in
# Gradio 3.x — use the top-level component classes (gr.Textbox, gr.File).
with gr.Blocks() as demo:
    gr.Markdown("Enter a website URL to crawl and extract text from web pages.")

    # Tab 1: crawl a site, stream progress, and offer a text-file download.
    with gr.Tab("Website Crawler"):
        text_input1 = gr.Textbox(label="Website URL")
        progress_output = gr.Textbox(label="Progress")
        file_output1 = gr.File(label="Download Text")
        button1 = gr.Button("Website Crawler")

    # Tab 2: crawl a site and download a de-duplicated HTML extract.
    # FIX: both tabs previously carried the identical label "Website
    # Crawler", making them indistinguishable in the UI.
    with gr.Tab("HTML Extractor"):
        text_input2 = gr.Textbox(label="Website URL")
        file_output2 = gr.File(label="Download HTML File")
        button2 = gr.Button("Website Crawler")

    # The original wrapped gradio_interface in a pass-through function
    # (crawl_and_extract_text); bind the callbacks directly instead.
    button1.click(gradio_interface, inputs=text_input1, outputs=[progress_output, file_output1])
    button2.click(gradio_interface1, inputs=text_input2, outputs=file_output2)

demo.launch()