Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import xml.dom.minidom
|
|
| 5 |
import re
|
| 6 |
import gradio as gr
|
| 7 |
from urllib.parse import urlparse, urljoin
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def crawl_website(url):
|
|
@@ -140,11 +141,74 @@ def gradio_interface(url):
|
|
| 140 |
return "\n".join(crawl_website.progress_textbox), text_file_path
|
| 141 |
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import re
|
| 6 |
import gradio as gr
|
| 7 |
from urllib.parse import urlparse, urljoin
|
| 8 |
+
import difflib
|
| 9 |
|
| 10 |
|
| 11 |
def crawl_website(url):
|
|
|
|
| 141 |
return "\n".join(crawl_website.progress_textbox), text_file_path
|
| 142 |
|
| 143 |
|
| 144 |
+
|
| 145 |
+
def extract_text_from_url1(url, timeout=30):
    """Fetch *url* and return its visible text as a small HTML fragment.

    The page is downloaded with ``requests``, parsed with BeautifulSoup's
    ``html.parser`` and flattened to text (text nodes joined by a space).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        A string of the form ``<p><b>URL</b></p>\\n<p>TEXT</p>\\n`` where both
        URL and TEXT are HTML-escaped.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out (propagated unchanged from ``requests.get``).
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ')

    # get_text() yields plain text; a literal '<', '>' or '&' in it (or in the
    # URL's query string) must be escaped, otherwise it is interpreted as
    # markup once the fragment is embedded in the exported HTML document.
    safe_url = escape(url, quote=False)
    safe_text = escape(text.strip(), quote=False)
    return f"<p><b>{safe_url}</b></p>\n<p>{safe_text}</p>\n"
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _is_near_duplicate(text, kept_pages, threshold):
    """Return True if *text* is at least *threshold* similar to any kept page."""
    matcher = difflib.SequenceMatcher()
    # SequenceMatcher caches its analysis of the second sequence, so fix the
    # candidate page as seq2 and vary seq1 over the pages already kept.
    matcher.set_seq2(text)
    for page in kept_pages:
        matcher.set_seq1(page)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); the expensive ratio() only runs when both allow a match.
        if (matcher.real_quick_ratio() >= threshold
                and matcher.quick_ratio() >= threshold
                and matcher.ratio() >= threshold):
            return True
    return False


def extract_text_from_sitemap1(sitemap_file, similarity_threshold=0.95):
    """Collect the text of every ``.html``/``.htm`` page listed in a sitemap.

    Each ``<loc>`` URL from the sitemap is fetched once through
    ``extract_text_from_url1``. A page is dropped when it is a near-duplicate
    of a page that was already kept.

    The previous implementation compared each page with the concatenation of
    all kept pages. That ratio can only approach the threshold while a single
    page has been kept, so duplicate detection stopped working afterwards.
    Pages are now compared one-to-one.

    Args:
        sitemap_file: Path of the sitemap XML file to read.
        similarity_threshold: ``difflib`` ratio (0..1) at or above which a
            page counts as a duplicate of an earlier one.

    Returns:
        The kept page fragments concatenated in sitemap order, with every run
        of whitespace collapsed to a single space.
    """
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    soup = BeautifulSoup(sitemap_content, 'xml')
    # Strip the element text: pretty-printed sitemaps may wrap the URL in
    # newlines/indentation, which would defeat the extension check below.
    urls = [loc.text.strip() for loc in soup.find_all('loc')]

    kept_pages = []
    processed_urls = set()
    for url in urls:
        if not url.lower().endswith(('.html', '.htm')):
            continue
        if url in processed_urls:
            continue
        # Record the URL before fetching so a repeated entry is never
        # downloaded twice, whether or not its text ends up being kept.
        processed_urls.add(url)

        text = extract_text_from_url1(url)
        if not _is_near_duplicate(text, kept_pages, similarity_threshold):
            kept_pages.append(text)

    # Remove multiple whitespace
    return re.sub(r'\s+', ' ', ''.join(kept_pages))
|
| 176 |
+
|
| 177 |
+
def generate_text_file1(url):
    """Crawl *url*, extract the text of its pages and save it as HTML.

    The result of ``crawl_website(url)`` is passed to
    ``extract_text_from_sitemap1`` as the sitemap file
    (NOTE(review): ``crawl_website`` is defined elsewhere in this file;
    confirm it returns the sitemap path). The extracted fragments are wrapped
    in a minimal HTML document and written to ``extracted_text.html`` in the
    current working directory, overwriting any previous export.

    Args:
        url: Start URL of the website to crawl.

    Returns:
        The path of the written file (``'extracted_text.html'``).
    """
    sitemap_file = crawl_website(url)
    extracted_text = extract_text_from_sitemap1(sitemap_file)
    text_file_path = 'extracted_text.html'

    with open(text_file_path, 'w', encoding='utf-8') as file:
        # The file is encoded as UTF-8, so declare that in the document;
        # without a charset declaration a browser may guess a legacy encoding
        # and garble non-ASCII text.
        file.write(
            '<html><head><meta charset="utf-8"></head>'
            f"<body>{extracted_text}</body></html>"
        )

    return text_file_path
|
| 186 |
+
|
| 187 |
+
# Define the Gradio interface
|
| 188 |
+
def gradio_interface1(sitemap_file):
    """Gradio callback for the HTML export: return the generated file path.

    Despite its name, ``sitemap_file`` is forwarded unchanged to
    ``generate_text_file1``, whose parameter is the website URL; the UI wires
    this callback to a text input.
    """
    return generate_text_file1(sitemap_file)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# Two-tab UI: the first tab crawls a site and offers the extracted text plus
# the crawl progress log; the second produces the HTML export.
#
# Components are created with gr.Textbox / gr.File. The legacy gr.inputs.* and
# gr.outputs.* namespaces were deprecated in Gradio 3.x and removed in 4.0,
# where referencing them raises AttributeError at start-up.
with gr.Blocks() as demo:
    gr.Markdown("Enter a website URL to crawl and extract text from web pages.")

    # NOTE(review): both tabs and both buttons carry the same
    # "Website Crawler" label; labels are kept as-is here, but the second tab
    # probably deserves a distinct name.
    with gr.Tab("Website Crawler"):
        text_input1 = gr.Textbox()
        progress_output = gr.Textbox(label="Progress")
        file_output1 = gr.File(label="Download Text")
        button1 = gr.Button("Website Crawler")

    with gr.Tab("Website Crawler"):
        text_input2 = gr.Textbox()
        file_output2 = gr.File(label="Download HTML File")
        button2 = gr.Button("Website Crawler")

    def crawl_and_extract_text(url):
        """Run gradio_interface(url) and return its (progress, file path) pair."""
        progress, file_path = gradio_interface(url)
        return progress, file_path

    # Event listeners are registered inside the Blocks context.
    button1.click(crawl_and_extract_text, inputs=text_input1, outputs=[progress_output, file_output1])
    button2.click(gradio_interface1, inputs=text_input2, outputs=file_output2)

demo.launch()
|