bonrix committed on
Commit
fdb1fc2
·
1 Parent(s): 32c0a08

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -8
app.py CHANGED
@@ -5,6 +5,7 @@ import xml.dom.minidom
5
  import re
6
  import gradio as gr
7
  from urllib.parse import urlparse, urljoin
 
8
 
9
 
10
  def crawl_website(url):
@@ -140,11 +141,74 @@ def gradio_interface(url):
140
  return "\n".join(crawl_website.progress_textbox), text_file_path
141
 
142
 
143
- iface = gr.Interface(
144
- fn=gradio_interface,
145
- inputs="text",
146
- outputs=["text", "file"],
147
- title="Website Crawler",
148
- description="Enter a website URL to crawl and extract text from web pages."
149
- )
150
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import re
6
  import gradio as gr
7
  from urllib.parse import urlparse, urljoin
8
+ import difflib
9
 
10
 
11
  def crawl_website(url):
 
141
  return "\n".join(crawl_website.progress_textbox), text_file_path
142
 
143
 
144
+
145
+ def extract_text_from_url1(url):
146
+ response = requests.get(url)
147
+ soup = BeautifulSoup(response.text, 'html.parser')
148
+ text = soup.get_text(separator=' ')
149
+ return f"<p><b>{url}</b></p>\n<p>{text.strip()}</p>\n"
150
+
151
+
152
def extract_text_from_sitemap1(sitemap_file):
    """Extract text from every HTML page listed in a sitemap XML file.

    Pages that are near-duplicates (>= 95% similar to a page already kept)
    are skipped.  Returns the concatenated HTML snippets with runs of
    whitespace collapsed to single spaces.
    """
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    soup = BeautifulSoup(sitemap_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]

    extracted_text = ""
    processed_urls = set()
    kept_texts = []  # one entry per kept page, for duplicate detection
    for url in urls:
        if url.lower().endswith(('.html', '.htm')) and url not in processed_urls:
            text = extract_text_from_url1(url)
            # BUG FIX: the original compared each new page against the whole
            # accumulated text, so the similarity ratio shrank as the corpus
            # grew and the < 0.95 duplicate check became a no-op.  Compare
            # against each previously kept page instead.
            is_duplicate = any(
                difflib.SequenceMatcher(None, kept, text).ratio() >= 0.95
                for kept in kept_texts
            )
            if not is_duplicate:
                extracted_text += text
                kept_texts.append(text)
            # Always record the URL so a duplicate is never re-fetched
            # (the original only recorded kept pages).
            processed_urls.add(url)

    # Collapse all whitespace runs into single spaces.
    extracted_text = re.sub(r'\s+', ' ', extracted_text)

    return extracted_text
176
+
177
def generate_text_file1(url):
    """Crawl *url*, extract its sitemap text, and write it to an HTML file.

    Returns the path of the generated file ('extracted_text.html').
    """
    sitemap_path = crawl_website(url)
    body = extract_text_from_sitemap1(sitemap_path)

    output_path = 'extracted_text.html'
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(f"<html><body>{body}</body></html>")

    return output_path
186
+
187
# Define the Gradio interface
def gradio_interface1(sitemap_file):
    """Gradio callback: generate and return the extracted-text HTML file path."""
    # NOTE(review): despite the parameter name, the value appears to be a
    # website URL — generate_text_file1 passes it to crawl_website().
    # Confirm against the UI wiring before renaming.
    return generate_text_file1(sitemap_file)
191
+
192
+
193
+
194
# Build the two-tab Gradio UI.
# FIX: the gr.inputs / gr.outputs namespaces were deprecated and removed in
# Gradio 3.x — use the top-level component classes (gr.Textbox, gr.File).
with gr.Blocks() as demo:
    gr.Markdown("Enter a website URL to crawl and extract text from web pages.")

    # Tab 1: crawl a site, stream progress, and offer a text-file download.
    with gr.Tab("Website Crawler"):
        text_input1 = gr.Textbox(label="Website URL")
        progress_output = gr.Textbox(label="Progress")
        file_output1 = gr.File(label="Download Text")
        button1 = gr.Button("Website Crawler")

    # Tab 2: crawl a site and download a de-duplicated HTML extract.
    # FIX: both tabs previously carried the identical label "Website
    # Crawler", making them indistinguishable in the UI.
    with gr.Tab("HTML Extractor"):
        text_input2 = gr.Textbox(label="Website URL")
        file_output2 = gr.File(label="Download HTML File")
        button2 = gr.Button("Website Crawler")

    # The original wrapped gradio_interface in a pass-through function
    # (crawl_and_extract_text); bind the callbacks directly instead.
    button1.click(gradio_interface, inputs=text_input1, outputs=[progress_output, file_output1])
    button2.click(gradio_interface1, inputs=text_input2, outputs=file_output2)

demo.launch()