Create app.py
app.py
ADDED
import re
import xml.dom.minidom
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
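
# Crawl a website, write the discovered pages to sitemap.xml, and extract the
# pages' visible text behind a small Gradio UI.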


def crawl_website(url):
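    """Recursively crawl pages on `url`'s domain (following only .html/.htm
    links), write the discovered URLs to sitemap.xml, and return that filename."""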
    visited_urls = set()
    unique_urls = set()

    def crawl(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Mark the URL as visited
        visited_urls.add(url)

        # Parse the URL so that links can later be restricted to the same domain
        parsed_url = urlparse(url)

        # Fetch the page; a timeout keeps the crawler from hanging on slow hosts
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip URLs that cannot be fetched
            return

        # Only process successful responses
        if response.status_code == 200:
            # Record the currently crawled URL in the progress log
            crawl_website.progress_textbox.append(f"Crawling: {url}")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Resolve the (possibly relative) link against the current page
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Follow only same-domain links whose path ends with ".html" or ".htm"
                    if (parsed_absolute_url.netloc == parsed_url.netloc
                            and parsed_absolute_url.path.endswith(('.html', '.htm'))):
                        crawl(absolute_url)
        else:
            # Ignore unsuccessful responses
            return

    # Reset the shared progress log (also read by extract_text_from_sitemap),
    # then start crawling from the given URL
    crawl_website.progress_textbox = []
    crawl(url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    # Create the XML sitemap
    urlset = ET.Element("urlset")
    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

    for url in final_urls:
        url_elem = ET.SubElement(urlset, "url")
        loc_elem = ET.SubElement(url_elem, "loc")
        loc_elem.text = url

    # Pretty-print the XML with two-space indentation
    xml_str = xml.dom.minidom.parseString(ET.tostring(urlset)).toprettyxml(indent="  ")

    # Remove empty lines from the formatted XML string
    xml_str = "\n".join(line for line in xml_str.split("\n") if line.strip())

    # Write the XML string to a file
    with open("sitemap.xml", "w") as file:
        file.write(xml_str)

    return "sitemap.xml"


def extract_text_from_sitemap(sitemap_file):
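    """Fetch every .html/.htm URL listed in `sitemap_file` and return the
    concatenated visible text of those pages."""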
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    # Parse the sitemap and collect all <loc> entries
    # (BeautifulSoup's 'xml' parser requires the lxml package)
    soup = BeautifulSoup(sitemap_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]

    extracted_text = ""
    for url in urls:
        if url.lower().endswith(('.html', '.htm')):
            # Record the currently extracted URL in the progress log
            crawl_website.progress_textbox.append(f"Extracting text: {url}")

            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ')
            extracted_text += f"\n{url}\n{text}\n\n"

    # Collapse runs of whitespace into single spaces
    extracted_text = re.sub(r'\s+', ' ', extracted_text)

    return extracted_text


def gradio_interface(url):
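    """Gradio handler: crawl `url`, save the extracted text to a file, and
    return the progress log plus the file path for download."""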
    sitemap_file = crawl_website(url)
    extracted_text = extract_text_from_sitemap(sitemap_file)
    text_file_path = 'extracted_text.txt'

    with open(text_file_path, 'w', encoding='utf-8') as file:
        file.write(extracted_text)

    # First output fills the textbox, second feeds the downloadable file component
    return "\n".join(crawl_website.progress_textbox), text_file_path


# Instantiate the interface and launch it directly: the `with ...` form is the
# gr.Blocks idiom, not how gr.Interface is meant to be used, and share=True is
# ignored when running on Hugging Face Spaces.
iface = gr.Interface(fn=gradio_interface, inputs="text", outputs=["text", "file"],
                     title="Website Crawler",
                     description="Enter a website URL to crawl and extract text from web pages.")
iface.launch()
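
# Note: the Space must also declare its dependencies. This commit does not show
# a requirements.txt; a plausible one (an assumption, not part of this commit):
#
#   requests
#   beautifulsoup4
#   lxml        # needed by BeautifulSoup's 'xml' parser above
#   gradio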