Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup | |
| import xml.etree.ElementTree as ET | |
| import xml.dom.minidom | |
| import re | |
| import gradio as gr | |
| from urllib.parse import urlparse, urljoin | |
def crawl_website(url):
    """Crawl all same-domain ``.html``/``.htm`` pages reachable from *url*
    and write an XML sitemap to ``sitemap.xml``.

    Progress lines ("Crawling: <url>") are accumulated on the function
    attribute ``crawl_website.progress_textbox`` so the UI layer can
    display them afterwards.

    Returns the sitemap filename, ``"sitemap.xml"``.
    """
    visited_urls = set()   # every URL we attempted, successful or not
    unique_urls = set()    # URLs that answered with HTTP 200

    crawl_website.progress_textbox = []  # progress lines for the UI

    start = urlparse(url)

    # Iterative worklist instead of recursion: avoids RecursionError on
    # large sites and makes the "already visited" check uniform.
    # (The original's try/except around the recursive call was dead code:
    # the request exception was already handled inside the call.)
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited_urls:
            continue
        visited_urls.add(current)

        try:
            # Timeout so a single dead host cannot hang the whole crawl.
            response = requests.get(current, timeout=10)
        except requests.exceptions.RequestException:
            continue  # unreachable URL: skip it
        if response.status_code != 200:
            continue  # unsuccessful request: skip it

        crawl_website.progress_textbox.append(f"Crawling: {current}")
        unique_urls.add(current)

        # Collect same-host HTML links from this page.
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or href.startswith('#'):
                continue
            # Resolve relative links against the current page.
            absolute_url = urljoin(current, href)
            parsed = urlparse(absolute_url)
            # Stay on the starting host and only follow ".html"/".htm"
            # pages (excludes images and other assets).
            if (parsed.netloc == start.netloc
                    and parsed.path.endswith(('.html', '.htm'))):
                stack.append(absolute_url)

    # Drop an "http://" URL when the same page was also reached via
    # "https://" (content after the scheme matches).
    final_urls = {
        u for u in unique_urls
        if not (u.startswith("http://")
                and "https://" + u[len("http://"):] in unique_urls)
    }

    # Build the sitemap XML document.
    urlset = ET.Element("urlset")
    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    for u in final_urls:
        url_elem = ET.SubElement(urlset, "url")
        ET.SubElement(url_elem, "loc").text = u

    # Pretty-print, then strip the blank lines minidom inserts.
    xml_str = xml.dom.minidom.parseString(ET.tostring(urlset)).toprettyxml(indent="  ")
    xml_str = "\n".join(line for line in xml_str.split("\n") if line.strip())

    # Persist the sitemap next to the script.
    with open("sitemap.xml", "w") as file:
        file.write(xml_str)
    return "sitemap.xml"
def extract_text_from_sitemap(sitemap_file):
    """Fetch every ``.html``/``.htm`` URL listed in *sitemap_file* and
    return their concatenated visible text, whitespace-collapsed.

    Appends "Extracting text: <url>" progress lines to
    ``crawl_website.progress_textbox`` so the UI can show progress.
    """
    with open(sitemap_file, 'r') as file:
        sitemap_content = file.read()

    # Pull every <loc> entry out of the sitemap.
    soup = BeautifulSoup(sitemap_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]

    # Collect pieces and join once at the end — avoids quadratic
    # string concatenation when many pages are extracted.
    parts = []
    for url in urls:
        if not url.lower().endswith(('.html', '.htm')):
            continue
        crawl_website.progress_textbox.append(f"Extracting text: {url}")
        try:
            # Timeout so one slow page cannot stall the extraction;
            # skip pages that vanished since the crawl instead of
            # crashing the whole run.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        page = BeautifulSoup(response.text, 'html.parser')
        text = page.get_text(separator=' ')
        parts.append(f"\n{url}\n{text}\n\n")

    # Collapse every run of whitespace to a single space.
    return re.sub(r'\s+', ' ', "".join(parts))
def gradio_interface(url):
    """Gradio handler: crawl *url*, extract the pages' text, and return
    the progress log plus the path of the saved text file."""
    # Crawl first (produces sitemap.xml), then pull text from it.
    extracted = extract_text_from_sitemap(crawl_website(url))

    # Persist the extracted text so Gradio can offer it as a download.
    output_path = 'extracted_text.txt'
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(extracted)

    progress_log = "\n".join(crawl_website.progress_textbox)
    return progress_log, output_path
# Build the UI and launch it.
# BUG FIX: gr.Interface is not a context manager, so the original
# `with gr.Interface(...) as iface:` raised at runtime (the "Runtime
# error" the Space reported). Assign the interface and launch directly.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs=["text", "file"],
    title="Website Crawler",
    description="Enter a website URL to crawl and extract text from web pages.",
)
iface.launch(share=True)