# Spaces: Runtime error
import hashlib
import os
import re
import shutil
import uuid
import zipfile
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
def is_valid(url):
    """Return True when *url* has both a network location and a scheme.

    Relative references such as ``/img/a.png`` or bare host names such as
    ``example.com`` therefore count as not valid.
    """
    parts = urlparse(url)
    if not parts.netloc:
        return False
    return bool(parts.scheme)
def find_files(url, soup, file_type):
    """Collect items of one kind from a parsed page.

    Args:
        url: Address of the page; relative links are resolved against it.
        soup: Parsed document exposing ``find_all`` (the caller passes a
            BeautifulSoup tree).
        file_type: ``"image"`` gathers ``<img>`` sources, ``"text"`` gathers
            the text of common text elements, and any other value is treated
            as a substring that an ``<a href>`` target must contain.

    Returns:
        A list of absolute URLs for ``"image"`` and link searches, or a list
        of text strings for ``"text"``.
    """
    files = []
    if file_type == "image":
        extensions = ('jpg', 'jpeg', 'png', 'svg', 'gif')
        for img in soup.find_all('img'):
            src = img.get('src')
            # An <img> without a src attribute yields None; testing
            # `ext in None` would raise TypeError, so skip such tags.
            if src is None:
                continue
            # Deliberately loose: the extension may appear anywhere in the
            # source string (e.g. before a query string).
            if any(ext in src for ext in extensions):
                files.append(src if is_valid(src) else urljoin(url, src))
    elif file_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        # Results are grouped by tag name, not by document order.
        for tag_name in text_tags:
            files.extend(
                element.get_text() for element in soup.find_all(tag_name)
            )
    else:
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href (named anchors, JS-only links) yield
            # None; skip them instead of raising TypeError.
            if href is None:
                continue
            if file_type in href:
                files.append(href if is_valid(href) else urljoin(url, href))
    return files
def download_files(urls, folder_name):
    """Download each URL in *urls* into *folder_name*.

    The folder is created if missing. Each file is stored as
    ``<md5-of-url>-<8-char-uuid><extension>``, where the extension comes from
    the path component of the URL, so query strings and host names never
    leak into it. A URL that cannot be fetched (bad scheme, timeout, HTTP
    error status) is reported and skipped so one failure does not abort the
    remaining downloads.

    Args:
        urls: Iterable of absolute URL strings.
        folder_name: Directory that receives the downloaded files.
    """
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        try:
            # The whole body is read below, so streaming gives no benefit;
            # the timeout keeps a stalled server from hanging the request.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print(f"Skipped file: {url} ({exc})")
            continue

        file_extension = os.path.splitext(urlparse(url).path)[1]
        # MD5 is used only to derive a stable name, not for security.
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}{file_extension}'
        file_name = file_name[:255]  # Stay within common file-name length limits
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)  # Replace characters unsafe in file names
        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
def create_zip_file(folder_name):
    """Archive the entries of *folder_name* into ``<folder_name>.zip``.

    Returns the archive path, or an empty string (creating nothing) when the
    folder has no entries. Entries are stored under their
    ``<folder_name>/<entry>`` path.
    """
    if not os.listdir(folder_name):
        return ""

    archive_path = f'{folder_name}.zip'
    with zipfile.ZipFile(archive_path, 'w') as archive:
        for entry in os.listdir(folder_name):
            archive.write(f'{folder_name}/{entry}')
    return archive_path
def scrape_website(url, images=False, text=False):
    """Scrape *url* and package the requested content into ZIP archives.

    Args:
        url: Page to fetch.
        images: When true, download the page's images into ``images/`` and
            zip that folder.
        text: When true, write the page's text to ``text/content.txt`` and
            zip that folder.

    Returns:
        A tuple ``(images_zip_file, text_zip_file)``; each item is the path
        of the created archive, or ``None`` when that kind was not requested
        or produced no files.

    Raises:
        gr.Error: If the page cannot be fetched or answers with an HTTP
            error status.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except (requests.exceptions.RequestException, ValueError) as exc:
        raise gr.Error(f"Unable to access URL: {url}") from exc

    soup = BeautifulSoup(response.content, 'html.parser')

    # Clear leftovers from a previous run so old files do not end up in the
    # new archives.
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)

    if images:
        image_urls = find_files(url, soup, 'image')
        download_files(image_urls, 'images')
    if text:
        text_content = find_files(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:  # Only create the file if there is text to write
            # Explicit UTF-8: scraped pages routinely contain characters the
            # platform default encoding cannot represent.
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                text_file.writelines(f'{line}\n' for line in text_content)

    # Only non-empty folders are zipped; otherwise the slot stays None.
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = create_zip_file('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = create_zip_file('text')
    return images_zip_file, text_zip_file
def web_scraping(url, file_types):
    """Validate the form input and run the scrape.

    Args:
        url: Text entered in the URL box; must be non-empty and start with
            ``https://``.
        file_types: Selected media types; must not be empty. ``"Images"``
            and ``"Text"`` switch on the corresponding scrape.

    Returns:
        Whatever ``scrape_website`` returns for the validated input.

    Raises:
        gr.Error: When any of the checks above fails.
    """
    # Checks run in the same order as the form fields; the first failing
    # one decides the message.
    problem = None
    if not url:
        problem = "URL cannot be empty."
    elif not url.startswith("https://"):
        problem = "The URL must begin with https://"
    elif not file_types:
        problem = "At least one media type must be selected."
    if problem is not None:
        raise gr.Error(problem)

    wants_images = "Images" in file_types
    wants_text = "Text" in file_types
    return scrape_website(url, wants_images, wants_text)
# UI layout: a wide column with the inputs and a narrow column with the two
# downloadable archives. NOTE(review): the nesting below is reconstructed,
# since the captured source carried no indentation; confirm against the
# deployed app.
with gr.Blocks(theme="dwancin/yellow", css=".lg.svelte-1ipelgc {max-height: 60px !important;}") as app:
    with gr.Row():
        with gr.Column(scale=2):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
                info="Example: https://en.wikipedia.org/wiki/Main_Page",
            )
            media_types = gr.CheckboxGroup(
                [
                    "Images",
                    "Text",
                ],
                # The component's value is a list of selected choices, so the
                # default is given in that form.
                value=["Images"],
                label="Media types",
            )
            submit_button = gr.Button(
                "Scrape",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=1):
            output_images_zip_file = gr.File(label="Images ZIP-file")
            output_text_zip_file = gr.File(label="Text ZIP-file")
    # The two return values of web_scraping map onto the two file outputs.
    submit_button.click(web_scraping, inputs=[url_name, media_types], outputs=[output_images_zip_file, output_text_zip_file])

# Start the server only when run as a script, so importing this module
# (e.g. to reuse the scraping functions) has no side effect.
if __name__ == "__main__":
    app.launch()