Spaces:
Runtime error
Runtime error
File size: 5,615 Bytes
c68cb37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import os
import re
import requests
import uuid
import zipfile
import hashlib
import shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Function to validate URLs
def is_valid(url):
    """Return True when *url* is absolute, i.e. carries both a scheme and a host."""
    parts = urlparse(url)
    return all((parts.scheme, parts.netloc))
# Function to find files on webpage
def find_files(url, soup, file_type):
    """Collect matching resources (or text) from a parsed web page.

    Args:
        url: Base URL of the page, used to resolve relative links.
        soup: BeautifulSoup document for the page.
        file_type: "image" to collect <img> sources whose URL contains a
            known image extension, "text" to collect visible text content,
            or any other string used as a substring match against
            <a href> values (e.g. "pdf").

    Returns:
        A list of absolute URLs, or plain text strings when
        file_type == "text".
    """
    files = []
    if file_type == "image":
        extensions = ('jpg', 'jpeg', 'png', 'svg', 'gif')
        for img in soup.find_all('img'):
            src = img.get('src')
            # Some <img> tags have no src attribute; the original crashed
            # here with a TypeError on None.
            if not src:
                continue
            if any(ext in src for ext in extensions):
                files.append(src if is_valid(src) else urljoin(url, src))
    elif file_type == "text":
        text_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong')
        for tag_name in text_tags:
            for element in soup.find_all(tag_name):
                files.append(element.get_text())
    else:
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href (e.g. named anchors) yield None.
            if not href:
                continue
            if file_type in href:
                files.append(href if is_valid(href) else urljoin(url, href))
    return files
# Function to download files
def download_files(urls, folder_name):
    """Download each URL into *folder_name* under a collision-safe name.

    A failed download is reported and skipped so one bad URL does not
    abort the whole batch (the original let the exception propagate).

    Args:
        urls: Iterable of absolute file URLs.
        folder_name: Target directory; created if missing.
    """
    os.makedirs(folder_name, exist_ok=True)
    for file_url in urls:
        try:
            response = requests.get(file_url, stream=True, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print(f"Skipping {file_url}: {exc}")
            continue
        # Crude extension guess: last dot-segment, trimmed at any query '&'.
        file_extension = file_url.split(".")[-1].split("&")[0]
        url_hash = hashlib.md5(file_url.encode()).hexdigest()
        # Short UUID suffix keeps names unique even for duplicate URLs.
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = file_name[:255]  # Truncate to stay within filename limits
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)  # Replace special characters with underscores
        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
            # Stream in chunks so large files are not held fully in memory
            # (the original read .content, defeating stream=True).
            for chunk in response.iter_content(chunk_size=65536):
                out_file.write(chunk)
        print(f"Downloaded file: {file_name}")
# Function to create zip file
def create_zip_file(folder_name):
    """Archive every file in *folder_name* into '<folder_name>.zip'.

    Returns the archive path, or "" when the folder contains nothing
    (no empty zip is created in that case).
    """
    entries = os.listdir(folder_name)
    if not entries:
        return ""
    archive_path = f'{folder_name}.zip'
    with zipfile.ZipFile(archive_path, 'w') as archive:
        for entry in entries:
            archive.write(f'{folder_name}/{entry}')
    return archive_path
# Function to scrape website
def scrape_website(url, images=False, text=False):
    """Fetch *url* and package the requested media types into zip files.

    Args:
        url: Page to scrape.
        images: When True, download page images into an 'images' folder.
        text: When True, extract visible text into 'text/content.txt'.

    Returns:
        Tuple (images_zip_path, text_zip_path); an entry is None when that
        media type was not requested or nothing was collected.

    Raises:
        gr.Error: If the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception if the GET request was unsuccessful
    except (requests.exceptions.RequestException, ValueError):
        # The original had an unreachable `return None, None` after this raise.
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Start from a clean slate so stale files from earlier runs are not re-zipped.
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)
    if images:
        image_urls = find_files(url, soup, 'image')
        download_files(image_urls, 'images')
    if text:
        text_content = find_files(url, soup, 'text')
        os.makedirs('text', exist_ok=True)  # Make sure the directory exists before writing
        if text_content:  # Only create the file if there is text to write
            # Explicit UTF-8: scraped pages routinely contain non-ASCII text
            # and the platform default encoding may not handle it.
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')
    # Create zip files and return paths
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = create_zip_file('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = create_zip_file('text')
    return images_zip_file, text_zip_file
# Function for web scraping
def web_scraping(url, file_types):
    """Validate the UI inputs, then delegate to scrape_website().

    Raises:
        gr.Error: For an empty URL, a non-https URL, or when no media
            type checkbox is selected.
    """
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not file_types:
        raise gr.Error("At least one media type must be selected.")
    want_images = "Images" in file_types
    want_text = "Text" in file_types
    return scrape_website(url, want_images, want_text)
# Gradio UI: URL + media-type inputs on the left, downloadable zip outputs on the right.
# The custom CSS caps the height of the submit button's container.
with gr.Blocks(theme="dwancin/yellow", css=".lg.svelte-1ipelgc {max-height: 60px !important;}") as app:
    with gr.Row():
        with gr.Column(scale=2):
            # Website URL to scrape (must start with https:// — enforced in web_scraping).
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
                info="Example: https://en.wikipedia.org/wiki/Main_Page",
            )
            # Which media types to collect; "Images" pre-selected.
            # NOTE(review): value is a bare string — gradio's CheckboxGroup
            # conventionally takes a list here; confirm this pre-selects as intended.
            media_types = gr.CheckboxGroup(
                [
                    "Images",
                    "Text",
                ],
                value="Images",
                label="Media types",
            )
            submit_button = gr.Button(
                "Scrape",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=1):
            # Output slots for the zip files produced by web_scraping().
            output_images_zip_file = gr.File(label="Images ZIP-file")
            output_text_zip_file = gr.File(label="Text ZIP-file")
    # Wire the button to the scraper; outputs map to the two File components.
    submit_button.click(web_scraping, inputs=[url_name, media_types], outputs=[output_images_zip_file, output_text_zip_file])
app.launch()