# web-scraping / app.py
# rstallman's picture
# Duplicate from dwancin/web-scraping
# c68cb37
import os
import re
import requests
import uuid
import zipfile
import hashlib
import shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Function to validate URLs
def is_valid(url):
    """Return True when *url* has both a scheme and a network location."""
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)
# Function to find files on webpage
def find_files(url, soup, file_type):
    """Collect the resources of one kind found in a parsed page.

    Args:
        url: Address of the page; relative links are resolved against it.
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
        file_type: ``"image"`` selects ``<img>`` sources whose address
            mentions a known image extension (matched case-insensitively),
            ``"text"`` selects the text of paragraph, heading, ``span`` and
            ``strong`` tags, and any other value selects ``<a>`` links whose
            ``href`` contains that value.

    Returns:
        A list of absolute URLs, or a list of text strings for ``"text"``.
    """
    files = []
    if file_type == "image":
        extensions = ('jpg', 'jpeg', 'png', 'svg', 'gif')
        for image in soup.find_all('img'):
            source = image.get('src')
            # An <img> without a src attribute yields None; skip it rather
            # than fail on the substring test below.
            if source is None:
                continue
            lowered = source.lower()  # so "photo.JPG" is recognised as well
            if any(extension in lowered for extension in extensions):
                files.append(source if is_valid(source) else urljoin(url, source))
    elif file_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag_name in text_tags:
            for element in soup.find_all(tag_name):
                files.append(element.get_text())
    else:
        for link in soup.find_all('a'):
            target = link.get('href')
            # Anchors without an href (e.g. named anchors) yield None.
            if target is None:
                continue
            if file_type in target:
                files.append(target if is_valid(target) else urljoin(url, target))
    return files
# Function to download files
def download_files(urls, folder_name):
    """Download every URL in *urls* into *folder_name*.

    The folder is created when missing. Each file is stored under a name
    built from the MD5 hex digest of its URL (used only for naming), a short
    random suffix and the extension found in the URL path. URLs that cannot
    be fetched, or that answer with an HTTP error status, are reported and
    skipped so one bad link does not abort the whole batch.

    Args:
        urls: Iterable of URLs to fetch.
        folder_name: Directory that receives the downloaded files.
    """
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        try:
            # A timeout keeps an unresponsive host from blocking forever.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            print(f"Skipped file: {url} ({err})")
            continue

        # Take the extension from the URL path so query strings and dots in
        # the host name cannot leak into the file name.
        file_extension = os.path.splitext(urlparse(url).path)[1]
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}{file_extension}'
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)  # Replace special characters with underscores
        file_name = file_name[:255]  # Truncate the file name to avoid exceeding the limit

        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
# Function to create zip file
def create_zip_file(folder_name):
    """Zip the entries of *folder_name* into ``<folder_name>.zip``.

    Returns the archive path, or an empty string when the folder holds
    nothing (no archive is written in that case).
    """
    entries = os.listdir(folder_name)
    if not entries:
        return ""

    archive_path = f'{folder_name}.zip'
    with zipfile.ZipFile(archive_path, 'w') as archive:
        for entry in entries:
            archive.write(f'{folder_name}/{entry}')
    return archive_path
# Function to scrape website
def scrape_website(url, images=False, text=False):
    """Fetch *url* and package the requested content as ZIP archives.

    Args:
        url: Address of the page to scrape.
        images: When true, download the page's images into ``images/`` and
            zip that folder.
        text: When true, write the page's text to ``text/content.txt`` and
            zip that folder.

    Returns:
        A ``(images_zip_file, text_zip_file)`` tuple of archive paths. An
        entry is ``None`` when that kind was not requested or its folder
        ended up empty.

    Raises:
        gr.Error: If the page cannot be fetched or answers with an HTTP
            error status.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception if the GET request was unsuccessful
    except (requests.exceptions.RequestException, ValueError) as err:
        # Keep the original failure attached for debugging.
        raise gr.Error(f"Unable to access URL: {url}") from err

    soup = BeautifulSoup(response.content, 'html.parser')

    # Clear the contents of the folders
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)

    # Download files
    if images:
        image_urls = find_files(url, soup, 'image')
        download_files(image_urls, 'images')
    if text:
        text_content = find_files(url, soup, 'text')
        os.makedirs('text', exist_ok=True)  # Make sure the directory exists before writing
        if text_content:  # Only create the file if there is text to write
            # Explicit UTF-8 so non-ASCII page text does not depend on the
            # platform's default encoding.
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                text_file.writelines(f'{line}\n' for line in text_content)

    # Create zip files and return paths
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = create_zip_file('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = create_zip_file('text')
    return images_zip_file, text_zip_file
# Function for web scraping
def web_scraping(url, file_types):
    """Validate the form input and run the scrape.

    Raises ``gr.Error`` when the URL is empty, does not begin with
    ``https://``, or when no media type is selected. Otherwise returns
    whatever ``scrape_website`` returns for the selected media types.
    """
    # Checks run in order; the first one that fails decides the message.
    if not url:
        problem = "URL cannot be empty."
    elif not url.startswith("https://"):
        problem = "The URL must begin with https://"
    elif not file_types:
        problem = "At least one media type must be selected."
    else:
        problem = None

    if problem is not None:
        raise gr.Error(problem)

    wants_images = "Images" in file_types
    wants_text = "Text" in file_types
    return scrape_website(url, wants_images, wants_text)
# Gradio interface: URL and media-type inputs, a submit button, and two
# file outputs for the resulting archives.
# NOTE(review): the nesting of the `with` blocks below was reconstructed
# from a source whose indentation was lost - confirm the intended layout.
with gr.Blocks(
    theme="dwancin/yellow",
    css=".lg.svelte-1ipelgc {max-height: 60px !important;}",
) as app:
    with gr.Row():
        with gr.Column(scale=2):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
                info="Example: https://en.wikipedia.org/wiki/Main_Page",
            )
            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value="Images",
                label="Media types",
            )
            submit_button = gr.Button("Scrape", variant="primary", interactive=True)
        with gr.Column(scale=1):
            output_images_zip_file = gr.File(label="Images ZIP-file")
            output_text_zip_file = gr.File(label="Text ZIP-file")

    # Clicking the button sends the two inputs to web_scraping and shows
    # its two return values in the file components.
    submit_button.click(
        web_scraping,
        inputs=[url_name, media_types],
        outputs=[output_images_zip_file, output_text_zip_file],
    )

app.launch()