Spaces:

rstallman
/

web-scraping

Runtime error

App Files Files Community

web-scraping / app.py

rstallman

Duplicate from dwancin/web-scraping

c68cb37 over 2 years ago

raw

history blame contribute delete

5.62 kB

	import os
	import re
	import requests
	import uuid
	import zipfile
	import hashlib
	import shutil
	import gradio as gr
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse

	# Function to validate URLs
	def is_valid(url):
	    """Report whether *url* is absolute, i.e. carries both a scheme and a host.

	    Args:
	        url: The URL string to inspect.

	    Returns:
	        True when ``urlparse`` yields a non-empty network location and a
	        non-empty scheme; False otherwise (for example for relative paths).
	    """
	    parts = urlparse(url)
	    # A host without a scheme ("//host/x") is not enough, and neither is a
	    # scheme without a host ("mailto:x"), so both pieces are checked.
	    if not parts.netloc:
	        return False
	    return bool(parts.scheme)

	# Function to find files on webpage
	def find_files(url, soup, file_type):
	    """Collect matching resources from a parsed page.

	    Args:
	        url: Address of the page; used as the base when resolving relative links.
	        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
	        file_type: ``"image"`` collects ``<img>`` sources whose value contains
	            a known image extension, ``"text"`` collects the text of
	            paragraph, heading, ``span`` and ``strong`` tags, and any other
	            string collects ``<a>`` links whose ``href`` contains that string.

	    Returns:
	        A list of absolute URLs (image and link modes) or of text snippets
	        (text mode), in the order ``find_all`` yields them.
	    """
	    files = []
	    if file_type == "text":
	        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
	        for tag_name in text_tags:
	            files.extend(element.get_text() for element in soup.find_all(tag_name))
	        return files

	    # Image and link modes differ only in which tag/attribute is read and
	    # which substrings qualify a value.
	    if file_type == "image":
	        tag_name, attribute = 'img', 'src'
	        needles = ['jpg', 'jpeg', 'png', 'svg', 'gif']
	    else:
	        tag_name, attribute = 'a', 'href'
	        needles = [file_type]

	    for element in soup.find_all(tag_name):
	        target = element.get(attribute)
	        # Tags lacking the attribute (e.g. <a name="top">) yield None; skip them
	        # rather than failing on the substring test below.
	        if target is None:
	            continue
	        if any(needle in target for needle in needles):
	            # Relative references are resolved against the page address.
	            files.append(target if is_valid(target) else urljoin(url, target))
	    return files




	# Function to download files
	def download_files(urls, folder_name):
	    """Download every URL in *urls* into *folder_name*.

	    Each file is stored as ``<md5 of url>-<random id><extension>`` so that
	    equal base names coming from different URLs never collide. A download
	    that fails (network error, timeout, HTTP error status, unsupported
	    scheme) is reported and skipped, so one bad link does not abort the batch.

	    Args:
	        urls: Iterable of absolute URLs to fetch.
	        folder_name: Directory to write into; created when missing.
	    """
	    os.makedirs(folder_name, exist_ok=True)
	    for url in urls:
	        try:
	            # The timeout keeps a single stalled server from hanging the scrape.
	            response = requests.get(url, timeout=10)
	            response.raise_for_status()
	        except requests.exceptions.RequestException as err:
	            print(f"Skipped file: {url} ({err})")
	            continue
	        # Take the extension from the URL path only, so query strings such as
	        # "?width=100" never leak into the file name.
	        file_extension = os.path.splitext(urlparse(url).path)[1]
	        url_hash = hashlib.md5(url.encode()).hexdigest()
	        unique_id = str(uuid.uuid4())[:8]
	        file_name = f'{url_hash}-{unique_id}{file_extension}'
	        file_name = re.sub(r'[\\/:"*?<>\|]+', '_', file_name)  # Replace special characters with underscores
	        file_name = file_name[:255]  # Stay within the common file-name length limit
	        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
	            out_file.write(response.content)
	        print(f"Downloaded file: {file_name}")

	# Function to create zip file
	def create_zip_file(folder_name):
	    """Bundle the entries of *folder_name* into ``<folder_name>.zip``.

	    Args:
	        folder_name: Path of the directory to archive.

	    Returns:
	        The path of the written archive, or an empty string when the
	        directory has no entries (no archive is created in that case).
	    """
	    # Nothing to archive: report that with an empty path instead of an empty zip.
	    if not os.listdir(folder_name):
	        return ""
	    archive_path = f'{folder_name}.zip'
	    with zipfile.ZipFile(archive_path, 'w') as archive:
	        for entry in os.listdir(folder_name):
	            archive.write(f'{folder_name}/{entry}')
	    return archive_path



	# Function to scrape website
	def scrape_website(url, images=False, text=False):
	    """Fetch *url* and package the requested content into ZIP archives.

	    Args:
	        url: Address of the page to scrape.
	        images: When true, download the page's images into ``images/`` and
	            zip that folder.
	        text: When true, write the page's text to ``text/content.txt`` and
	            zip that folder.

	    Returns:
	        A ``(images_zip_path, text_zip_path)`` tuple; an item is ``None`` when
	        that media type was not requested or produced no files.

	    Raises:
	        gr.Error: If the page cannot be fetched or answers with an HTTP
	            error status.
	    """
	    try:
	        response = requests.get(url, timeout=10)
	        response.raise_for_status()  # Raise an exception if the GET request was unsuccessful
	    except (requests.exceptions.RequestException, ValueError) as err:
	        # Chain the cause so the underlying network error stays in the traceback.
	        raise gr.Error(f"Unable to access URL: {url}") from err

	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Clear the contents of the folders so files from an earlier run are not re-zipped
	    if images:
	        shutil.rmtree('images', ignore_errors=True)
	    if text:
	        shutil.rmtree('text', ignore_errors=True)

	    # Download files
	    if images:
	        image_urls = find_files(url, soup, 'image')
	        download_files(image_urls, 'images')
	    if text:
	        text_content = find_files(url, soup, 'text')
	        os.makedirs('text', exist_ok=True)  # Make sure the directory exists before writing
	        if text_content:  # Only create the file if there is text to write
	            # Explicit UTF-8: the platform default encoding may be unable to
	            # represent characters found on the scraped page.
	            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
	                text_file.writelines(f"{line}\n" for line in text_content)

	    # Create zip files and return paths
	    images_zip_file, text_zip_file = None, None
	    if images and os.path.exists('images') and os.listdir('images'):
	        images_zip_file = create_zip_file('images')
	    if text and os.path.exists('text') and os.listdir('text'):
	        text_zip_file = create_zip_file('text')

	    return images_zip_file, text_zip_file



	# Function for web scraping
	def web_scraping(url, file_types):
	    """Validate the submitted form values and run the scrape.

	    Args:
	        url: Address of the page to scrape; must start with ``https://``.
	        file_types: Selected media types; ``"Images"`` and ``"Text"`` are
	            the values that are looked for.

	    Returns:
	        Whatever ``scrape_website`` returns for the validated input.

	    Raises:
	        gr.Error: If the URL is empty, does not begin with ``https://``, or
	            no media type is selected.
	    """
	    # Rules are checked in a fixed order; the first failing one picks the message.
	    problem = None
	    if not url:
	        problem = "URL cannot be empty."
	    elif not url.startswith("https://"):
	        problem = "The URL must begin with https://"
	    elif not file_types:
	        problem = "At least one media type must be selected."
	    if problem is not None:
	        raise gr.Error(problem)

	    return scrape_website(url, "Images" in file_types, "Text" in file_types)

	with gr.Blocks(theme="dwancin/yellow", css=".lg.svelte-1ipelgc {max-height: 60px !important;}") as app:
	    # NOTE(review): the nesting below is reconstructed from a listing whose
	    # indentation was lost; confirm it against the deployed layout.
	    with gr.Row():
	        # Input controls.
	        with gr.Column(scale=2):
	            url_name = gr.Textbox(
	                label="Website",
	                info="Example: https://en.wikipedia.org/wiki/Main_Page",
	                placeholder="Enter URL here",
	                show_label=True,
	            )
	            media_types = gr.CheckboxGroup(["Images", "Text"], value="Images", label="Media types")
	            submit_button = gr.Button("Scrape", variant="primary", interactive=True)
	        # One file slot per archive.
	        with gr.Column(scale=1):
	            output_images_zip_file = gr.File(label="Images ZIP-file")
	            output_text_zip_file = gr.File(label="Text ZIP-file")
	    # The two outputs line up with the (images, text) pair web_scraping returns.
	    submit_button.click(
	        web_scraping,
	        inputs=[url_name, media_types],
	        outputs=[output_images_zip_file, output_text_zip_file],
	    )
	app.launch()