Spaces:

maaz21
/

img.scrapper

Sleeping

App Files Files Community

img.scrapper / src /streamlit_app.py

maaz21

Update src/streamlit_app.py

ccffe7a verified 10 months ago

raw

history blame contribute delete

17.7 kB

	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse
	import os
	import zipfile
	from io import BytesIO
	import time
	from PIL import Image
	import hashlib

	def is_valid_url(url):
	    """Check whether *url* parses with both a scheme and a network location.

	    Args:
	        url: The user-supplied address to validate.

	    Returns:
	        True when ``urlparse`` yields a non-empty scheme and netloc,
	        otherwise False. Input that cannot be parsed at all counts as invalid.
	    """
	    try:
	        result = urlparse(url)
	    except (ValueError, TypeError, AttributeError):
	        # urlparse raises ValueError for malformed netlocs (for example an
	        # unbalanced IPv6 bracket) and TypeError/AttributeError for values
	        # that are not strings. Anything else is a real bug and should surface.
	        return False
	    return all([result.scheme, result.netloc])

	def extract_css_background_images(css_content, base_url):
	    """Extract background image URLs from CSS content.

	    Both ``background-image`` and the ``background`` shorthand are handled,
	    including declarations that put other tokens before ``url(...)`` and
	    declarations that list several layers.

	    Args:
	        css_content: CSS text to scan. Empty or None yields an empty list.
	        base_url: URL that relative references are resolved against.

	    Returns:
	        A list of absolute URLs in order of appearance. ``data:`` URIs are
	        skipped because they are inline payloads, not downloadable files.
	    """
	    import re

	    if not css_content:
	        return []

	    # Value of every `background:` / `background-image:` declaration. The
	    # value stops at ';', '{' or '}' so one declaration never runs into the next.
	    declaration_pattern = r'background(?:-image)?\s*:([^;{}]*)'
	    # url( ... ) with optional quotes and optional surrounding whitespace.
	    url_pattern = r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'

	    image_urls = []
	    for value in re.findall(declaration_pattern, css_content, re.IGNORECASE):
	        for match in re.findall(url_pattern, value, re.IGNORECASE):
	            if match and not match.startswith('data:'):
	                image_urls.append(urljoin(base_url, match))

	    return image_urls

	def get_image_urls(url):
	    """Collect candidate image URLs referenced by a single web page.

	    The page is fetched once and scanned for images in ``<img>`` and
	    ``<picture>``/``<source>`` attributes (including lazy-loading and srcset
	    variants), inline ``style`` backgrounds, assorted ``data-*`` attributes,
	    ``<style>`` blocks, the first five linked stylesheets, ``<meta>`` tags,
	    SVG ``<image>`` elements, Shopify-style markup and JSON-LD scripts.

	    Args:
	        url: Absolute URL of the page to scan.

	    Returns:
	        A sorted list of de-duplicated absolute image URLs. When the page
	        cannot be fetched or parsed, the error is reported through
	        ``st.error`` and an empty list is returned.
	    """
	    import json
	    import re

	    # url( ... ) references inside inline style attributes.
	    css_url_re = re.compile(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)
	    # Absolute URLs embedded in Shopify bgset/widths/srcset style attributes.
	    absolute_url_re = re.compile(r'https?://[^\s,]+')
	    # Absolute URLs ending in a known image extension (used on JSON-LD text).
	    json_image_re = re.compile(
	        r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', re.IGNORECASE)
	    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')

	    try:
	        headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
	        }

	        response = requests.get(url, headers=headers, timeout=15)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')
	        image_urls = set()  # Use set to avoid duplicates

	        def add_url(candidate):
	            """Resolve *candidate* against the page URL and record it."""
	            if not isinstance(candidate, str):
	                return
	            candidate = candidate.strip()
	            if not candidate or candidate.startswith('data:'):
	                return
	            try:
	                image_urls.add(urljoin(url, candidate))
	            except ValueError:
	                # urljoin rejects some malformed references (e.g. a broken
	                # IPv6 host); one bad attribute must not abort the crawl.
	                pass

	        def add_attribute(attr, value):
	            """Record an attribute value, splitting srcset-style lists."""
	            if not isinstance(value, str) or not value or value.startswith('data:'):
	                return
	            # NOTE: any value containing a comma is treated as a srcset-style
	            # list, so a single URL with commas in its path is split as well.
	            if 'srcset' in attr.lower() or ',' in value:
	                for url_part in value.split(','):
	                    add_url(url_part.strip().split(' ')[0])
	            else:
	                add_url(value)

	        def add_css(css_text, css_base_url):
	            """Record background images found in a block of CSS text."""
	            try:
	                image_urls.update(extract_css_background_images(css_text, css_base_url))
	            except ValueError:
	                # Malformed URL inside the CSS; skip this stylesheet only.
	                pass

	        # 1. Standard img tags with multiple attribute checks
	        img_attrs = ('src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
	                     'data-image', 'data-bg', 'data-background', 'data-large-image')
	        for img in soup.find_all('img'):
	            for attr in img_attrs:
	                add_attribute(attr, img.get(attr))

	        # 2. Picture and source tags
	        for tag in soup.find_all(['picture', 'source']):
	            for attr in ('src', 'srcset', 'data-src', 'data-srcset'):
	                add_attribute(attr, tag.get(attr))

	        # 3. Divs and other elements with background images in style attribute
	        for element in soup.find_all(attrs={'style': True}):
	            style = element.get('style', '')
	            if isinstance(style, str) and 'background' in style.lower():
	                for match in css_url_re.findall(style):
	                    add_url(match)

	        # 4. Elements with data attributes that might contain image URLs
	        data_attrs = ('data-background-image', 'data-bg-src', 'data-hero-image',
	                      'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
	                      'data-full-size', 'data-zoom-image', 'data-lightbox')
	        for attr in data_attrs:
	            for element in soup.find_all(attrs={attr: True}):
	                add_url(element.get(attr))

	        # 5. CSS background images from <style> tags
	        for style_tag in soup.find_all('style'):
	            if style_tag.string:
	                add_css(style_tag.string, url)

	        # 6. External CSS files (limited to the first 5 to avoid overload)
	        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
	            css_url = link.get('href')
	            if not css_url:
	                continue
	            try:
	                css_absolute_url = urljoin(url, css_url)
	                css_response = requests.get(css_absolute_url, headers=headers, timeout=10)
	            except (requests.RequestException, ValueError):
	                continue  # Skip if CSS file can't be loaded
	            if css_response.status_code == 200:
	                # Relative url() references in a stylesheet are relative to
	                # the stylesheet itself, not to the page that links it.
	                add_css(css_response.text, css_absolute_url)

	        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
	        for meta in soup.find_all('meta'):
	            for attr in ('content', 'value'):
	                value = meta.get(attr, '')
	                if isinstance(value, str) and any(ext in value.lower() for ext in image_extensions):
	                    # urljoin leaves absolute URLs untouched and resolves the rest.
	                    add_url(value)

	        # 8. SVG images embedded through <image> elements
	        for svg in soup.find_all('svg'):
	            for img in svg.find_all('image'):
	                add_url(img.get('href') or img.get('xlink:href'))

	        # 9. Shopify specific selectors
	        shopify_selectors = [
	            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
	            '.card__media img', '.product__media img', '[data-shopify]'
	        ]
	        for selector in shopify_selectors:
	            try:
	                elements = soup.select(selector)
	            except Exception:
	                # Best effort: a selector the parser rejects must not abort the crawl.
	                continue
	            for element in elements:
	                for attr in ('src', 'data-src', 'data-bgset', 'data-widths', 'srcset'):
	                    value = element.get(attr)
	                    if not isinstance(value, str) or not value or value.startswith('data:'):
	                        continue
	                    if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
	                        # Complex attribute formats: pull out every absolute URL
	                        image_urls.update(absolute_url_re.findall(value))
	                    else:
	                        add_url(value)

	        # 10. Look for JSON-LD structured data
	        for script in soup.find_all('script', {'type': 'application/ld+json'}):
	            if not script.string:
	                continue
	            try:
	                data = json.loads(script.string)
	            except ValueError:
	                continue  # Malformed JSON-LD is ignored
	            image_urls.update(json_image_re.findall(json.dumps(data)))

	        # Filter out obviously invalid URLs; sort so the result is deterministic
	        valid_image_urls = [
	            img_url for img_url in image_urls
	            if img_url and len(img_url) > 10 and not img_url.startswith('data:')
	            and '.' in img_url and ('http' in img_url or img_url.startswith('//'))
	        ]
	        return sorted(valid_image_urls)

	    except requests.RequestException as e:
	        st.error(f"Error fetching the webpage: {str(e)}")
	        return []
	    except Exception as e:
	        st.error(f"Error parsing the webpage: {str(e)}")
	        return []

	def download_image(url, session):
	    """Download a single image and derive a unique archive filename for it.

	    Args:
	        url: Absolute URL of the image.
	        session: A ``requests.Session``-like object used to issue the GET.

	    Returns:
	        A ``(content, filename, error)`` tuple. On success ``content`` holds
	        the raw bytes, ``filename`` is ``<name>_<hash>.<ext>`` and ``error``
	        is None. On failure ``content`` and ``filename`` are None and
	        ``error`` is a short human-readable message.
	    """
	    known_extensions = ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico')
	    # Ordered (content-type substring, extension) fallbacks, used when the
	    # URL path does not end in a recognised image extension.
	    content_type_extensions = (
	        ('jpeg', 'jpg'), ('png', 'png'), ('gif', 'gif'), ('webp', 'webp'),
	        ('svg', 'svg'), ('bmp', 'bmp'), ('icon', 'ico'),
	    )

	    try:
	        response = session.get(url, timeout=15, stream=True)
	        try:
	            response.raise_for_status()

	            # Check if the response contains image data
	            content_type = response.headers.get('content-type', '').lower()
	            if not any(img_type in content_type for img_type in ['image/', 'application/octet-stream']):
	                return None, None, f"Not an image: {content_type}"

	            # Get image content
	            image_content = response.content
	        finally:
	            # With stream=True the connection stays checked out until the
	            # body is consumed or the response is closed, so always release
	            # it, including on the early "not an image" return.
	            response.close()

	        # Skip very small files (likely 1x1 tracking pixels)
	        if len(image_content) < 500:
	            return None, None, "Image too small (likely tracking pixel)"

	        # Short hash of the full URL keeps names unique across the archive
	        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

	        # Work on the URL path only so query strings and fragments never
	        # leak into the name or hide the extension.
	        last_segment = urlparse(url).path.split('/')[-1]
	        if '.' in last_segment:
	            original_name = last_segment.split('.')[0][:20] or "image"  # Limit length
	            url_ext = last_segment.rsplit('.', 1)[-1].lower()
	        else:
	            original_name = "image"
	            url_ext = ''
	        filename = f"{original_name}_{url_hash}"

	        # Prefer a recognised extension from the URL, then the content-type,
	        # then fall back to .jpg so every file gets an extension.
	        if url_ext in known_extensions:
	            ext = url_ext
	        else:
	            ext = next(
	                (candidate for marker, candidate in content_type_extensions
	                 if marker in content_type),
	                'jpg',
	            )
	        filename += f".{ext}"

	        return image_content, filename, None

	    except requests.RequestException as e:
	        return None, None, f"Download error: {str(e)}"
	    except Exception as e:
	        return None, None, f"Unexpected error: {str(e)}"

	def create_zip_file(images_data):
	    """Pack downloaded images into an in-memory, deflate-compressed ZIP.

	    Args:
	        images_data: Iterable of ``(filename, image_content)`` pairs.

	    Returns:
	        A ``BytesIO`` holding the finished archive, rewound to position 0.
	    """
	    archive_bytes = BytesIO()
	    archive = zipfile.ZipFile(archive_bytes, mode='w', compression=zipfile.ZIP_DEFLATED)
	    try:
	        for entry_name, payload in images_data:
	            archive.writestr(entry_name, payload)
	    finally:
	        # Closing writes the central directory; without it the ZIP is unreadable.
	        archive.close()

	    archive_bytes.seek(0)
	    return archive_bytes

	def main():
	    """Render the Streamlit UI: crawl a page, download its images, offer a ZIP.

	    Flow: read a URL from the user, validate it, collect image URLs with
	    ``get_image_urls``, download each with ``download_image`` while updating
	    a progress bar, then show the results, a ZIP download button and a
	    preview of up to eight images. Usage instructions are rendered at the end.
	    """
	    st.set_page_config(
	        page_title="Website Image Crawler",
	        page_icon="🖼️",
	        layout="wide"
	    )

	    st.title("🖼️ Website Image Crawler")
	    st.markdown("Enter a website URL to extract and download all images from that page.")

	    # URL input; pasted URLs often carry stray surrounding whitespace
	    url = st.text_input("Enter Website URL:", placeholder="https://example.com").strip()

	    col1, _ = st.columns([1, 4])

	    with col1:
	        crawl_button = st.button("🔍 Crawl Images", type="primary")

	    if crawl_button and url:
	        if not is_valid_url(url):
	            st.error("Please enter a valid URL (including http:// or https://)")
	            return

	        with st.spinner("Crawling website for images..."):
	            # Get image URLs
	            image_urls = get_image_urls(url)

	        if not image_urls:
	            st.warning("No images found on the provided webpage.")
	            return

	        st.success(f"Found {len(image_urls)} images on the webpage!")

	        # Show found URLs in an expander
	        with st.expander(f"Found Image URLs ({len(image_urls)})"):
	            for i, img_url in enumerate(image_urls, 1):
	                st.text(f"{i}. {img_url}")

	        # Download images
	        st.subheader("Downloading Images...")

	        progress_bar = st.progress(0)
	        status_text = st.empty()
	        downloaded_images = []
	        failed_downloads = []

	        # One session reuses connections; the with-block guarantees it is
	        # closed even if a download raises unexpectedly.
	        with requests.Session() as session:
	            session.headers.update({
	                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	                'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
	                'Accept-Language': 'en-US,en;q=0.9',
	                'Accept-Encoding': 'gzip, deflate, br',
	                'DNT': '1',
	                'Connection': 'keep-alive',
	                'Upgrade-Insecure-Requests': '1'
	            })

	            total = len(image_urls)
	            for i, img_url in enumerate(image_urls):
	                status_text.text(f"Downloading image {i+1}/{total}: {img_url[:50]}...")

	                image_content, filename, error = download_image(img_url, session)

	                if image_content and filename:
	                    downloaded_images.append((filename, image_content))
	                else:
	                    failed_downloads.append((img_url, error))

	                progress_bar.progress((i + 1) / total)
	                time.sleep(0.1)  # Small delay to avoid overwhelming the server

	        # Show results
	        st.subheader("Download Results")

	        col1, col2 = st.columns(2)

	        with col1:
	            st.metric("✅ Successfully Downloaded", len(downloaded_images))

	        with col2:
	            st.metric("❌ Failed Downloads", len(failed_downloads))

	        # Show failed downloads
	        if failed_downloads:
	            with st.expander("Failed Downloads"):
	                for img_url, error in failed_downloads:
	                    st.text(f"❌ {img_url}")
	                    st.text(f" Error: {error}")
	                    st.text("")

	        # Create download button for ZIP file
	        if downloaded_images:
	            st.subheader("Download All Images")

	            zip_buffer = create_zip_file(downloaded_images)

	            st.download_button(
	                label=f"📥 Download ZIP file ({len(downloaded_images)} images)",
	                data=zip_buffer.getvalue(),
	                file_name=f"images_{urlparse(url).netloc}_{int(time.time())}.zip",
	                mime="application/zip",
	                type="primary"
	            )

	            # Show preview of first few images
	            st.subheader("Image Preview")

	            preview_cols = st.columns(4)
	            preview_count = min(8, len(downloaded_images))

	            for i, (filename, image_content) in enumerate(downloaded_images[:preview_count]):
	                try:
	                    # Try to display image preview
	                    image = Image.open(BytesIO(image_content))

	                    with preview_cols[i % 4]:
	                        st.image(image, caption=filename, use_container_width=True)
	                except Exception:
	                    # If image can't be decoded or displayed, show filename only
	                    with preview_cols[i % 4]:
	                        st.text(f"📄 {filename}")

	            if len(downloaded_images) > preview_count:
	                st.text(f"... and {len(downloaded_images) - preview_count} more images")

	    elif crawl_button and not url:
	        st.error("Please enter a URL to crawl.")

	    # Instructions
	    st.markdown("---")
	    st.subheader("How to use:")
	    st.markdown("""
	    1. Enter a valid website URL (must include http:// or https://)
	    2. Click the "Crawl Images" button
	    3. Wait for the application to find and download all images
	    4. Download the ZIP file containing all images

	    Note: This enhanced crawler finds images from:
	    - Standard `<img>` tags with various lazy-loading attributes
	    - CSS background images (inline styles and external stylesheets)
	    - Shopify banners and product images
	    - Meta tags (Open Graph, Twitter Cards)
	    - JSON-LD structured data
	    - SVG embedded images
	    - Container elements with background images

	    It does not crawl subpages or follow links - only the main page content.
	    """)

	    st.markdown("---")
	    st.markdown("⚠️ Disclaimer: Please respect website terms of service and copyright laws when downloading images.")

	# Run the app only when executed as a script (e.g. via `streamlit run`),
	# so importing this module does not start the UI.
	if __name__ == "__main__":
	    main()