Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| import os | |
| import zipfile | |
| from io import BytesIO | |
| import time | |
| from PIL import Image | |
| import hashlib | |
def is_valid_url(url):
    """Return True when *url* is an absolute HTTP(S) URL.

    The URL must parse, use the ``http`` or ``https`` scheme (the schemes the
    UI asks the user to supply), and name a network location.

    Args:
        url: The user-supplied URL string.

    Returns:
        bool: True for a well-formed http/https URL with a host part,
        False otherwise (including values that cannot be parsed).
    """
    try:
        parsed = urlparse(url)
    except (AttributeError, TypeError, ValueError):
        # urlparse raises ValueError for malformed netlocs (e.g. an
        # unterminated IPv6 literal) and AttributeError/TypeError for
        # non-string input.
        return False
    # urlparse lower-cases the scheme, so a plain membership test suffices.
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
def extract_css_background_images(css_content, base_url):
    """Extract background image URLs from CSS text.

    Every ``url(...)`` token inside a ``background`` or ``background-image``
    declaration is collected, regardless of its position in the value, so
    shorthand such as ``background: #fff url(a.png) no-repeat`` and layered
    values such as ``url(a.png), url(b.png)`` are both handled.

    Args:
        css_content: CSS source text (a stylesheet or a ``<style>`` body).
        base_url: URL that relative references are resolved against.

    Returns:
        list[str]: Absolute image URLs in document order. ``data:`` URIs are
        skipped; duplicates are not removed.
    """
    import re  # kept local: the module-level imports do not include ``re``

    # Capture the whole declaration value. A complete url(...) token is
    # consumed atomically so a ';' inside it (e.g. in a data: URI) does not
    # end the declaration early.
    declaration_re = re.compile(
        r'background(?:-image)?\s*:((?:url\s*\([^)]*\)|[^;{}])*)',
        re.IGNORECASE,
    )
    url_re = re.compile(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)

    image_urls = []
    for declaration in declaration_re.findall(css_content):
        for match in url_re.findall(declaration):
            if match and not match.startswith('data:'):
                image_urls.append(urljoin(base_url, match))
    return image_urls
def _add_attr_image_urls(found, base_url, attr, value):
    """Resolve the image URL(s) held in one attribute value and add them to *found*."""
    if not value or value.startswith('data:'):
        return
    # srcset-style values are comma-separated "URL descriptor" candidates;
    # any value containing a comma is treated the same way.
    if 'srcset' in attr.lower() or ',' in value:
        for candidate in value.split(','):
            candidate_url = candidate.strip().split(' ')[0]
            if candidate_url:
                found.add(urljoin(base_url, candidate_url))
    else:
        found.add(urljoin(base_url, value))


def get_image_urls(url):
    """Collect image URLs referenced by the page at *url*.

    The page is fetched once and scanned for images in: ``<img>`` tags
    (including common lazy-loading attributes), ``<picture>``/``<source>``
    tags, inline ``style`` attributes, assorted ``data-*`` attributes,
    ``<style>`` blocks, the first five linked stylesheets, ``<meta>`` tags,
    SVG ``<image>`` elements, Shopify-style markup, and JSON-LD scripts.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        list[str]: De-duplicated image URLs, sorted so the order is stable
        between runs. An empty list is returned (after reporting the problem
        with ``st.error``) if the page cannot be fetched or parsed.
    """
    # Imported locally because the module-level imports do not include them.
    import json
    import re

    css_url_re = re.compile(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)
    # Absolute URLs anywhere in the value, or protocol-relative ones at the
    # start of a candidate.
    srcset_url_re = re.compile(r'https?://[^\s,]+|(?<![^\s,])//[^\s,]+')
    json_image_re = re.compile(
        r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', re.IGNORECASE
    )
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        image_urls = set()  # a set avoids duplicates

        # 1. Standard img tags, checking several lazy-loading attributes
        img_attrs = ('src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
                     'data-image', 'data-bg', 'data-background', 'data-large-image')
        for img in soup.find_all('img'):
            for attr in img_attrs:
                _add_attr_image_urls(image_urls, url, attr, img.get(attr))

        # 2. Picture and source tags
        for tag in soup.find_all(['picture', 'source']):
            for attr in ('src', 'srcset', 'data-src', 'data-srcset'):
                _add_attr_image_urls(image_urls, url, attr, tag.get(attr))

        # 3. Elements with background images in their style attribute
        for element in soup.find_all(attrs={'style': True}):
            style = element.get('style', '')
            if 'background' in style.lower() and 'url(' in style:
                for match in css_url_re.findall(style):
                    if match and not match.startswith('data:'):
                        image_urls.add(urljoin(url, match))

        # 4. Data attributes that might contain image URLs
        data_attrs = ('data-background-image', 'data-bg-src', 'data-hero-image',
                      'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
                      'data-full-size', 'data-zoom-image', 'data-lightbox')
        for attr in data_attrs:
            for element in soup.find_all(attrs={attr: True}):
                value = element.get(attr)
                if value and not value.startswith('data:'):
                    image_urls.add(urljoin(url, value))

        # 5. CSS background images from <style> tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                image_urls.update(extract_css_background_images(style_tag.string, url))

        # 6. External CSS files (first 5 only, to avoid overload)
        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
            css_href = link.get('href')
            if not css_href:
                continue
            try:
                css_absolute_url = urljoin(url, css_href)
                css_response = requests.get(css_absolute_url, headers=headers, timeout=10)
                if css_response.status_code == 200:
                    # Relative url() references in a stylesheet resolve
                    # against the stylesheet's own URL, not the page URL.
                    image_urls.update(
                        extract_css_background_images(css_response.text, css_absolute_url)
                    )
            except (requests.RequestException, ValueError):
                continue  # best effort: skip a stylesheet that can't be loaded

        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
        for meta in soup.find_all('meta'):
            for attr in ('content', 'value'):
                value = meta.get(attr, '')
                if value and any(ext in value.lower() for ext in image_exts):
                    if value.startswith(('http://', 'https://')):
                        image_urls.add(value)
                    else:
                        image_urls.add(urljoin(url, value))

        # 8. <image> elements embedded in SVG
        for svg in soup.find_all('svg'):
            for img in svg.find_all('image'):
                href = img.get('href') or img.get('xlink:href')
                if href and not href.startswith('data:'):
                    image_urls.add(urljoin(url, href))

        # 9. Shopify specific selectors
        shopify_selectors = (
            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
            '.card__media img', '.product__media img', '[data-shopify]'
        )
        for selector in shopify_selectors:
            try:
                for element in soup.select(selector):
                    for attr in ('src', 'data-src', 'data-bgset', 'data-widths', 'srcset'):
                        value = element.get(attr)
                        if not value or value.startswith('data:'):
                            continue
                        if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
                            # Multi-candidate formats: pull out each URL.
                            for found_url in srcset_url_re.findall(value):
                                image_urls.add(urljoin(url, found_url))
                        else:
                            image_urls.add(urljoin(url, value))
            except Exception:
                continue  # best effort: a failing selector must not abort the crawl

        # 10. JSON-LD structured data
        for script in soup.find_all('script', {'type': 'application/ld+json'}):
            try:
                data = json.loads(script.string)
            except (TypeError, ValueError):
                continue  # empty or malformed JSON-LD block
            # Re-serialising normalises escapes before the regex scan.
            image_urls.update(json_image_re.findall(json.dumps(data)))

        # Drop obviously invalid entries; sort for a stable order.
        valid_image_urls = [
            img_url for img_url in image_urls
            if img_url and len(img_url) > 10 and not img_url.startswith('data:')
            and '.' in img_url and ('http' in img_url or img_url.startswith('//'))
        ]
        return sorted(valid_image_urls)
    except requests.RequestException as e:
        st.error(f"Error fetching the webpage: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Error parsing the webpage: {str(e)}")
        return []
def download_image(url, session):
    """Download one image and derive an archive-friendly filename for it.

    The filename is ``<stem>_<hash>.<ext>``: the stem is the first 20
    characters of the URL path's last segment before its first dot (or
    ``image`` when that segment has no dot), and the hash is the first 8 hex
    digits of the URL's MD5. The extension comes from the URL *path* when it
    is a known image extension, otherwise from the Content-Type header,
    otherwise ``.jpg``.

    Args:
        url: Absolute URL of the image.
        session: A ``requests.Session``-like object used for the GET.

    Returns:
        tuple: ``(content, filename, None)`` on success, or
        ``(None, None, error_message)`` when the response is not an image,
        is smaller than 500 bytes, or the request fails.
    """
    known_exts = ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico')
    # Checked in order against the Content-Type header.
    content_type_exts = (
        ('jpeg', '.jpg'),
        ('png', '.png'),
        ('gif', '.gif'),
        ('webp', '.webp'),
        ('svg', '.svg'),
    )
    try:
        response = session.get(url, timeout=15, stream=True)
        try:
            response.raise_for_status()
            # Check that the response contains image data
            content_type = response.headers.get('content-type', '').lower()
            if not any(kind in content_type for kind in ('image/', 'application/octet-stream')):
                return None, None, f"Not an image: {content_type}"
            image_content = response.content
        finally:
            # With stream=True the connection stays checked out until the
            # body is consumed or the response is closed; always release it.
            response.close()

        # Skip very small files (likely 1x1 tracking pixels)
        if len(image_content) < 500:
            return None, None, "Image too small (likely tracking pixel)"

        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

        # Work from the path only so query strings and fragments cannot be
        # mistaken for part of the name or extension.
        last_segment = urlparse(url).path.split('/')[-1]
        if '.' in last_segment:
            stem = last_segment.split('.')[0][:20]  # limit length
            filename = f"{stem}_{url_hash}"
            url_ext = last_segment.rsplit('.', 1)[-1].lower()
        else:
            filename = f"image_{url_hash}"
            url_ext = ''

        if url_ext in known_exts:
            filename += f".{url_ext}"
        else:
            # Unknown or missing URL extension: fall back to the content
            # type, then to .jpg.
            filename += next(
                (ext for token, ext in content_type_exts if token in content_type),
                '.jpg',
            )
        return image_content, filename, None
    except requests.RequestException as e:
        return None, None, f"Download error: {str(e)}"
    except Exception as e:
        return None, None, f"Unexpected error: {str(e)}"
def create_zip_file(images_data):
    """Pack ``(filename, content)`` pairs into an in-memory ZIP archive.

    Args:
        images_data: Iterable of ``(filename, image_content)`` pairs.

    Returns:
        BytesIO: Deflate-compressed archive, rewound to position 0.
    """
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for entry in images_data:
            member_name, payload = entry
            archive.writestr(member_name, payload)
    # Rewind so callers can read the archive from the start.
    buffer.seek(0)
    return buffer
def main():
    """Render the Streamlit UI for the image crawler.

    Reads a URL from the user and, when the crawl button is pressed,
    validates it, collects image URLs with ``get_image_urls``, downloads each
    one with ``download_image``, and offers the results as a ZIP file with a
    preview of up to eight images. Usage instructions and a disclaimer are
    rendered below, except when the run returns early because the URL is
    invalid or no images were found.
    """
    st.set_page_config(
        page_title="Website Image Crawler",
        page_icon="🖼️",
        layout="wide"
    )
    st.title("🖼️ Website Image Crawler")
    st.markdown("Enter a website URL to extract and download all images from that page.")

    # URL input
    url = st.text_input("Enter Website URL:", placeholder="https://example.com")
    col1, col2 = st.columns([1, 4])
    with col1:
        crawl_button = st.button("🔍 Crawl Images", type="primary")

    if crawl_button and url:
        if not is_valid_url(url):
            st.error("Please enter a valid URL (including http:// or https://)")
            return

        with st.spinner("Crawling website for images..."):
            image_urls = get_image_urls(url)

        if not image_urls:
            st.warning("No images found on the provided webpage.")
            return

        st.success(f"Found {len(image_urls)} images on the webpage!")

        # Show found URLs in an expander
        with st.expander(f"Found Image URLs ({len(image_urls)})"):
            for i, img_url in enumerate(image_urls, 1):
                st.text(f"{i}. {img_url}")

        # Download images
        st.subheader("Downloading Images...")
        progress_bar = st.progress(0)
        status_text = st.empty()
        downloaded_images = []
        failed_downloads = []

        # One session reuses connections; the context manager closes it even
        # if a download raises.
        with requests.Session() as session:
            # Accept-Encoding is left to requests' default so only encodings
            # the installed stack can decode are advertised.
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            })
            total = len(image_urls)
            for i, img_url in enumerate(image_urls):
                status_text.text(f"Downloading image {i+1}/{total}: {img_url[:50]}...")
                image_content, filename, error = download_image(img_url, session)
                if image_content and filename:
                    downloaded_images.append((filename, image_content))
                else:
                    failed_downloads.append((img_url, error))
                progress_bar.progress((i + 1) / total)
                time.sleep(0.1)  # small delay to avoid overwhelming the server

        # Show results
        st.subheader("Download Results")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("✅ Successfully Downloaded", len(downloaded_images))
        with col2:
            st.metric("❌ Failed Downloads", len(failed_downloads))

        # Show failed downloads
        if failed_downloads:
            with st.expander("Failed Downloads"):
                for img_url, error in failed_downloads:
                    st.text(f"❌ {img_url}")
                    st.text(f" Error: {error}")
                    st.text("")

        # Create download button for ZIP file
        if downloaded_images:
            st.subheader("Download All Images")
            zip_buffer = create_zip_file(downloaded_images)
            st.download_button(
                label=f"📥 Download ZIP file ({len(downloaded_images)} images)",
                data=zip_buffer.getvalue(),
                file_name=f"images_{urlparse(url).netloc}_{int(time.time())}.zip",
                mime="application/zip",
                type="primary"
            )

            # Show preview of the first few images
            st.subheader("Image Preview")
            preview_cols = st.columns(4)
            preview_count = min(8, len(downloaded_images))
            for i, (filename, image_content) in enumerate(downloaded_images[:preview_count]):
                try:
                    image = Image.open(BytesIO(image_content))
                    with preview_cols[i % 4]:
                        st.image(image, caption=filename, use_container_width=True)
                except Exception:
                    # If the image can't be displayed, show its filename only.
                    with preview_cols[i % 4]:
                        st.text(f"📄 {filename}")
            if len(downloaded_images) > preview_count:
                st.text(f"... and {len(downloaded_images) - preview_count} more images")
    elif crawl_button and not url:
        st.error("Please enter a URL to crawl.")

    # Instructions
    st.markdown("---")
    st.subheader("How to use:")
    st.markdown("""
    1. Enter a valid website URL (must include http:// or https://)
    2. Click the "Crawl Images" button
    3. Wait for the application to find and download all images
    4. Download the ZIP file containing all images

    **Note:** This enhanced crawler finds images from:

    - Standard `<img>` tags with various lazy-loading attributes
    - CSS background images (inline styles and external stylesheets)
    - Shopify banners and product images
    - Meta tags (Open Graph, Twitter Cards)
    - JSON-LD structured data
    - SVG embedded images
    - Container elements with background images

    It does not crawl subpages or follow links - only the main page content.
    """)
    st.markdown("---")
    st.markdown("⚠️ **Disclaimer:** Please respect website terms of service and copyright laws when downloading images.")
# Run the app only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) has no side effects.
# NOTE(review): relies on Streamlit executing the script with
# __name__ == "__main__" — confirm against the deployed Streamlit version.
if __name__ == "__main__":
    main()