# img.scrapper / src/streamlit_app.py
# (file-view header: uploader avatar "maaz21's picture")
# Last commit: "Update src/streamlit_app.py"
# Revision: ccffe7a (verified)
import hashlib
import json
import os
import re
import time
import zipfile
from io import BytesIO
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup
from PIL import Image
def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL with a host.

    Args:
        url: The candidate URL, normally the text typed by the user.

    Returns:
        bool: True only if the value is a string that parses with an
        ``http`` or ``https`` scheme and a non-empty network location.
    """
    if not isinstance(url, str):
        return False
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed hosts, e.g. an
        # unterminated IPv6 literal such as "http://[::1".
        return False
    # Only http/https can be fetched by the crawler, so other schemes
    # (ftp:, javascript:, file:, ...) are rejected up front.
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
# A `background` / `background-image` declaration and its value.  The value
# runs to the next `;`, `{` or `}` that is not inside quotes or parentheses,
# so a `;` inside `url(data:...;base64,...)` does not cut the value short.
_CSS_BACKGROUND_DECL_RE = re.compile(
    r'background(?:-image)?\s*:((?:[^;{}()"\']+|"[^"]*"|\'[^\']*\'|\([^)]*\))*)',
    re.IGNORECASE,
)
# A single url(...) token, optionally quoted.
_CSS_URL_TOKEN_RE = re.compile(
    r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
    re.IGNORECASE,
)


def extract_css_background_images(css_content, base_url):
    """Extract background image URLs from CSS content.

    Every ``url(...)`` token found in the value of a ``background`` or
    ``background-image`` declaration is returned, including shorthand
    values such as ``background: #fff url(a.png) no-repeat`` and
    multi-layer values such as ``url(a.png), url(b.png)``.

    Args:
        css_content: CSS source text (may be empty or None).
        base_url: URL that relative references are resolved against.

    Returns:
        list[str]: Absolute URLs in document order. ``data:`` URIs are
        skipped; duplicates are kept.
    """
    if not css_content:
        return []

    image_urls = []
    for declaration in _CSS_BACKGROUND_DECL_RE.finditer(css_content):
        for reference in _CSS_URL_TOKEN_RE.findall(declaration.group(1)):
            if not reference or reference.lower().startswith('data:'):
                continue
            try:
                image_urls.append(urljoin(base_url, reference))
            except ValueError:
                # Malformed reference (e.g. broken IPv6 host); skip it.
                continue
    return image_urls
# Attributes on <img> that may carry an image URL (lazy-loading variants included).
_IMG_URL_ATTRS = (
    'src', 'srcset', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
    'data-image', 'data-bg', 'data-background', 'data-large-image',
)
# Attributes on <picture>/<source> that may carry an image URL.
_SOURCE_URL_ATTRS = ('src', 'srcset', 'data-src', 'data-srcset')
# Data attributes on arbitrary elements that might contain image URLs.
_GENERIC_DATA_ATTRS = (
    'data-background-image', 'data-bg-src', 'data-hero-image',
    'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
    'data-full-size', 'data-zoom-image', 'data-lightbox',
)
_SHOPIFY_SELECTORS = (
    '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
    '.card__media img', '.product__media img', '[data-shopify]',
)
_SHOPIFY_ATTRS = ('src', 'data-src', 'data-bgset', 'data-widths', 'srcset')
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')
# <meta property/name> keys whose content is an image URL even without an extension.
_META_IMAGE_KEYS = frozenset({
    'og:image', 'og:image:url', 'og:image:secure_url',
    'twitter:image', 'twitter:image:src',
})

# url(...) token inside an inline style attribute.
_INLINE_STYLE_URL_RE = re.compile(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)
# Absolute URLs anywhere, plus protocol-relative URLs at the start of a token.
_EMBEDDED_URL_RE = re.compile(r'https?://[^\s,]+|(?<![^\s,|])//[^\s,]+')
# Absolute image URLs inside serialized JSON-LD.
_JSON_IMAGE_URL_RE = re.compile(
    r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', re.IGNORECASE
)
# Leading separators followed by one srcset URL (a run of non-whitespace).
_SRCSET_URL_RE = re.compile(r'[\s,]*(\S+)')


def _parse_srcset(value):
    """Return the URLs of a srcset-style value, keeping commas inside URLs intact."""
    urls = []
    pos = 0
    while True:
        match = _SRCSET_URL_RE.match(value, pos)
        if match is None:
            break
        candidate = match.group(1)
        pos = match.end()
        if candidate.endswith(','):
            # A trailing comma ends the candidate; it has no descriptors.
            candidate = candidate.rstrip(',')
        else:
            # Skip the descriptors ("2x", "640w") up to the next comma.
            comma = value.find(',', pos)
            pos = len(value) if comma == -1 else comma + 1
        if candidate:
            urls.append(candidate)
    return urls


def _add_url(found, base_url, candidate):
    """Resolve *candidate* against *base_url* and add it to *found*, skipping data: URIs."""
    if not isinstance(candidate, str):
        return
    candidate = candidate.strip()
    if not candidate or candidate.lower().startswith('data:'):
        return
    try:
        found.add(urljoin(base_url, candidate))
    except ValueError:
        # Malformed reference (e.g. broken IPv6 host); ignore it.
        pass


def _add_attr_urls(found, base_url, tag, attrs):
    """Collect URLs from the given attributes of *tag*; srcset attributes are parsed as lists."""
    for attr in attrs:
        value = tag.get(attr)
        if not value or not isinstance(value, str):
            continue
        if 'srcset' in attr:
            for candidate in _parse_srcset(value):
                _add_url(found, base_url, candidate)
        else:
            # Plain URL attributes are never split on commas: CDN URLs such
            # as ".../w_300,h_200/a.jpg" legitimately contain them.
            _add_url(found, base_url, value)


def _is_fetchable(candidate):
    """Return True when *candidate* is an absolute http(s) URL with a host."""
    try:
        parts = urlparse(candidate)
    except ValueError:
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)


def get_image_urls(url):
    """Extract all image URLs from the given webpage using comprehensive methods.

    The page is fetched once and scanned for images in <img>, <picture> and
    <source> tags, inline styles, common data attributes, <style> blocks,
    up to five external stylesheets, <meta> tags, SVG <image> elements,
    Shopify-specific markup and JSON-LD structured data.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        list[str]: Sorted, de-duplicated absolute http(s) URLs. An empty
        list is returned (and an error is shown with ``st.error``) when the
        page cannot be fetched or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Relative references resolve against the final (post-redirect) URL,
        # overridden by <base href> when the document declares one.
        base_url = response.url or url
        base_tag = soup.find('base', href=True)
        if base_tag:
            try:
                base_url = urljoin(base_url, base_tag['href'])
            except ValueError:
                pass  # Keep the response URL as the base.

        image_urls = set()  # Use set to avoid duplicates

        # 1. Standard img tags with multiple attribute checks
        for img in soup.find_all('img'):
            _add_attr_urls(image_urls, base_url, img, _IMG_URL_ATTRS)

        # 2. Picture and source tags
        for tag in soup.find_all(['picture', 'source']):
            _add_attr_urls(image_urls, base_url, tag, _SOURCE_URL_ATTRS)

        # 3. Elements with background images in their style attribute
        for element in soup.find_all(attrs={'style': True}):
            style = element.get('style', '')
            if 'background' in style.lower():
                for reference in _INLINE_STYLE_URL_RE.findall(style):
                    _add_url(image_urls, base_url, reference)

        # 4. Elements with data attributes that might contain image URLs
        for attr in _GENERIC_DATA_ATTRS:
            for element in soup.find_all(attrs={attr: True}):
                _add_url(image_urls, base_url, element.get(attr))

        # 5. CSS background images from <style> tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                image_urls.update(
                    extract_css_background_images(style_tag.string, base_url)
                )

        # 6. External CSS files (first 5 only, to avoid overload)
        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
            href = link.get('href')
            if not href:
                continue
            try:
                css_response = requests.get(
                    urljoin(base_url, href), headers=headers, timeout=10
                )
            except (requests.RequestException, ValueError):
                continue  # Skip if CSS file can't be loaded
            if css_response.status_code == 200:
                # url() references inside a stylesheet are relative to the
                # stylesheet itself, not to the page that links it.
                image_urls.update(
                    extract_css_background_images(
                        css_response.text, css_response.url or base_url
                    )
                )

        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
        for meta in soup.find_all('meta'):
            key = meta.get('property') or meta.get('name') or ''
            is_image_key = isinstance(key, str) and key.lower() in _META_IMAGE_KEYS
            for attr in ('content', 'value'):
                value = meta.get(attr, '')
                if not value or not isinstance(value, str):
                    continue
                lowered = value.lower()
                if is_image_key or any(ext in lowered for ext in _IMAGE_EXTENSIONS):
                    _add_url(image_urls, base_url, value)

        # 8. SVG images: look for embedded <image> elements
        for svg in soup.find_all('svg'):
            for image in svg.find_all('image'):
                _add_url(
                    image_urls, base_url,
                    image.get('href') or image.get('xlink:href'),
                )

        # 9. Shopify specific selectors
        for selector in _SHOPIFY_SELECTORS:
            try:
                elements = soup.select(selector)
            except Exception:
                # Best effort: selector support depends on the installed
                # CSS selector backend; skip what it cannot evaluate.
                continue
            for element in elements:
                for attr in _SHOPIFY_ATTRS:
                    value = element.get(attr)
                    if not value or not isinstance(value, str):
                        continue
                    if value.startswith('data:'):
                        continue
                    if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
                        # Complex formats: pull out every embedded URL,
                        # including protocol-relative "//cdn..." ones.
                        for embedded in _EMBEDDED_URL_RE.findall(value):
                            _add_url(image_urls, base_url, embedded)
                    else:
                        _add_url(image_urls, base_url, value)

        # 10. JSON-LD structured data
        for script in soup.find_all('script', {'type': 'application/ld+json'}):
            try:
                data = json.loads(script.string or '')
            except (TypeError, ValueError):
                continue
            # Re-serializing normalizes escapes such as "\/" before matching.
            image_urls.update(_JSON_IMAGE_URL_RE.findall(json.dumps(data)))

        # Keep only fetchable URLs; sort so the result order is stable.
        return sorted(
            candidate for candidate in image_urls if _is_fetchable(candidate)
        )
    except requests.RequestException as e:
        st.error(f"Error fetching the webpage: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Error parsing the webpage: {str(e)}")
        return []
# File extensions that are trusted when they appear in the URL path.
_KNOWN_IMAGE_EXTENSIONS = frozenset(
    {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico'}
)
# (content-type marker, extension) pairs, checked in order.
_CONTENT_TYPE_EXTENSIONS = (
    ('jpeg', '.jpg'),
    ('png', '.png'),
    ('gif', '.gif'),
    ('webp', '.webp'),
    ('svg', '.svg'),
    ('bmp', '.bmp'),
    ('icon', '.ico'),
)
# Anything that is not a word character or hyphen is replaced in file names.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[^\w-]+')


def download_image(url, session):
    """Download a single image with better error handling.

    Args:
        url: Absolute URL of the image.
        session: A ``requests.Session``-like object used for the request.

    Returns:
        tuple: ``(content, filename, None)`` on success, or
        ``(None, None, error_message)`` when the download fails, the
        response is not an image, or the body is smaller than 500 bytes.
    """
    try:
        # The context manager releases the streamed connection even when we
        # return early without reading the body.
        with session.get(url, timeout=15, stream=True) as response:
            response.raise_for_status()

            # Check if the response contains image data
            content_type = response.headers.get('content-type', '').lower()
            if not any(
                marker in content_type
                for marker in ('image/', 'application/octet-stream')
            ):
                return None, None, f"Not an image: {content_type}"

            image_content = response.content

        # Skip very small files (likely 1x1 tracking pixels)
        if len(image_content) < 500:
            return None, None, "Image too small (likely tracking pixel)"

        # Name and extension come from the URL *path* only, so query strings
        # containing "/" or "." cannot confuse them.
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
        last_segment = urlparse(url).path.rsplit('/', 1)[-1]

        stem = ''
        if '.' in last_segment:
            stem = _UNSAFE_FILENAME_CHARS_RE.sub('_', last_segment.split('.')[0])
            stem = stem[:20].strip('_')  # Limit length
        filename = f"{stem or 'image'}_{url_hash}"

        extension = os.path.splitext(last_segment)[1].lstrip('.').lower()
        if extension in _KNOWN_IMAGE_EXTENSIONS:
            filename += f".{extension}"
        else:
            # Unknown or missing extension: derive it from the content type,
            # falling back to .jpg so the file always has an extension.
            filename += next(
                (
                    suffix
                    for marker, suffix in _CONTENT_TYPE_EXTENSIONS
                    if marker in content_type
                ),
                '.jpg',
            )

        return image_content, filename, None
    except requests.RequestException as e:
        return None, None, f"Download error: {str(e)}"
    except Exception as e:
        return None, None, f"Unexpected error: {str(e)}"
def create_zip_file(images_data):
    """Pack the downloaded images into an in-memory ZIP archive.

    Args:
        images_data: Iterable of ``(filename, image_content)`` pairs.

    Returns:
        BytesIO: Buffer holding the deflate-compressed archive, rewound to
        position 0 so it can be read from the start.
    """
    archive_buffer = BytesIO()
    archive = zipfile.ZipFile(
        archive_buffer, mode='w', compression=zipfile.ZIP_DEFLATED
    )
    try:
        for entry in images_data:
            entry_name, payload = entry
            archive.writestr(entry_name, payload)
    finally:
        # Closing writes the central directory into the buffer.
        archive.close()
    archive_buffer.seek(0)
    return archive_buffer
def _download_all(image_urls):
    """Download every URL with a progress bar; return (downloaded, failed) lists."""
    progress_bar = st.progress(0)
    status_text = st.empty()
    downloaded_images = []
    failed_downloads = []
    total = len(image_urls)

    # One session reuses connections; the context manager closes it even if
    # a download raises unexpectedly.
    with requests.Session() as session:
        # Accept-Encoding is deliberately left at the library default: it
        # only advertises encodings that can actually be decoded here
        # ("br" needs an optional brotli package).
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        for index, img_url in enumerate(image_urls, start=1):
            status_text.text(f"Downloading image {index}/{total}: {img_url[:50]}...")
            image_content, filename, error = download_image(img_url, session)
            if image_content and filename:
                downloaded_images.append((filename, image_content))
            else:
                failed_downloads.append((img_url, error))
            progress_bar.progress(index / total)
            time.sleep(0.1)  # Small delay to avoid overwhelming the server

    return downloaded_images, failed_downloads


def _show_results(url, downloaded_images, failed_downloads):
    """Render download metrics, the failure list, the ZIP button and a preview grid."""
    st.subheader("Download Results")
    ok_col, failed_col = st.columns(2)
    with ok_col:
        st.metric("✅ Successfully Downloaded", len(downloaded_images))
    with failed_col:
        st.metric("❌ Failed Downloads", len(failed_downloads))

    if failed_downloads:
        with st.expander("Failed Downloads"):
            for img_url, error in failed_downloads:
                st.text(f"❌ {img_url}")
                st.text(f" Error: {error}")
                st.text("")

    if not downloaded_images:
        return

    st.subheader("Download All Images")
    zip_buffer = create_zip_file(downloaded_images)
    # A port in the host ("example.com:8080") would put ":" in the file name.
    host = urlparse(url).netloc.replace(':', '_')
    st.download_button(
        label=f"📥 Download ZIP file ({len(downloaded_images)} images)",
        data=zip_buffer.getvalue(),
        file_name=f"images_{host}_{int(time.time())}.zip",
        mime="application/zip",
        type="primary"
    )

    # Show preview of first few images
    st.subheader("Image Preview")
    preview_cols = st.columns(4)
    preview_count = min(8, len(downloaded_images))
    for index, (filename, image_content) in enumerate(downloaded_images[:preview_count]):
        column = preview_cols[index % 4]
        try:
            image = Image.open(BytesIO(image_content))
            with column:
                st.image(image, caption=filename, use_container_width=True)
        except Exception:
            # Formats that cannot be decoded or rendered: show the filename only.
            with column:
                st.text(f"📄 {filename}")
    if len(downloaded_images) > preview_count:
        st.text(f"... and {len(downloaded_images) - preview_count} more images")


def _crawl_and_report(url):
    """Validate *url*, crawl it for images, download them and render the results."""
    if not is_valid_url(url):
        st.error("Please enter a valid URL (including http:// or https://)")
        return

    with st.spinner("Crawling website for images..."):
        image_urls = get_image_urls(url)
    if not image_urls:
        st.warning("No images found on the provided webpage.")
        return

    st.success(f"Found {len(image_urls)} images on the webpage!")
    with st.expander(f"Found Image URLs ({len(image_urls)})"):
        for position, img_url in enumerate(image_urls, 1):
            st.text(f"{position}. {img_url}")

    st.subheader("Downloading Images...")
    downloaded_images, failed_downloads = _download_all(image_urls)
    _show_results(url, downloaded_images, failed_downloads)


def _render_instructions():
    """Render the static usage notes and the disclaimer."""
    st.markdown("---")
    st.subheader("How to use:")
    st.markdown("""
1. Enter a valid website URL (must include http:// or https://)
2. Click the "Crawl Images" button
3. Wait for the application to find and download all images
4. Download the ZIP file containing all images

**Note:** This enhanced crawler finds images from:
- Standard `<img>` tags with various lazy-loading attributes
- CSS background images (inline styles and external stylesheets)
- Shopify banners and product images
- Meta tags (Open Graph, Twitter Cards)
- JSON-LD structured data
- SVG embedded images
- Container elements with background images

It does not crawl subpages or follow links - only the main page content.
""")
    st.markdown("---")
    st.markdown("⚠️ **Disclaimer:** Please respect website terms of service and copyright laws when downloading images.")


def main():
    """Render the Streamlit page: URL input, crawl/download flow and usage notes.

    The usage notes and disclaimer are rendered on every run, including
    when the crawl ends early because of an invalid URL or an empty result.
    """
    st.set_page_config(
        page_title="Website Image Crawler",
        page_icon="🖼️",
        layout="wide"
    )
    st.title("🖼️ Website Image Crawler")
    st.markdown("Enter a website URL to extract and download all images from that page.")

    # URL input; surrounding whitespace from copy/paste is ignored.
    url = (st.text_input("Enter Website URL:", placeholder="https://example.com") or "").strip()
    button_col, _ = st.columns([1, 4])
    with button_col:
        crawl_button = st.button("🔍 Crawl Images", type="primary")

    if crawl_button and url:
        _crawl_and_report(url)
    elif crawl_button:
        st.error("Please enter a URL to crawl.")

    _render_instructions()
# Run the app only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) does not start the UI.
if __name__ == "__main__":
    main()