# Website Image Crawler — Streamlit app that extracts and downloads all
# images found on a single web page and bundles them into a ZIP file.
import hashlib
import json
import os
import re
import time
import zipfile
from io import BytesIO
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup
from PIL import Image
def is_valid_url(url):
    """Return True if *url* parses as an absolute URL.

    An input is considered valid only when it has both a scheme
    (e.g. ``http``/``https``) and a network location (host), so
    relative paths, bare domains without a scheme, and empty
    strings all return False.
    """
    try:
        result = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed inputs
        # (e.g. unbalanced IPv6 brackets); treat those as invalid
        # instead of swallowing every exception with a bare except.
        return False
    return all([result.scheme, result.netloc])
def extract_css_background_images(css_content, base_url):
    """Extract background image URLs from CSS content.

    Args:
        css_content: Raw CSS text (inline ``<style>`` contents or a
            fetched stylesheet body).
        base_url: Page URL used to resolve relative image paths.

    Returns:
        list[str]: Absolute image URLs in order of appearance.
        ``data:`` URIs are skipped — they are inline payloads, not
        downloadable files.
    """
    # Matches both `background: url(...)` and `background-image: url(...)`,
    # with optional single/double quotes around the URL.
    bg_pattern = r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'
    return [
        urljoin(base_url, match)
        for match in re.findall(bg_pattern, css_content, re.IGNORECASE)
        if match and not match.startswith('data:')
    ]
def get_image_urls(url):
    """Extract all image URLs from the given webpage using comprehensive methods.

    Scans: <img> tags (including common lazy-loading data-* attributes),
    <picture>/<source> tags, inline style backgrounds, single-URL data-*
    attributes, <style> blocks, the first five external stylesheets,
    meta tags (Open Graph / Twitter Cards), inline SVG <image> elements,
    Shopify-specific selectors, and JSON-LD structured data.

    Args:
        url: Absolute URL of the page to crawl.

    Returns:
        list[str]: De-duplicated image URLs that pass basic validity
        filtering. Returns an empty list on any fetch/parse error, after
        surfacing the error via st.error.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        image_urls = set()  # set avoids duplicates across the many extraction passes

        def add_value(value, attr=''):
            # Resolve one raw attribute value against the page URL.
            # srcset-style values pack several comma-separated
            # "url width-descriptor" entries into one attribute.
            if not value or value.startswith('data:'):
                return
            if 'srcset' in attr.lower() or ',' in value:
                for part in value.split(','):
                    candidate = part.strip().split(' ')[0]
                    if candidate:
                        image_urls.add(urljoin(url, candidate))
            else:
                image_urls.add(urljoin(url, value))

        # 1. Standard <img> tags, checking common lazy-loading attributes
        lazy_attrs = ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
                      'data-image', 'data-bg', 'data-background', 'data-large-image']
        for img in soup.find_all('img'):
            for attr in lazy_attrs:
                add_value(img.get(attr), attr)

        # 2. <picture> and <source> tags
        for tag in soup.find_all(['picture', 'source']):
            for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
                add_value(tag.get(attr), attr)

        # 3. Inline style attributes containing background images
        for element in soup.find_all(attrs={'style': True}):
            style = element.get('style', '')
            if 'background' in style.lower() and 'url(' in style:
                for match in re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
                                        style, re.IGNORECASE):
                    if match and not match.startswith('data:'):
                        image_urls.add(urljoin(url, match))

        # 4. data-* attributes that typically carry one plain image URL
        data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
                      'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
                      'data-full-size', 'data-zoom-image', 'data-lightbox']
        for attr in data_attrs:
            for element in soup.find_all(attrs={attr: True}):
                value = element.get(attr)
                if value and not value.startswith('data:'):
                    image_urls.add(urljoin(url, value))

        # 5. CSS background images inside <style> blocks
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                image_urls.update(extract_css_background_images(style_tag.string, url))

        # 6. External CSS files (first 5 only, to bound request volume)
        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
            css_url = link.get('href')
            if not css_url:
                continue
            try:
                css_response = requests.get(urljoin(url, css_url),
                                            headers=headers, timeout=10)
                if css_response.status_code == 200:
                    image_urls.update(
                        extract_css_background_images(css_response.text, url))
            except requests.RequestException:
                continue  # best-effort: skip stylesheets that fail to load

        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
        image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')
        for meta in soup.find_all('meta'):
            for attr in ['content', 'value']:
                value = meta.get(attr, '')
                if value and any(ext in value.lower() for ext in image_exts):
                    image_urls.add(value if 'http' in value else urljoin(url, value))

        # 8. Images embedded inside inline SVG
        for svg in soup.find_all('svg'):
            for img in svg.find_all('image'):
                href = img.get('href') or img.get('xlink:href')
                if href and not href.startswith('data:'):
                    image_urls.add(urljoin(url, href))

        # 9. Shopify-specific selectors
        shopify_selectors = [
            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
            '.card__media img', '.product__media img', '[data-shopify]'
        ]
        for selector in shopify_selectors:
            try:
                for element in soup.select(selector):
                    for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
                        value = element.get(attr)
                        if not value or value.startswith('data:'):
                            continue
                        if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
                            # These attributes pack several URLs with size hints
                            image_urls.update(re.findall(r'https?://[^\s,]+', value))
                        else:
                            image_urls.add(urljoin(url, value))
            except Exception:
                continue  # invalid selector or malformed markup: skip

        # 10. JSON-LD structured data
        for script in soup.find_all('script', {'type': 'application/ld+json'}):
            try:
                # Round-trip through json to normalize, then scan for image URLs
                json_str = json.dumps(json.loads(script.string))
                image_urls.update(re.findall(
                    r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)',
                    json_str, re.IGNORECASE))
            except Exception:
                continue  # ignore malformed JSON-LD blocks

        # Keep only values that plausibly look like real image URLs
        return [
            img_url for img_url in image_urls
            if img_url and len(img_url) > 10 and not img_url.startswith('data:')
            and '.' in img_url and ('http' in img_url or img_url.startswith('//'))
        ]
    except requests.RequestException as e:
        st.error(f"Error fetching the webpage: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Error parsing the webpage: {str(e)}")
        return []
def download_image(url, session):
"""Download a single image with better error handling"""
try:
response = session.get(url, timeout=15, stream=True)
response.raise_for_status()
# Check if the response contains image data
content_type = response.headers.get('content-type', '').lower()
if not any(img_type in content_type for img_type in ['image/', 'application/octet-stream']):
return None, None, f"Not an image: {content_type}"
# Get image content
image_content = response.content
# Skip very small files (likely 1x1 tracking pixels)
if len(image_content) < 500:
return None, None, "Image too small (likely tracking pixel)"
# Generate filename
url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
# Extract filename from URL if possible
url_path = urlparse(url).path
if url_path and '.' in url_path.split('/')[-1]:
original_name = url_path.split('/')[-1].split('.')[0][:20] # Limit length
filename = f"{original_name}_{url_hash}"
else:
filename = f"image_{url_hash}"
# Try to get file extension from URL or content-type
if '.' in url.split('/')[-1] and '?' not in url.split('/')[-1].split('.')[-1]:
ext = url.split('/')[-1].split('.')[-1].split('?')[0].lower()
if ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico']:
filename += f".{ext}"
elif 'jpeg' in content_type:
filename += ".jpg"
elif 'png' in content_type:
filename += ".png"
elif 'gif' in content_type:
filename += ".gif"
elif 'webp' in content_type:
filename += ".webp"
elif 'svg' in content_type:
filename += ".svg"
else:
filename += ".jpg" # Default extension
return image_content, filename, None
except requests.RequestException as e:
return None, None, f"Download error: {str(e)}"
except Exception as e:
return None, None, f"Unexpected error: {str(e)}"
def create_zip_file(images_data):
    """Bundle downloaded images into an in-memory ZIP archive.

    Args:
        images_data: Iterable of (filename, image_bytes) pairs.

    Returns:
        BytesIO: Deflate-compressed ZIP archive, rewound to position 0
        so it can be handed straight to st.download_button.
    """
    archive_buffer = BytesIO()
    with zipfile.ZipFile(archive_buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        for name, payload in images_data:
            archive.writestr(name, payload)
    archive_buffer.seek(0)
    return archive_buffer
def main():
    """Render the Streamlit UI: accept a URL, crawl it for images,
    download them, and offer the results as a ZIP archive."""
    st.set_page_config(
        page_title="Website Image Crawler",
        page_icon="🖼️",
        layout="wide"
    )
    st.title("🖼️ Website Image Crawler")
    st.markdown("Enter a website URL to extract and download all images from that page.")

    # URL input
    url = st.text_input("Enter Website URL:", placeholder="https://example.com")

    col1, col2 = st.columns([1, 4])
    with col1:
        crawl_button = st.button("🔍 Crawl Images", type="primary")

    if crawl_button and url:
        if not is_valid_url(url):
            st.error("Please enter a valid URL (including http:// or https://)")
            return

        with st.spinner("Crawling website for images..."):
            image_urls = get_image_urls(url)

        if not image_urls:
            st.warning("No images found on the provided webpage.")
            return

        st.success(f"Found {len(image_urls)} images on the webpage!")
        # Show found URLs in an expander
        with st.expander(f"Found Image URLs ({len(image_urls)})"):
            for i, img_url in enumerate(image_urls, 1):
                st.text(f"{i}. {img_url}")

        # Download images
        st.subheader("Downloading Images...")
        progress_bar = st.progress(0)
        status_text = st.empty()
        downloaded_images = []
        failed_downloads = []

        # Create a session with browser-like headers for efficient downloading
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        for i, img_url in enumerate(image_urls):
            status_text.text(f"Downloading image {i+1}/{len(image_urls)}: {img_url[:50]}...")
            image_content, filename, error = download_image(img_url, session)
            if image_content and filename:
                downloaded_images.append((filename, image_content))
            else:
                failed_downloads.append((img_url, error))
            progress_bar.progress((i + 1) / len(image_urls))
            time.sleep(0.1)  # Small delay to avoid overwhelming the server
        session.close()

        # Show results
        st.subheader("Download Results")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("✅ Successfully Downloaded", len(downloaded_images))
        with col2:
            st.metric("❌ Failed Downloads", len(failed_downloads))

        # Show failed downloads
        if failed_downloads:
            with st.expander("Failed Downloads"):
                for img_url, error in failed_downloads:
                    st.text(f"❌ {img_url}")
                    st.text(f"   Error: {error}")
                    st.text("")

        # Create download button for ZIP file
        if downloaded_images:
            st.subheader("Download All Images")
            zip_buffer = create_zip_file(downloaded_images)
            st.download_button(
                label=f"📥 Download ZIP file ({len(downloaded_images)} images)",
                data=zip_buffer.getvalue(),
                file_name=f"images_{urlparse(url).netloc}_{int(time.time())}.zip",
                mime="application/zip",
                type="primary"
            )

            # Show preview of first few images
            st.subheader("Image Preview")
            preview_cols = st.columns(4)
            preview_count = min(8, len(downloaded_images))
            for i in range(preview_count):
                filename, image_content = downloaded_images[i]
                try:
                    image = Image.open(BytesIO(image_content))
                    with preview_cols[i % 4]:
                        st.image(image, caption=filename, use_container_width=True)
                except Exception:
                    # Non-renderable formats (e.g. SVG): show the filename only
                    with preview_cols[i % 4]:
                        st.text(f"📄 {filename}")
            if len(downloaded_images) > preview_count:
                st.text(f"... and {len(downloaded_images) - preview_count} more images")

    elif crawl_button and not url:
        st.error("Please enter a URL to crawl.")

    # Instructions
    st.markdown("---")
    st.subheader("How to use:")
    st.markdown("""
    1. Enter a valid website URL (must include http:// or https://)
    2. Click the "Crawl Images" button
    3. Wait for the application to find and download all images
    4. Download the ZIP file containing all images

    **Note:** This enhanced crawler finds images from:
    - Standard `<img>` tags with various lazy-loading attributes
    - CSS background images (inline styles and external stylesheets)
    - Shopify banners and product images
    - Meta tags (Open Graph, Twitter Cards)
    - JSON-LD structured data
    - SVG embedded images
    - Container elements with background images

    It does not crawl subpages or follow links - only the main page content.
    """)
    st.markdown("---")
    st.markdown("⚠️ **Disclaimer:** Please respect website terms of service and copyright laws when downloading images.")
# Run the app only when executed as a script, not when imported
if __name__ == "__main__":
    main()