File size: 17,692 Bytes
352665d
87a5d61
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
352665d
cc10ac9
ccffe7a
cc10ac9
 
ccffe7a
 
 
cc10ac9
ccffe7a
 
 
 
 
cc10ac9
 
 
87a5d61
ccffe7a
87a5d61
 
ccffe7a
87a5d61
 
ccffe7a
87a5d61
 
 
ccffe7a
87a5d61
ccffe7a
87a5d61
 
ccffe7a
 
 
cc10ac9
 
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
ccffe7a
 
87a5d61
ccffe7a
 
 
cc10ac9
 
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
ccffe7a
cc10ac9
 
 
 
 
 
ccffe7a
cc10ac9
ccffe7a
cc10ac9
 
 
 
 
 
 
 
 
ccffe7a
cc10ac9
ccffe7a
cc10ac9
 
ccffe7a
 
 
 
 
cc10ac9
ccffe7a
 
cc10ac9
 
 
 
ccffe7a
cc10ac9
 
 
 
ccffe7a
 
 
 
 
 
 
 
cc10ac9
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
ccffe7a
 
 
 
cc10ac9
 
87a5d61
 
 
 
 
 
 
352665d
87a5d61
cc10ac9
87a5d61
cc10ac9
87a5d61
 
 
cc10ac9
 
87a5d61
 
 
 
 
cc10ac9
 
 
 
87a5d61
 
cc10ac9
 
 
 
 
 
 
 
87a5d61
 
cc10ac9
 
 
87a5d61
 
 
 
 
 
 
 
 
cc10ac9
 
87a5d61
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
 
 
 
 
 
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb2a5ba
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
 
 
 
 
 
 
 
 
87a5d61
 
 
 
352665d
87a5d61
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
import streamlit as st
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import zipfile
from io import BytesIO
import time
from PIL import Image
import hashlib

def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Accepts e.g. "https://example.com"; rejects bare hostnames like
    "example.com" (empty scheme) and empty strings.
    """
    try:
        result = urlparse(url)
        # Both parts are required: "example.com" alone parses with an
        # empty scheme and would otherwise look valid.
        return all([result.scheme, result.netloc])
    except (ValueError, AttributeError, TypeError):
        # Narrowed from a bare except: these are the errors urlparse can
        # raise on malformed or non-string input.
        return False

def extract_css_background_images(css_content, base_url):
    """Pull background-image URLs out of raw CSS text.

    Scans for ``background`` / ``background-image`` declarations whose value
    is a ``url(...)``, resolves each reference against *base_url*, and
    returns the absolute URLs in the order found.  Inline ``data:`` URIs
    are skipped.
    """
    import re

    # Matches `background: url(...)` and `background-image: url(...)`,
    # tolerating optional quotes and whitespace inside the parentheses.
    declaration = re.compile(
        r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
        re.IGNORECASE,
    )

    return [
        urljoin(base_url, raw_url)
        for raw_url in declaration.findall(css_content)
        if raw_url and not raw_url.startswith('data:')
    ]

def _collect_attr_urls(image_urls, base_url, attr, value):
    """Resolve a src/srcset-style attribute value into absolute URL(s) and
    add them to *image_urls*.

    srcset-style values hold comma-separated "url descriptor" pairs; plain
    values hold a single URL.  Callers are expected to have filtered out
    ``data:`` URIs already.
    """
    if 'srcset' in attr.lower() or ',' in value:
        for url_part in value.split(','):
            # Each entry may carry a width/density descriptor after a space.
            clean_url = url_part.strip().split(' ')[0]
            if clean_url:
                image_urls.add(urljoin(base_url, clean_url))
    else:
        image_urls.add(urljoin(base_url, value))


def get_image_urls(url):
    """Extract all image URLs from the given webpage using comprehensive methods.

    Discovery strategies, in order: <img> tags (including common lazy-loading
    attributes), <picture>/<source> tags, inline style backgrounds, data-*
    attributes, <style> blocks, external stylesheets (first 5), meta tags
    (Open Graph / Twitter Cards), embedded SVG <image> elements, Shopify
    specific selectors, and JSON-LD structured data.

    Returns a deduplicated list of absolute image URLs, or an empty list on
    any fetch/parse error (the error is surfaced via st.error).
    """
    import json
    import re

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        image_urls = set()  # Use set to avoid duplicates

        # 1. Standard img tags, checking the usual lazy-loading attributes too
        for img in soup.find_all('img'):
            for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
                         'data-image', 'data-bg', 'data-background', 'data-large-image']:
                value = img.get(attr)
                if value and not value.startswith('data:'):
                    _collect_attr_urls(image_urls, url, attr, value)

        # 2. Picture and source tags
        for tag in soup.find_all(['picture', 'source']):
            for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
                value = tag.get(attr)
                if value and not value.startswith('data:'):
                    _collect_attr_urls(image_urls, url, attr, value)

        # 3. Elements with background images in an inline style attribute
        for element in soup.find_all(attrs={'style': True}):
            style = element.get('style', '')
            if 'background' in style.lower() and 'url(' in style:
                for match in re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE):
                    if match and not match.startswith('data:'):
                        image_urls.add(urljoin(url, match))

        # 4. data-* attributes that commonly carry image URLs
        data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
                      'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
                      'data-full-size', 'data-zoom-image', 'data-lightbox']
        for attr in data_attrs:
            for element in soup.find_all(attrs={attr: True}):
                value = element.get(attr)
                if value and not value.startswith('data:'):
                    image_urls.add(urljoin(url, value))

        # 5. CSS background images from <style> tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                image_urls.update(extract_css_background_images(style_tag.string, url))

        # 6. External CSS files (first 5 only, to bound the extra requests)
        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
            css_url = link.get('href')
            if css_url:
                try:
                    css_absolute_url = urljoin(url, css_url)
                    css_response = requests.get(css_absolute_url, headers=headers, timeout=10)
                    if css_response.status_code == 200:
                        image_urls.update(extract_css_background_images(css_response.text, url))
                except requests.RequestException:
                    continue  # Skip if CSS file can't be loaded

        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
        for meta in soup.find_all('meta'):
            for attr in ['content', 'value']:
                value = meta.get(attr, '')
                if value and any(ext in value.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']):
                    if 'http' in value:
                        image_urls.add(value)
                    else:
                        image_urls.add(urljoin(url, value))

        # 8. Images embedded inside inline SVG
        for svg in soup.find_all('svg'):
            for img in svg.find_all('image'):
                href = img.get('href') or img.get('xlink:href')
                if href and not href.startswith('data:'):
                    image_urls.add(urljoin(url, href))

        # 9. Shopify specific selectors
        shopify_selectors = [
            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
            '.card__media img', '.product__media img', '[data-shopify]'
        ]
        for selector in shopify_selectors:
            try:
                for element in soup.select(selector):
                    for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
                        value = element.get(attr)
                        if value and not value.startswith('data:'):
                            if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
                                # These attributes pack several URLs with descriptors;
                                # just pull out every absolute URL.
                                for found_url in re.findall(r'https?://[^\s,]+', value):
                                    image_urls.add(found_url)
                            else:
                                image_urls.add(urljoin(url, value))
            except Exception:
                continue  # Malformed selector match — skip this selector

        # 10. JSON-LD structured data: round-trip through json to normalize,
        # then regex out anything that looks like an image URL.
        for script in soup.find_all('script', {'type': 'application/ld+json'}):
            try:
                json_str = json.dumps(json.loads(script.string))
                image_urls.update(
                    re.findall(r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', json_str, re.IGNORECASE)
                )
            except Exception:
                continue  # Invalid/empty JSON-LD block — ignore

        # Keep only values that plausibly look like image URLs
        return [
            img_url for img_url in image_urls
            if img_url and len(img_url) > 10 and not img_url.startswith('data:')
            and '.' in img_url and ('http' in img_url or img_url.startswith('//'))
        ]

    except requests.RequestException as e:
        st.error(f"Error fetching the webpage: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Error parsing the webpage: {str(e)}")
        return []

def download_image(url, session):
    """Download a single image and derive a stable filename for it.

    Returns a ``(content, filename, error)`` triple: on success *content*
    holds the raw bytes and *error* is None; on failure content/filename
    are None and *error* is a human-readable message.

    Non-image responses and tiny payloads (< 500 bytes — almost always
    1x1 tracking pixels) are rejected.
    """
    try:
        response = session.get(url, timeout=15, stream=True)
        response.raise_for_status()

        # Reject responses that are clearly not image data.
        content_type = response.headers.get('content-type', '').lower()
        if not any(img_type in content_type for img_type in ['image/', 'application/octet-stream']):
            return None, None, f"Not an image: {content_type}"

        image_content = response.content

        # Skip very small files (likely 1x1 tracking pixels)
        if len(image_content) < 500:
            return None, None, "Image too small (likely tracking pixel)"

        # Short content hash keeps filenames unique even when several URLs
        # share the same basename.
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

        # Base name from the URL path when it has one, otherwise generic.
        # urlparse strips the query string and fragment, so "photo.jpg?v=2"
        # yields basename "photo.jpg" (the old split-on-'/' approach did not).
        basename = urlparse(url).path.split('/')[-1]
        if basename and '.' in basename:
            original_name = basename.split('.')[0][:20]  # Limit length
            filename = f"{original_name}_{url_hash}"
        else:
            filename = f"image_{url_hash}"

        # Extension: prefer the URL path, fall back to the content type,
        # then default to .jpg.
        known_exts = {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico'}
        type_to_ext = {'jpeg': 'jpg', 'png': 'png', 'gif': 'gif', 'webp': 'webp', 'svg': 'svg'}

        path_ext = basename.split('.')[-1].lower() if '.' in basename else ''
        if path_ext in known_exts:
            filename += f".{path_ext}"
        else:
            for marker, ext in type_to_ext.items():
                if marker in content_type:
                    filename += f".{ext}"
                    break
            else:
                filename += ".jpg"  # Default extension

        return image_content, filename, None

    except requests.RequestException as e:
        return None, None, f"Download error: {str(e)}"
    except Exception as e:
        return None, None, f"Unexpected error: {str(e)}"

def create_zip_file(images_data):
    """Bundle (filename, bytes) pairs into an in-memory ZIP archive.

    Returns a BytesIO rewound to the start, ready to be handed to a
    download widget.
    """
    archive = BytesIO()

    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as bundle:
        for name, payload in images_data:
            bundle.writestr(name, payload)

    archive.seek(0)
    return archive

def main():
    """Streamlit entry point: crawl a URL for images, download them all,
    and offer the results as a ZIP plus an inline preview."""
    st.set_page_config(
        page_title="Website Image Crawler",
        page_icon="πŸ–ΌοΈ",
        layout="wide"
    )

    st.title("πŸ–ΌοΈ Website Image Crawler")
    st.markdown("Enter a website URL to extract and download all images from that page.")

    # URL input
    url = st.text_input("Enter Website URL:", placeholder="https://example.com")

    col1, col2 = st.columns([1, 4])

    with col1:
        crawl_button = st.button("πŸ” Crawl Images", type="primary")

    if crawl_button and url:
        if not is_valid_url(url):
            st.error("Please enter a valid URL (including http:// or https://)")
            return

        with st.spinner("Crawling website for images..."):
            # Get image URLs
            image_urls = get_image_urls(url)

            if not image_urls:
                st.warning("No images found on the provided webpage.")
                return

            st.success(f"Found {len(image_urls)} images on the webpage!")

            # Show found URLs in an expander
            with st.expander(f"Found Image URLs ({len(image_urls)})"):
                for i, img_url in enumerate(image_urls, 1):
                    st.text(f"{i}. {img_url}")

        # Download images
        st.subheader("Downloading Images...")

        progress_bar = st.progress(0)
        status_text = st.empty()
        downloaded_images = []
        failed_downloads = []

        # One session reuses connections across all downloads; browser-like
        # headers avoid naive bot blocking.
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        for i, img_url in enumerate(image_urls):
            status_text.text(f"Downloading image {i+1}/{len(image_urls)}: {img_url[:50]}...")

            image_content, filename, error = download_image(img_url, session)

            if image_content and filename:
                downloaded_images.append((filename, image_content))
            else:
                failed_downloads.append((img_url, error))

            progress_bar.progress((i + 1) / len(image_urls))
            time.sleep(0.1)  # Small delay to avoid overwhelming the server

        session.close()

        # Show results
        st.subheader("Download Results")

        col1, col2 = st.columns(2)

        with col1:
            st.metric("βœ… Successfully Downloaded", len(downloaded_images))

        with col2:
            st.metric("❌ Failed Downloads", len(failed_downloads))

        # Show failed downloads
        if failed_downloads:
            with st.expander("Failed Downloads"):
                for img_url, error in failed_downloads:
                    st.text(f"❌ {img_url}")
                    st.text(f"   Error: {error}")
                    st.text("")

        # Create download button for ZIP file
        if downloaded_images:
            st.subheader("Download All Images")

            zip_buffer = create_zip_file(downloaded_images)

            st.download_button(
                label=f"πŸ“₯ Download ZIP file ({len(downloaded_images)} images)",
                data=zip_buffer.getvalue(),
                file_name=f"images_{urlparse(url).netloc}_{int(time.time())}.zip",
                mime="application/zip",
                type="primary"
            )

            # Show preview of first few images
            st.subheader("Image Preview")

            preview_cols = st.columns(4)
            preview_count = min(8, len(downloaded_images))

            for i in range(preview_count):
                filename, image_content = downloaded_images[i]

                try:
                    # Try to display image preview
                    image = Image.open(BytesIO(image_content))

                    with preview_cols[i % 4]:
                        st.image(image, caption=filename, use_container_width=True)
                except Exception:
                    # Not renderable by PIL (e.g. SVG) — fall back to showing
                    # the filename (bug fix: the f-string previously had no
                    # placeholder, so the name was never displayed).
                    with preview_cols[i % 4]:
                        st.text(f"πŸ“„ {filename}")

            if len(downloaded_images) > preview_count:
                st.text(f"... and {len(downloaded_images) - preview_count} more images")

    elif crawl_button and not url:
        st.error("Please enter a URL to crawl.")

    # Instructions
    st.markdown("---")
    st.subheader("How to use:")
    st.markdown("""
    1. Enter a valid website URL (must include http:// or https://)
    2. Click the "Crawl Images" button
    3. Wait for the application to find and download all images
    4. Download the ZIP file containing all images
    
    **Note:** This enhanced crawler finds images from:
    - Standard `<img>` tags with various lazy-loading attributes
    - CSS background images (inline styles and external stylesheets)
    - Shopify banners and product images
    - Meta tags (Open Graph, Twitter Cards)
    - JSON-LD structured data
    - SVG embedded images
    - Container elements with background images
    
    It does not crawl subpages or follow links - only the main page content.
    """)

    st.markdown("---")
    st.markdown("⚠️ **Disclaimer:** Please respect website terms of service and copyright laws when downloading images.")

# Run the app when executed directly (streamlit run sets __name__ to "__main__")
if __name__ == "__main__":
    main()