File size: 17,692 Bytes
352665d
87a5d61
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
352665d
cc10ac9
ccffe7a
cc10ac9
 
ccffe7a
 
 
cc10ac9
ccffe7a
 
 
 
 
cc10ac9
 
 
87a5d61
ccffe7a
87a5d61
 
ccffe7a
87a5d61
 
ccffe7a
87a5d61
 
 
ccffe7a
87a5d61
ccffe7a
87a5d61
 
ccffe7a
 
 
cc10ac9
 
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
ccffe7a
 
87a5d61
ccffe7a
 
 
cc10ac9
 
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
ccffe7a
cc10ac9
 
 
 
 
 
ccffe7a
cc10ac9
ccffe7a
cc10ac9
 
 
 
 
 
 
 
 
ccffe7a
cc10ac9
ccffe7a
cc10ac9
 
ccffe7a
 
 
 
 
cc10ac9
ccffe7a
 
cc10ac9
 
 
 
ccffe7a
cc10ac9
 
 
 
ccffe7a
 
 
 
 
 
 
 
cc10ac9
ccffe7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
ccffe7a
 
 
 
cc10ac9
 
87a5d61
 
 
 
 
 
 
352665d
87a5d61
cc10ac9
87a5d61
cc10ac9
87a5d61
 
 
cc10ac9
 
87a5d61
 
 
 
 
cc10ac9
 
 
 
87a5d61
 
cc10ac9
 
 
 
 
 
 
 
87a5d61
 
cc10ac9
 
 
87a5d61
 
 
 
 
 
 
 
 
cc10ac9
 
87a5d61
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
 
 
 
352665d
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
 
 
 
 
 
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb2a5ba
87a5d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10ac9
 
 
 
 
 
 
 
 
 
87a5d61
 
 
 
352665d
87a5d61
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
import streamlit as st
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import zipfile
from io import BytesIO
import time
from PIL import Image
import hashlib

def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Accepts e.g. "https://example.com"; rejects bare hostnames like
    "example.com" (empty scheme) and empty strings.
    """
    try:
        result = urlparse(url)
        # Both parts are required: "example.com" alone parses with an
        # empty scheme and would otherwise look valid.
        return all([result.scheme, result.netloc])
    except (ValueError, AttributeError, TypeError):
        # Narrowed from a bare except: these are the errors urlparse can
        # raise on malformed or non-string input.
        return False

def extract_css_background_images(css_content, base_url):
    """Pull background-image URLs out of raw CSS text.

    Scans for ``background`` / ``background-image`` declarations whose value
    is a ``url(...)``, resolves each reference against *base_url*, and
    returns the absolute URLs in the order found.  Inline ``data:`` URIs
    are skipped.
    """
    import re

    # Matches `background: url(...)` and `background-image: url(...)`,
    # tolerating optional quotes and whitespace inside the parentheses.
    declaration = re.compile(
        r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
        re.IGNORECASE,
    )

    return [
        urljoin(base_url, raw_url)
        for raw_url in declaration.findall(css_content)
        if raw_url and not raw_url.startswith('data:')
    ]

def _collect_attr_urls(image_urls, base_url, attr, value):
    """Resolve a src/srcset-style attribute value into absolute URL(s) and
    add them to *image_urls*.

    srcset-style values hold comma-separated "url descriptor" pairs; plain
    values hold a single URL.  Callers are expected to have filtered out
    ``data:`` URIs already.
    """
    if 'srcset' in attr.lower() or ',' in value:
        for url_part in value.split(','):
            # Each entry may carry a width/density descriptor after a space.
            clean_url = url_part.strip().split(' ')[0]
            if clean_url:
                image_urls.add(urljoin(base_url, clean_url))
    else:
        image_urls.add(urljoin(base_url, value))


def get_image_urls(url):
    """Extract all image URLs from the given webpage using comprehensive methods.

    Discovery strategies, in order: <img> tags (including common lazy-loading
    attributes), <picture>/<source> tags, inline style backgrounds, data-*
    attributes, <style> blocks, external stylesheets (first 5), meta tags
    (Open Graph / Twitter Cards), embedded SVG <image> elements, Shopify
    specific selectors, and JSON-LD structured data.

    Returns a deduplicated list of absolute image URLs, or an empty list on
    any fetch/parse error (the error is surfaced via st.error).
    """
    import json
    import re

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        image_urls = set()  # Use set to avoid duplicates

        # 1. Standard img tags, checking the usual lazy-loading attributes too
        for img in soup.find_all('img'):
            for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
                         'data-image', 'data-bg', 'data-background', 'data-large-image']:
                value = img.get(attr)
                if value and not value.startswith('data:'):
                    _collect_attr_urls(image_urls, url, attr, value)

        # 2. Picture and source tags
        for tag in soup.find_all(['picture', 'source']):
            for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
                value = tag.get(attr)
                if value and not value.startswith('data:'):
                    _collect_attr_urls(image_urls, url, attr, value)

        # 3. Elements with background images in an inline style attribute
        for element in soup.find_all(attrs={'style': True}):
            style = element.get('style', '')
            if 'background' in style.lower() and 'url(' in style:
                for match in re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE):
                    if match and not match.startswith('data:'):
                        image_urls.add(urljoin(url, match))

        # 4. data-* attributes that commonly carry image URLs
        data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
                      'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
                      'data-full-size', 'data-zoom-image', 'data-lightbox']
        for attr in data_attrs:
            for element in soup.find_all(attrs={attr: True}):
                value = element.get(attr)
                if value and not value.startswith('data:'):
                    image_urls.add(urljoin(url, value))

        # 5. CSS background images from <style> tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                image_urls.update(extract_css_background_images(style_tag.string, url))

        # 6. External CSS files (first 5 only, to bound the extra requests)
        for link in soup.find_all('link', {'rel': 'stylesheet'})[:5]:
            css_url = link.get('href')
            if css_url:
                try:
                    css_absolute_url = urljoin(url, css_url)
                    css_response = requests.get(css_absolute_url, headers=headers, timeout=10)
                    if css_response.status_code == 200:
                        image_urls.update(extract_css_background_images(css_response.text, url))
                except requests.RequestException:
                    continue  # Skip if CSS file can't be loaded

        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
        for meta in soup.find_all('meta'):
            for attr in ['content', 'value']:
                value = meta.get(attr, '')
                if value and any(ext in value.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']):
                    if 'http' in value:
                        image_urls.add(value)
                    else:
                        image_urls.add(urljoin(url, value))

        # 8. Images embedded inside inline SVG
        for svg in soup.find_all('svg'):
            for img in svg.find_all('image'):
                href = img.get('href') or img.get('xlink:href')
                if href and not href.startswith('data:'):
                    image_urls.add(urljoin(url, href))

        # 9. Shopify specific selectors
        shopify_selectors = [
            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
            '.card__media img', '.product__media img', '[data-shopify]'
        ]
        for selector in shopify_selectors:
            try:
                for element in soup.select(selector):
                    for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
                        value = element.get(attr)
                        if value and not value.startswith('data:'):
                            if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
                                # These attributes pack several URLs with descriptors;
                                # just pull out every absolute URL.
                                for found_url in re.findall(r'https?://[^\s,]+', value):
                                    image_urls.add(found_url)
                            else:
                                image_urls.add(urljoin(url, value))
            except Exception:
                continue  # Malformed selector match — skip this selector

        # 10. JSON-LD structured data: round-trip through json to normalize,
        # then regex out anything that looks like an image URL.
        for script in soup.find_all('script', {'type': 'application/ld+json'}):
            try:
                json_str = json.dumps(json.loads(script.string))
                image_urls.update(
                    re.findall(r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', json_str, re.IGNORECASE)
                )
            except Exception:
                continue  # Invalid/empty JSON-LD block — ignore

        # Keep only values that plausibly look like image URLs
        return [
            img_url for img_url in image_urls
            if img_url and len(img_url) > 10 and not img_url.startswith('data:')
            and '.' in img_url and ('http' in img_url or img_url.startswith('//'))
        ]

    except requests.RequestException as e:
        st.error(f"Error fetching the webpage: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Error parsing the webpage: {str(e)}")
        return []

def download_image(url, session):
    """Download a single image and derive a stable filename for it.

    Returns a ``(content, filename, error)`` triple: on success *content*
    holds the raw bytes and *error* is None; on failure content/filename
    are None and *error* is a human-readable message.

    Non-image responses and tiny payloads (< 500 bytes — almost always
    1x1 tracking pixels) are rejected.
    """
    try:
        response = session.get(url, timeout=15, stream=True)
        response.raise_for_status()

        # Reject responses that are clearly not image data.
        content_type = response.headers.get('content-type', '').lower()
        if not any(img_type in content_type for img_type in ['image/', 'application/octet-stream']):
            return None, None, f"Not an image: {content_type}"

        image_content = response.content

        # Skip very small files (likely 1x1 tracking pixels)
        if len(image_content) < 500:
            return None, None, "Image too small (likely tracking pixel)"

        # Short content hash keeps filenames unique even when several URLs
        # share the same basename.
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

        # Base name from the URL path when it has one, otherwise generic.
        # urlparse strips the query string and fragment, so "photo.jpg?v=2"
        # yields basename "photo.jpg" (the old split-on-'/' approach did not).
        basename = urlparse(url).path.split('/')[-1]
        if basename and '.' in basename:
            original_name = basename.split('.')[0][:20]  # Limit length
            filename = f"{original_name}_{url_hash}"
        else:
            filename = f"image_{url_hash}"

        # Extension: prefer the URL path, fall back to the content type,
        # then default to .jpg.
        known_exts = {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg', 'ico'}
        type_to_ext = {'jpeg': 'jpg', 'png': 'png', 'gif': 'gif', 'webp': 'webp', 'svg': 'svg'}

        path_ext = basename.split('.')[-1].lower() if '.' in basename else ''
        if path_ext in known_exts:
            filename += f".{path_ext}"
        else:
            for marker, ext in type_to_ext.items():
                if marker in content_type:
                    filename += f".{ext}"
                    break
            else:
                filename += ".jpg"  # Default extension

        return image_content, filename, None

    except requests.RequestException as e:
        return None, None, f"Download error: {str(e)}"
    except Exception as e:
        return None, None, f"Unexpected error: {str(e)}"

def create_zip_file(images_data):
    """Bundle (filename, bytes) pairs into an in-memory ZIP archive.

    Returns a BytesIO rewound to the start, ready to be handed to a
    download widget.
    """
    archive = BytesIO()

    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as bundle:
        for name, payload in images_data:
            bundle.writestr(name, payload)

    archive.seek(0)
    return archive

def main():
    """Streamlit entry point: crawl a URL for images, download them all,
    and offer the results as a ZIP plus an inline preview."""
    st.set_page_config(
        page_title="Website Image Crawler",
        page_icon="πŸ–ΌοΈ",
        layout="wide"
    )

    st.title("πŸ–ΌοΈ Website Image Crawler")
    st.markdown("Enter a website URL to extract and download all images from that page.")

    # URL input
    url = st.text_input("Enter Website URL:", placeholder="https://example.com")

    col1, col2 = st.columns([1, 4])

    with col1:
        crawl_button = st.button("πŸ” Crawl Images", type="primary")

    if crawl_button and url:
        if not is_valid_url(url):
            st.error("Please enter a valid URL (including http:// or https://)")
            return

        with st.spinner("Crawling website for images..."):
            # Get image URLs
            image_urls = get_image_urls(url)

            if not image_urls:
                st.warning("No images found on the provided webpage.")
                return

            st.success(f"Found {len(image_urls)} images on the webpage!")

            # Show found URLs in an expander
            with st.expander(f"Found Image URLs ({len(image_urls)})"):
                for i, img_url in enumerate(image_urls, 1):
                    st.text(f"{i}. {img_url}")

        # Download images
        st.subheader("Downloading Images...")

        progress_bar = st.progress(0)
        status_text = st.empty()
        downloaded_images = []
        failed_downloads = []

        # One session reuses connections across all downloads; browser-like
        # headers avoid naive bot blocking.
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        for i, img_url in enumerate(image_urls):
            status_text.text(f"Downloading image {i+1}/{len(image_urls)}: {img_url[:50]}...")

            image_content, filename, error = download_image(img_url, session)

            if image_content and filename:
                downloaded_images.append((filename, image_content))
            else:
                failed_downloads.append((img_url, error))

            progress_bar.progress((i + 1) / len(image_urls))
            time.sleep(0.1)  # Small delay to avoid overwhelming the server

        session.close()

        # Show results
        st.subheader("Download Results")

        col1, col2 = st.columns(2)

        with col1:
            st.metric("βœ… Successfully Downloaded", len(downloaded_images))

        with col2:
            st.metric("❌ Failed Downloads", len(failed_downloads))

        # Show failed downloads
        if failed_downloads:
            with st.expander("Failed Downloads"):
                for img_url, error in failed_downloads:
                    st.text(f"❌ {img_url}")
                    st.text(f"   Error: {error}")
                    st.text("")

        # Create download button for ZIP file
        if downloaded_images:
            st.subheader("Download All Images")

            zip_buffer = create_zip_file(downloaded_images)

            st.download_button(
                label=f"πŸ“₯ Download ZIP file ({len(downloaded_images)} images)",
                data=zip_buffer.getvalue(),
                file_name=f"images_{urlparse(url).netloc}_{int(time.time())}.zip",
                mime="application/zip",
                type="primary"
            )

            # Show preview of first few images
            st.subheader("Image Preview")

            preview_cols = st.columns(4)
            preview_count = min(8, len(downloaded_images))

            for i in range(preview_count):
                filename, image_content = downloaded_images[i]

                try:
                    # Try to display image preview
                    image = Image.open(BytesIO(image_content))

                    with preview_cols[i % 4]:
                        st.image(image, caption=filename, use_container_width=True)
                except Exception:
                    # Not renderable by PIL (e.g. SVG) — fall back to showing
                    # the filename (bug fix: the f-string previously had no
                    # placeholder, so the name was never displayed).
                    with preview_cols[i % 4]:
                        st.text(f"πŸ“„ {filename}")

            if len(downloaded_images) > preview_count:
                st.text(f"... and {len(downloaded_images) - preview_count} more images")

    elif crawl_button and not url:
        st.error("Please enter a URL to crawl.")

    # Instructions
    st.markdown("---")
    st.subheader("How to use:")
    st.markdown("""
    1. Enter a valid website URL (must include http:// or https://)
    2. Click the "Crawl Images" button
    3. Wait for the application to find and download all images
    4. Download the ZIP file containing all images
    
    **Note:** This enhanced crawler finds images from:
    - Standard `<img>` tags with various lazy-loading attributes
    - CSS background images (inline styles and external stylesheets)
    - Shopify banners and product images
    - Meta tags (Open Graph, Twitter Cards)
    - JSON-LD structured data
    - SVG embedded images
    - Container elements with background images
    
    It does not crawl subpages or follow links - only the main page content.
    """)

    st.markdown("---")
    st.markdown("⚠️ **Disclaimer:** Please respect website terms of service and copyright laws when downloading images.")

# Run the app when executed directly (streamlit run sets __name__ to "__main__")
if __name__ == "__main__":
    main()