Spaces:

maaz21
/

img.scrapper

Sleeping

App Files Files Community

maaz21 committed on Jun 3, 2025

Commit

ccffe7a

verified ·

1 Parent(s): fb003a4

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +124 -223

src/streamlit_app.py CHANGED Viewed

@@ -17,171 +17,107 @@ def is_valid_url(url):
     except:
         return False
-def extract_shopify_images(soup, base_url):
-    """Shopify-specific image extraction with aggressive parsing"""
-    shopify_images = set()
-    # 1. Shopify CDN URLs from any text content
-    import re
-    page_text = str(soup)
-    # Match Shopify CDN patterns
-    shopify_patterns = [
-        r'https://cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
-        r'https://[^".\s]+\.myshopify\.com/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
-        r'//cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
-        r'cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)'
-    ]
-    for pattern in shopify_patterns:
-        matches = re.findall(pattern, page_text, re.IGNORECASE)
-        for match in matches:
-            if not match.startswith('http'):
-                match = 'https://' + match.lstrip('//')
-            shopify_images.add(match)
-    # 2. Shopify-specific selectors
-    shopify_selectors = [
-        # Hero and banner sections
-        '.hero img', '.hero-banner img', '.banner img', '.slideshow img',
-        '.hero__image', '.hero__media img', '.banner__media img',
-        '.slideshow__media img', '.slideshow-slide img',
-        # Product images
-        '.product-media img', '.product__media img', '.product-image img',
-        '.product-gallery img', '.product-photos img', '.product-photo img',
-        '.product-thumbnail img', '.product-thumbs img',
-        # Collection and card images
-        '.collection-hero img', '.collection-image img', '.card-image img',
-        '.card__media img', '.collection-card img', '.featured-collection img',
-        # Section backgrounds and images
-        '.section-image img', '.image-section img', '.content-image img',
-        '.promo-banner img', '.promotional-banner img',
-        # Common Shopify theme classes
-        '.rte img', '.rich-text img', '.text-section img',
-        '.image-with-text img', '.image-overlay img'
-    ]
-    for selector in shopify_selectors:
-        try:
-            elements = soup.select(selector)
-            for element in elements:
-                for attr in ['src', 'data-src', 'data-original', 'data-srcset', 'srcset']:
-                    value = element.get(attr)
-                    if value:
-                        if 'srcset' in attr and ',' in value:
-                            urls = [u.strip().split(' ')[0] for u in value.split(',')]
-                            for url in urls:
-                                if url:
-                                    shopify_images.add(urljoin(base_url, url))
-                        else:
-                            shopify_images.add(urljoin(base_url, value))
-        except:
-            continue
-    return shopify_images
 def extract_css_background_images(css_content, base_url):
-    """Enhanced CSS background image extraction"""
     import re
-    image_urls = set()
-    # Multiple patterns for background images
-    patterns = [
-        r'background-image\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
-        r'background\s*:\s*[^;]*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
-        r'--[^:]*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',  # CSS variables
-    ]
-    for pattern in patterns:
-        matches = re.findall(pattern, css_content, re.IGNORECASE | re.MULTILINE)
-        for match in matches:
-            if match and not match.startswith('data:') and '.' in match:
-                if match.startswith('//'):
-                    match = 'https:' + match
-                elif not match.startswith('http'):
-                    match = urljoin(base_url, match)
-                image_urls.add(match)
     return image_urls
 def get_image_urls(url):
-    """Enhanced image extraction with aggressive Shopify support"""
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'DNT': '1',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'none'
         }
-        response = requests.get(url, headers=headers, timeout=20)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
-        image_urls = set()
-        st.info("🔍 Scanning for images using multiple detection methods...")
-        # 1. Shopify-specific extraction (most important for Shopify sites)
-        shopify_images = extract_shopify_images(soup, url)
-        image_urls.update(shopify_images)
-        if shopify_images:
-            st.success(f"📦 Found {len(shopify_images)} Shopify-specific images")
-        # 2. All possible img tag attributes
-        img_attributes = [
-            'src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
-            'data-image', 'data-bg', 'data-background', 'data-large-image',
-            'data-zoom-image', 'data-full-size', 'data-master', 'data-variant-img',
-            'data-high-res', 'data-retina', 'data-2x', 'data-1x'
-        ]
         img_tags = soup.find_all('img')
         for img in img_tags:
-            for attr in img_attributes:
                 value = img.get(attr)
                 if value and not value.startswith('data:'):
-                    if 'srcset' in attr and ',' in value:
-                        urls = [u.strip().split(' ')[0] for u in value.split(',')]
-                        for img_url in urls:
-                            if img_url:
-                                image_urls.add(urljoin(url, img_url))
                     else:
-                        image_urls.add(urljoin(url, value))
-        # 3. Enhanced CSS background extraction
-        # Inline styles
-        style_elements = soup.find_all(attrs={'style': True})
-        for element in style_elements:
             style = element.get('style', '')
             if 'background' in style.lower() and 'url(' in style:
-                css_images = extract_css_background_images(style, url)
-                image_urls.update(css_images)
-        # Style tags
         style_tags = soup.find_all('style')
         for style_tag in style_tags:
             if style_tag.string:
                 css_images = extract_css_background_images(style_tag.string, url)
                 image_urls.update(css_images)
-        # External CSS (increased limit for Shopify)
         link_tags = soup.find_all('link', {'rel': 'stylesheet'})
-        css_count = 0
-        for link in link_tags:
-            if css_count >= 10:  # Increased from 5 to 10
-                break
             css_url = link.get('href')
             if css_url:
                 try:
@@ -190,112 +126,77 @@ def get_image_urls(url):
                     if css_response.status_code == 200:
                         css_images = extract_css_background_images(css_response.text, url)
                         image_urls.update(css_images)
-                        css_count += 1
                 except:
-                    continue
-        # 4. Picture and source tags
-        media_tags = soup.find_all(['picture', 'source'])
-        for tag in media_tags:
-            for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
-                value = tag.get(attr)
-                if value and not value.startswith('data:'):
-                    if 'srcset' in attr and ',' in value:
-                        urls = [u.strip().split(' ')[0] for u in value.split(',')]
-                        for img_url in urls:
-                            if img_url:
-                                image_urls.add(urljoin(url, img_url))
-                    else:
-                        image_urls.add(urljoin(url, value))
-        # 5. Data attributes on any element
-        data_image_attrs = [
-            'data-background-image', 'data-bg-src', 'data-hero-image',
-            'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
-            'data-bgset', 'data-widths', 'data-sizes', 'data-aspect-ratio'
-        ]
-        for attr in data_image_attrs:
-            elements = soup.find_all(attrs={attr: True})
-            for element in elements:
-                value = element.get(attr)
-                if value and not value.startswith('data:'):
-                    # Handle complex Shopify data attributes
-                    if 'bgset' in attr or 'widths' in attr:
-                        import re
-                        urls = re.findall(r'(https?://[^\s,}]+\.(?:jpg|jpeg|png|gif|webp))', value, re.IGNORECASE)
-                        image_urls.update(urls)
-                    else:
-                        image_urls.add(urljoin(url, value))
-        # 6. JavaScript and JSON data
-        script_tags = soup.find_all('script')
-        for script in script_tags:
-            if script.string:
-                script_content = script.string
-                # Look for image URLs in JavaScript
-                import re
-                js_image_patterns = [
-                    r'"(https?://[^"]+\.(?:jpg|jpeg|png|gif|webp)[^"]*)"',
-                    r"'(https?://[^']+\.(?:jpg|jpeg|png|gif|webp)[^']*)'",
-                    r'url:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
-                    r'src:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
-                    r'image:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']'
-                ]
-                for pattern in js_image_patterns:
-                    matches = re.findall(pattern, script_content, re.IGNORECASE)
-                    for match in matches:
-                        if not match.startswith('data:'):
-                            if match.startswith('//'):
-                                match = 'https:' + match
-                            elif not match.startswith('http'):
-                                match = urljoin(url, match)
-                            image_urls.add(match)
-        # 7. Meta tags (social media images)
         meta_tags = soup.find_all('meta')
         for meta in meta_tags:
-            property_val = meta.get('property', '').lower()
-            name_val = meta.get('name', '').lower()
-            if any(prop in property_val + name_val for prop in ['image', 'photo', 'picture']):
-                content = meta.get('content', '')
-                if content and any(ext in content.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']):
-                    if content.startswith('http'):
-                        image_urls.add(content)
                     else:
-                        image_urls.add(urljoin(url, content))
         # 8. SVG images
         svg_tags = soup.find_all('svg')
         for svg in svg_tags:
             image_elements = svg.find_all('image')
             for img in image_elements:
                 href = img.get('href') or img.get('xlink:href')
                 if href and not href.startswith('data:'):
-                    image_urls.add(urljoin(url, href))
-        # Final filtering and validation
         valid_image_urls = []
         for img_url in image_urls:
-            if (img_url and len(img_url) > 15 and
-                not img_url.startswith('data:') and
-                any(ext in img_url.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']) and
-                ('http' in img_url or img_url.startswith('//'))):
-                # Clean up URLs
-                if img_url.startswith('//'):
-                    img_url = 'https:' + img_url
-                # Remove query parameters that might break downloads (but keep Shopify transforms)
-                if '?' in img_url and 'shopify.com' not in img_url:
-                    img_url = img_url.split('?')[0]
-                valid_image_urls.append(img_url)
-        # Sort by URL to group similar images together
-        valid_image_urls.sort()
         return valid_image_urls

     except:
         return False
 def extract_css_background_images(css_content, base_url):
+    """Extract background image URLs from CSS content"""
     import re
+    image_urls = []
+    # Pattern to match background-image: url() declarations
+    bg_pattern = r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'
+    matches = re.findall(bg_pattern, css_content, re.IGNORECASE)
+    for match in matches:
+        if match and not match.startswith('data:'):
+            absolute_url = urljoin(base_url, match)
+            image_urls.append(absolute_url)
     return image_urls
 def get_image_urls(url):
+    """Extract all image URLs from the given webpage using comprehensive methods"""
     try:
         headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
         }
+        response = requests.get(url, headers=headers, timeout=15)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
+        image_urls = set()  # Use set to avoid duplicates
+        # 1. Standard img tags with multiple attribute checks
         img_tags = soup.find_all('img')
         for img in img_tags:
+            # Check multiple possible attributes
+            for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
+                        'data-image', 'data-bg', 'data-background', 'data-large-image']:
                 value = img.get(attr)
                 if value and not value.startswith('data:'):
+                    # Handle srcset (multiple images)
+                    if 'srcset' in attr.lower() or ',' in value:
+                        urls = value.split(',')
+                        for url_part in urls:
+                            clean_url = url_part.strip().split(' ')[0]
+                            if clean_url:
+                                absolute_url = urljoin(url, clean_url)
+                                image_urls.add(absolute_url)
                     else:
+                        absolute_url = urljoin(url, value)
+                        image_urls.add(absolute_url)
+        # 2. Picture and source tags
+        picture_tags = soup.find_all(['picture', 'source'])
+        for tag in picture_tags:
+            for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
+                value = tag.get(attr)
+                if value and not value.startswith('data:'):
+                    if 'srcset' in attr.lower() or ',' in value:
+                        urls = value.split(',')
+                        for url_part in urls:
+                            clean_url = url_part.strip().split(' ')[0]
+                            if clean_url:
+                                absolute_url = urljoin(url, clean_url)
+                                image_urls.add(absolute_url)
+                    else:
+                        absolute_url = urljoin(url, value)
+                        image_urls.add(absolute_url)
+        # 3. Divs and other elements with background images in style attribute
+        all_elements = soup.find_all(attrs={'style': True})
+        for element in all_elements:
             style = element.get('style', '')
             if 'background' in style.lower() and 'url(' in style:
+                import re
+                bg_matches = re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE)
+                for match in bg_matches:
+                    if match and not match.startswith('data:'):
+                        absolute_url = urljoin(url, match)
+                        image_urls.add(absolute_url)
+        # 4. Elements with data attributes that might contain image URLs
+        data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
+                     'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
+                     'data-full-size', 'data-zoom-image', 'data-lightbox']
+        for attr in data_attrs:
+            elements = soup.find_all(attrs={attr: True})
+            for element in elements:
+                value = element.get(attr)
+                if value and not value.startswith('data:'):
+                    absolute_url = urljoin(url, value)
+                    image_urls.add(absolute_url)
+        # 5. CSS background images from <style> tags
         style_tags = soup.find_all('style')
         for style_tag in style_tags:
             if style_tag.string:
                 css_images = extract_css_background_images(style_tag.string, url)
                 image_urls.update(css_images)
+        # 6. External CSS files
         link_tags = soup.find_all('link', {'rel': 'stylesheet'})
+        for link in link_tags[:5]:  # Limit to first 5 CSS files to avoid overload
             css_url = link.get('href')
             if css_url:
                 try:
                     if css_response.status_code == 200:
                         css_images = extract_css_background_images(css_response.text, url)
                         image_urls.update(css_images)
                 except:
+                    continue  # Skip if CSS file can't be loaded
+        # 7. Meta tags (Open Graph, Twitter Cards, etc.)
         meta_tags = soup.find_all('meta')
         for meta in meta_tags:
+            for attr in ['content', 'value']:
+                value = meta.get(attr, '')
+                if value and any(ext in value.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']):
+                    if 'http' in value:
+                        image_urls.add(value)
                     else:
+                        absolute_url = urljoin(url, value)
+                        image_urls.add(absolute_url)
         # 8. SVG images
         svg_tags = soup.find_all('svg')
         for svg in svg_tags:
+            # Look for embedded images in SVG
             image_elements = svg.find_all('image')
             for img in image_elements:
                 href = img.get('href') or img.get('xlink:href')
                 if href and not href.startswith('data:'):
+                    absolute_url = urljoin(url, href)
+                    image_urls.add(absolute_url)
+        # 9. Shopify specific selectors
+        shopify_selectors = [
+            '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
+            '.card__media img', '.product__media img', '[data-shopify]'
+        ]
+        for selector in shopify_selectors:
+            try:
+                elements = soup.select(selector)
+                for element in elements:
+                    for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
+                        value = element.get(attr)
+                        if value and not value.startswith('data:'):
+                            if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
+                                # Parse complex attribute formats
+                                import re
+                                urls = re.findall(r'https?://[^\s,]+', value)
+                                for found_url in urls:
+                                    image_urls.add(found_url)
+                            else:
+                                absolute_url = urljoin(url, value)
+                                image_urls.add(absolute_url)
+            except:
+                continue
+        # 10. Look for JSON-LD structured data
+        json_scripts = soup.find_all('script', {'type': 'application/ld+json'})
+        for script in json_scripts:
+            try:
+                import json
+                data = json.loads(script.string)
+                json_str = json.dumps(data)
+                import re
+                urls = re.findall(r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', json_str, re.IGNORECASE)
+                image_urls.update(urls)
+            except:
+                continue
+        # Filter out obviously invalid URLs and convert to list
         valid_image_urls = []
         for img_url in image_urls:
+            if img_url and len(img_url) > 10 and not img_url.startswith('data:'):
+                # Basic validation - must look like a URL
+                if '.' in img_url and ('http' in img_url or img_url.startswith('//')):
+                    valid_image_urls.append(img_url)
         return valid_image_urls