Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +124 -223
src/streamlit_app.py
CHANGED
|
@@ -17,171 +17,107 @@ def is_valid_url(url):
|
|
| 17 |
except:
|
| 18 |
return False
|
| 19 |
|
| 20 |
-
def extract_shopify_images(soup, base_url):
|
| 21 |
-
"""Shopify-specific image extraction with aggressive parsing"""
|
| 22 |
-
shopify_images = set()
|
| 23 |
-
|
| 24 |
-
# 1. Shopify CDN URLs from any text content
|
| 25 |
-
import re
|
| 26 |
-
page_text = str(soup)
|
| 27 |
-
|
| 28 |
-
# Match Shopify CDN patterns
|
| 29 |
-
shopify_patterns = [
|
| 30 |
-
r'https://cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
|
| 31 |
-
r'https://[^".\s]+\.myshopify\.com/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
|
| 32 |
-
r'//cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
|
| 33 |
-
r'cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)'
|
| 34 |
-
]
|
| 35 |
-
|
| 36 |
-
for pattern in shopify_patterns:
|
| 37 |
-
matches = re.findall(pattern, page_text, re.IGNORECASE)
|
| 38 |
-
for match in matches:
|
| 39 |
-
if not match.startswith('http'):
|
| 40 |
-
match = 'https://' + match.lstrip('//')
|
| 41 |
-
shopify_images.add(match)
|
| 42 |
-
|
| 43 |
-
# 2. Shopify-specific selectors
|
| 44 |
-
shopify_selectors = [
|
| 45 |
-
# Hero and banner sections
|
| 46 |
-
'.hero img', '.hero-banner img', '.banner img', '.slideshow img',
|
| 47 |
-
'.hero__image', '.hero__media img', '.banner__media img',
|
| 48 |
-
'.slideshow__media img', '.slideshow-slide img',
|
| 49 |
-
|
| 50 |
-
# Product images
|
| 51 |
-
'.product-media img', '.product__media img', '.product-image img',
|
| 52 |
-
'.product-gallery img', '.product-photos img', '.product-photo img',
|
| 53 |
-
'.product-thumbnail img', '.product-thumbs img',
|
| 54 |
-
|
| 55 |
-
# Collection and card images
|
| 56 |
-
'.collection-hero img', '.collection-image img', '.card-image img',
|
| 57 |
-
'.card__media img', '.collection-card img', '.featured-collection img',
|
| 58 |
-
|
| 59 |
-
# Section backgrounds and images
|
| 60 |
-
'.section-image img', '.image-section img', '.content-image img',
|
| 61 |
-
'.promo-banner img', '.promotional-banner img',
|
| 62 |
-
|
| 63 |
-
# Common Shopify theme classes
|
| 64 |
-
'.rte img', '.rich-text img', '.text-section img',
|
| 65 |
-
'.image-with-text img', '.image-overlay img'
|
| 66 |
-
]
|
| 67 |
-
|
| 68 |
-
for selector in shopify_selectors:
|
| 69 |
-
try:
|
| 70 |
-
elements = soup.select(selector)
|
| 71 |
-
for element in elements:
|
| 72 |
-
for attr in ['src', 'data-src', 'data-original', 'data-srcset', 'srcset']:
|
| 73 |
-
value = element.get(attr)
|
| 74 |
-
if value:
|
| 75 |
-
if 'srcset' in attr and ',' in value:
|
| 76 |
-
urls = [u.strip().split(' ')[0] for u in value.split(',')]
|
| 77 |
-
for url in urls:
|
| 78 |
-
if url:
|
| 79 |
-
shopify_images.add(urljoin(base_url, url))
|
| 80 |
-
else:
|
| 81 |
-
shopify_images.add(urljoin(base_url, value))
|
| 82 |
-
except:
|
| 83 |
-
continue
|
| 84 |
-
|
| 85 |
-
return shopify_images
|
| 86 |
-
|
| 87 |
def extract_css_background_images(css_content, base_url):
|
| 88 |
-
"""
|
| 89 |
import re
|
| 90 |
|
| 91 |
-
image_urls =
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
patterns = [
|
| 95 |
-
r'background-image\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
|
| 96 |
-
r'background\s*:\s*[^;]*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
|
| 97 |
-
r'--[^:]*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', # CSS variables
|
| 98 |
-
]
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
match = 'https:' + match
|
| 106 |
-
elif not match.startswith('http'):
|
| 107 |
-
match = urljoin(base_url, match)
|
| 108 |
-
image_urls.add(match)
|
| 109 |
|
| 110 |
return image_urls
|
| 111 |
|
| 112 |
def get_image_urls(url):
|
| 113 |
-
"""
|
| 114 |
try:
|
| 115 |
headers = {
|
| 116 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
| 117 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
| 118 |
-
'Accept-Language': 'en-US,en;q=0.9',
|
| 119 |
-
'Accept-Encoding': 'gzip, deflate, br',
|
| 120 |
-
'DNT': '1',
|
| 121 |
-
'Connection': 'keep-alive',
|
| 122 |
-
'Upgrade-Insecure-Requests': '1',
|
| 123 |
-
'Sec-Fetch-Dest': 'document',
|
| 124 |
-
'Sec-Fetch-Mode': 'navigate',
|
| 125 |
-
'Sec-Fetch-Site': 'none'
|
| 126 |
}
|
| 127 |
|
| 128 |
-
response = requests.get(url, headers=headers, timeout=
|
| 129 |
response.raise_for_status()
|
| 130 |
|
| 131 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 132 |
-
image_urls = set()
|
| 133 |
-
|
| 134 |
-
st.info("🔍 Scanning for images using multiple detection methods...")
|
| 135 |
-
|
| 136 |
-
# 1. Shopify-specific extraction (most important for Shopify sites)
|
| 137 |
-
shopify_images = extract_shopify_images(soup, url)
|
| 138 |
-
image_urls.update(shopify_images)
|
| 139 |
-
if shopify_images:
|
| 140 |
-
st.success(f"📦 Found {len(shopify_images)} Shopify-specific images")
|
| 141 |
-
|
| 142 |
-
# 2. All possible img tag attributes
|
| 143 |
-
img_attributes = [
|
| 144 |
-
'src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
|
| 145 |
-
'data-image', 'data-bg', 'data-background', 'data-large-image',
|
| 146 |
-
'data-zoom-image', 'data-full-size', 'data-master', 'data-variant-img',
|
| 147 |
-
'data-high-res', 'data-retina', 'data-2x', 'data-1x'
|
| 148 |
-
]
|
| 149 |
|
|
|
|
| 150 |
img_tags = soup.find_all('img')
|
| 151 |
for img in img_tags:
|
| 152 |
-
|
|
|
|
|
|
|
| 153 |
value = img.get(attr)
|
| 154 |
if value and not value.startswith('data:'):
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
| 160 |
else:
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
style = element.get('style', '')
|
| 168 |
if 'background' in style.lower() and 'url(' in style:
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
#
|
| 173 |
style_tags = soup.find_all('style')
|
| 174 |
for style_tag in style_tags:
|
| 175 |
if style_tag.string:
|
| 176 |
css_images = extract_css_background_images(style_tag.string, url)
|
| 177 |
image_urls.update(css_images)
|
| 178 |
|
| 179 |
-
# External CSS
|
| 180 |
link_tags = soup.find_all('link', {'rel': 'stylesheet'})
|
| 181 |
-
|
| 182 |
-
for link in link_tags:
|
| 183 |
-
if css_count >= 10: # Increased from 5 to 10
|
| 184 |
-
break
|
| 185 |
css_url = link.get('href')
|
| 186 |
if css_url:
|
| 187 |
try:
|
|
@@ -190,112 +126,77 @@ def get_image_urls(url):
|
|
| 190 |
if css_response.status_code == 200:
|
| 191 |
css_images = extract_css_background_images(css_response.text, url)
|
| 192 |
image_urls.update(css_images)
|
| 193 |
-
css_count += 1
|
| 194 |
except:
|
| 195 |
-
continue
|
| 196 |
|
| 197 |
-
#
|
| 198 |
-
media_tags = soup.find_all(['picture', 'source'])
|
| 199 |
-
for tag in media_tags:
|
| 200 |
-
for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
|
| 201 |
-
value = tag.get(attr)
|
| 202 |
-
if value and not value.startswith('data:'):
|
| 203 |
-
if 'srcset' in attr and ',' in value:
|
| 204 |
-
urls = [u.strip().split(' ')[0] for u in value.split(',')]
|
| 205 |
-
for img_url in urls:
|
| 206 |
-
if img_url:
|
| 207 |
-
image_urls.add(urljoin(url, img_url))
|
| 208 |
-
else:
|
| 209 |
-
image_urls.add(urljoin(url, value))
|
| 210 |
-
|
| 211 |
-
# 5. Data attributes on any element
|
| 212 |
-
data_image_attrs = [
|
| 213 |
-
'data-background-image', 'data-bg-src', 'data-hero-image',
|
| 214 |
-
'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
|
| 215 |
-
'data-bgset', 'data-widths', 'data-sizes', 'data-aspect-ratio'
|
| 216 |
-
]
|
| 217 |
-
|
| 218 |
-
for attr in data_image_attrs:
|
| 219 |
-
elements = soup.find_all(attrs={attr: True})
|
| 220 |
-
for element in elements:
|
| 221 |
-
value = element.get(attr)
|
| 222 |
-
if value and not value.startswith('data:'):
|
| 223 |
-
# Handle complex Shopify data attributes
|
| 224 |
-
if 'bgset' in attr or 'widths' in attr:
|
| 225 |
-
import re
|
| 226 |
-
urls = re.findall(r'(https?://[^\s,}]+\.(?:jpg|jpeg|png|gif|webp))', value, re.IGNORECASE)
|
| 227 |
-
image_urls.update(urls)
|
| 228 |
-
else:
|
| 229 |
-
image_urls.add(urljoin(url, value))
|
| 230 |
-
|
| 231 |
-
# 6. JavaScript and JSON data
|
| 232 |
-
script_tags = soup.find_all('script')
|
| 233 |
-
for script in script_tags:
|
| 234 |
-
if script.string:
|
| 235 |
-
script_content = script.string
|
| 236 |
-
|
| 237 |
-
# Look for image URLs in JavaScript
|
| 238 |
-
import re
|
| 239 |
-
js_image_patterns = [
|
| 240 |
-
r'"(https?://[^"]+\.(?:jpg|jpeg|png|gif|webp)[^"]*)"',
|
| 241 |
-
r"'(https?://[^']+\.(?:jpg|jpeg|png|gif|webp)[^']*)'",
|
| 242 |
-
r'url:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
|
| 243 |
-
r'src:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
|
| 244 |
-
r'image:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']'
|
| 245 |
-
]
|
| 246 |
-
|
| 247 |
-
for pattern in js_image_patterns:
|
| 248 |
-
matches = re.findall(pattern, script_content, re.IGNORECASE)
|
| 249 |
-
for match in matches:
|
| 250 |
-
if not match.startswith('data:'):
|
| 251 |
-
if match.startswith('//'):
|
| 252 |
-
match = 'https:' + match
|
| 253 |
-
elif not match.startswith('http'):
|
| 254 |
-
match = urljoin(url, match)
|
| 255 |
-
image_urls.add(match)
|
| 256 |
-
|
| 257 |
-
# 7. Meta tags (social media images)
|
| 258 |
meta_tags = soup.find_all('meta')
|
| 259 |
for meta in meta_tags:
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
if content.startswith('http'):
|
| 266 |
-
image_urls.add(content)
|
| 267 |
else:
|
| 268 |
-
|
|
|
|
| 269 |
|
| 270 |
# 8. SVG images
|
| 271 |
svg_tags = soup.find_all('svg')
|
| 272 |
for svg in svg_tags:
|
|
|
|
| 273 |
image_elements = svg.find_all('image')
|
| 274 |
for img in image_elements:
|
| 275 |
href = img.get('href') or img.get('xlink:href')
|
| 276 |
if href and not href.startswith('data:'):
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
valid_image_urls = []
|
| 281 |
for img_url in image_urls:
|
| 282 |
-
if
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
# Clean up URLs
|
| 288 |
-
if img_url.startswith('//'):
|
| 289 |
-
img_url = 'https:' + img_url
|
| 290 |
-
|
| 291 |
-
# Remove query parameters that might break downloads (but keep Shopify transforms)
|
| 292 |
-
if '?' in img_url and 'shopify.com' not in img_url:
|
| 293 |
-
img_url = img_url.split('?')[0]
|
| 294 |
-
|
| 295 |
-
valid_image_urls.append(img_url)
|
| 296 |
-
|
| 297 |
-
# Sort by URL to group similar images together
|
| 298 |
-
valid_image_urls.sort()
|
| 299 |
|
| 300 |
return valid_image_urls
|
| 301 |
|
|
|
|
| 17 |
except:
|
| 18 |
return False
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def extract_css_background_images(css_content, base_url):
|
| 21 |
+
"""Extract background image URLs from CSS content"""
|
| 22 |
import re
|
| 23 |
|
| 24 |
+
image_urls = []
|
| 25 |
+
# Pattern to match background-image: url() declarations
|
| 26 |
+
bg_pattern = r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
matches = re.findall(bg_pattern, css_content, re.IGNORECASE)
|
| 29 |
+
for match in matches:
|
| 30 |
+
if match and not match.startswith('data:'):
|
| 31 |
+
absolute_url = urljoin(base_url, match)
|
| 32 |
+
image_urls.append(absolute_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
return image_urls
|
| 35 |
|
| 36 |
def get_image_urls(url):
|
| 37 |
+
"""Extract all image URLs from the given webpage using comprehensive methods"""
|
| 38 |
try:
|
| 39 |
headers = {
|
| 40 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
+
response = requests.get(url, headers=headers, timeout=15)
|
| 44 |
response.raise_for_status()
|
| 45 |
|
| 46 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 47 |
+
image_urls = set() # Use set to avoid duplicates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
# 1. Standard img tags with multiple attribute checks
|
| 50 |
img_tags = soup.find_all('img')
|
| 51 |
for img in img_tags:
|
| 52 |
+
# Check multiple possible attributes
|
| 53 |
+
for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
|
| 54 |
+
'data-image', 'data-bg', 'data-background', 'data-large-image']:
|
| 55 |
value = img.get(attr)
|
| 56 |
if value and not value.startswith('data:'):
|
| 57 |
+
# Handle srcset (multiple images)
|
| 58 |
+
if 'srcset' in attr.lower() or ',' in value:
|
| 59 |
+
urls = value.split(',')
|
| 60 |
+
for url_part in urls:
|
| 61 |
+
clean_url = url_part.strip().split(' ')[0]
|
| 62 |
+
if clean_url:
|
| 63 |
+
absolute_url = urljoin(url, clean_url)
|
| 64 |
+
image_urls.add(absolute_url)
|
| 65 |
else:
|
| 66 |
+
absolute_url = urljoin(url, value)
|
| 67 |
+
image_urls.add(absolute_url)
|
| 68 |
|
| 69 |
+
# 2. Picture and source tags
|
| 70 |
+
picture_tags = soup.find_all(['picture', 'source'])
|
| 71 |
+
for tag in picture_tags:
|
| 72 |
+
for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
|
| 73 |
+
value = tag.get(attr)
|
| 74 |
+
if value and not value.startswith('data:'):
|
| 75 |
+
if 'srcset' in attr.lower() or ',' in value:
|
| 76 |
+
urls = value.split(',')
|
| 77 |
+
for url_part in urls:
|
| 78 |
+
clean_url = url_part.strip().split(' ')[0]
|
| 79 |
+
if clean_url:
|
| 80 |
+
absolute_url = urljoin(url, clean_url)
|
| 81 |
+
image_urls.add(absolute_url)
|
| 82 |
+
else:
|
| 83 |
+
absolute_url = urljoin(url, value)
|
| 84 |
+
image_urls.add(absolute_url)
|
| 85 |
+
|
| 86 |
+
# 3. Divs and other elements with background images in style attribute
|
| 87 |
+
all_elements = soup.find_all(attrs={'style': True})
|
| 88 |
+
for element in all_elements:
|
| 89 |
style = element.get('style', '')
|
| 90 |
if 'background' in style.lower() and 'url(' in style:
|
| 91 |
+
import re
|
| 92 |
+
bg_matches = re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE)
|
| 93 |
+
for match in bg_matches:
|
| 94 |
+
if match and not match.startswith('data:'):
|
| 95 |
+
absolute_url = urljoin(url, match)
|
| 96 |
+
image_urls.add(absolute_url)
|
| 97 |
+
|
| 98 |
+
# 4. Elements with data attributes that might contain image URLs
|
| 99 |
+
data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
|
| 100 |
+
'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
|
| 101 |
+
'data-full-size', 'data-zoom-image', 'data-lightbox']
|
| 102 |
+
|
| 103 |
+
for attr in data_attrs:
|
| 104 |
+
elements = soup.find_all(attrs={attr: True})
|
| 105 |
+
for element in elements:
|
| 106 |
+
value = element.get(attr)
|
| 107 |
+
if value and not value.startswith('data:'):
|
| 108 |
+
absolute_url = urljoin(url, value)
|
| 109 |
+
image_urls.add(absolute_url)
|
| 110 |
|
| 111 |
+
# 5. CSS background images from <style> tags
|
| 112 |
style_tags = soup.find_all('style')
|
| 113 |
for style_tag in style_tags:
|
| 114 |
if style_tag.string:
|
| 115 |
css_images = extract_css_background_images(style_tag.string, url)
|
| 116 |
image_urls.update(css_images)
|
| 117 |
|
| 118 |
+
# 6. External CSS files
|
| 119 |
link_tags = soup.find_all('link', {'rel': 'stylesheet'})
|
| 120 |
+
for link in link_tags[:5]: # Limit to first 5 CSS files to avoid overload
|
|
|
|
|
|
|
|
|
|
| 121 |
css_url = link.get('href')
|
| 122 |
if css_url:
|
| 123 |
try:
|
|
|
|
| 126 |
if css_response.status_code == 200:
|
| 127 |
css_images = extract_css_background_images(css_response.text, url)
|
| 128 |
image_urls.update(css_images)
|
|
|
|
| 129 |
except:
|
| 130 |
+
continue # Skip if CSS file can't be loaded
|
| 131 |
|
| 132 |
+
# 7. Meta tags (Open Graph, Twitter Cards, etc.)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
meta_tags = soup.find_all('meta')
|
| 134 |
for meta in meta_tags:
|
| 135 |
+
for attr in ['content', 'value']:
|
| 136 |
+
value = meta.get(attr, '')
|
| 137 |
+
if value and any(ext in value.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']):
|
| 138 |
+
if 'http' in value:
|
| 139 |
+
image_urls.add(value)
|
|
|
|
|
|
|
| 140 |
else:
|
| 141 |
+
absolute_url = urljoin(url, value)
|
| 142 |
+
image_urls.add(absolute_url)
|
| 143 |
|
| 144 |
# 8. SVG images
|
| 145 |
svg_tags = soup.find_all('svg')
|
| 146 |
for svg in svg_tags:
|
| 147 |
+
# Look for embedded images in SVG
|
| 148 |
image_elements = svg.find_all('image')
|
| 149 |
for img in image_elements:
|
| 150 |
href = img.get('href') or img.get('xlink:href')
|
| 151 |
if href and not href.startswith('data:'):
|
| 152 |
+
absolute_url = urljoin(url, href)
|
| 153 |
+
image_urls.add(absolute_url)
|
| 154 |
+
|
| 155 |
+
# 9. Shopify specific selectors
|
| 156 |
+
shopify_selectors = [
|
| 157 |
+
'[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
|
| 158 |
+
'.card__media img', '.product__media img', '[data-shopify]'
|
| 159 |
+
]
|
| 160 |
|
| 161 |
+
for selector in shopify_selectors:
|
| 162 |
+
try:
|
| 163 |
+
elements = soup.select(selector)
|
| 164 |
+
for element in elements:
|
| 165 |
+
for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
|
| 166 |
+
value = element.get(attr)
|
| 167 |
+
if value and not value.startswith('data:'):
|
| 168 |
+
if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
|
| 169 |
+
# Parse complex attribute formats
|
| 170 |
+
import re
|
| 171 |
+
urls = re.findall(r'https?://[^\s,]+', value)
|
| 172 |
+
for found_url in urls:
|
| 173 |
+
image_urls.add(found_url)
|
| 174 |
+
else:
|
| 175 |
+
absolute_url = urljoin(url, value)
|
| 176 |
+
image_urls.add(absolute_url)
|
| 177 |
+
except:
|
| 178 |
+
continue
|
| 179 |
+
|
| 180 |
+
# 10. Look for JSON-LD structured data
|
| 181 |
+
json_scripts = soup.find_all('script', {'type': 'application/ld+json'})
|
| 182 |
+
for script in json_scripts:
|
| 183 |
+
try:
|
| 184 |
+
import json
|
| 185 |
+
data = json.loads(script.string)
|
| 186 |
+
json_str = json.dumps(data)
|
| 187 |
+
import re
|
| 188 |
+
urls = re.findall(r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', json_str, re.IGNORECASE)
|
| 189 |
+
image_urls.update(urls)
|
| 190 |
+
except:
|
| 191 |
+
continue
|
| 192 |
+
|
| 193 |
+
# Filter out obviously invalid URLs and convert to list
|
| 194 |
valid_image_urls = []
|
| 195 |
for img_url in image_urls:
|
| 196 |
+
if img_url and len(img_url) > 10 and not img_url.startswith('data:'):
|
| 197 |
+
# Basic validation - must look like a URL
|
| 198 |
+
if '.' in img_url and ('http' in img_url or img_url.startswith('//')):
|
| 199 |
+
valid_image_urls.append(img_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
return valid_image_urls
|
| 202 |
|