maaz21 committed on
Commit
ccffe7a
·
verified ·
1 Parent(s): fb003a4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +124 -223
src/streamlit_app.py CHANGED
@@ -17,171 +17,107 @@ def is_valid_url(url):
17
  except:
18
  return False
19
 
20
- def extract_shopify_images(soup, base_url):
21
- """Shopify-specific image extraction with aggressive parsing"""
22
- shopify_images = set()
23
-
24
- # 1. Shopify CDN URLs from any text content
25
- import re
26
- page_text = str(soup)
27
-
28
- # Match Shopify CDN patterns
29
- shopify_patterns = [
30
- r'https://cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
31
- r'https://[^".\s]+\.myshopify\.com/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
32
- r'//cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)',
33
- r'cdn\.shopify\.com/s/files/[^"\s]+\.(?:jpg|jpeg|png|gif|webp)'
34
- ]
35
-
36
- for pattern in shopify_patterns:
37
- matches = re.findall(pattern, page_text, re.IGNORECASE)
38
- for match in matches:
39
- if not match.startswith('http'):
40
- match = 'https://' + match.lstrip('//')
41
- shopify_images.add(match)
42
-
43
- # 2. Shopify-specific selectors
44
- shopify_selectors = [
45
- # Hero and banner sections
46
- '.hero img', '.hero-banner img', '.banner img', '.slideshow img',
47
- '.hero__image', '.hero__media img', '.banner__media img',
48
- '.slideshow__media img', '.slideshow-slide img',
49
-
50
- # Product images
51
- '.product-media img', '.product__media img', '.product-image img',
52
- '.product-gallery img', '.product-photos img', '.product-photo img',
53
- '.product-thumbnail img', '.product-thumbs img',
54
-
55
- # Collection and card images
56
- '.collection-hero img', '.collection-image img', '.card-image img',
57
- '.card__media img', '.collection-card img', '.featured-collection img',
58
-
59
- # Section backgrounds and images
60
- '.section-image img', '.image-section img', '.content-image img',
61
- '.promo-banner img', '.promotional-banner img',
62
-
63
- # Common Shopify theme classes
64
- '.rte img', '.rich-text img', '.text-section img',
65
- '.image-with-text img', '.image-overlay img'
66
- ]
67
-
68
- for selector in shopify_selectors:
69
- try:
70
- elements = soup.select(selector)
71
- for element in elements:
72
- for attr in ['src', 'data-src', 'data-original', 'data-srcset', 'srcset']:
73
- value = element.get(attr)
74
- if value:
75
- if 'srcset' in attr and ',' in value:
76
- urls = [u.strip().split(' ')[0] for u in value.split(',')]
77
- for url in urls:
78
- if url:
79
- shopify_images.add(urljoin(base_url, url))
80
- else:
81
- shopify_images.add(urljoin(base_url, value))
82
- except:
83
- continue
84
-
85
- return shopify_images
86
-
87
  def extract_css_background_images(css_content, base_url):
88
- """Enhanced CSS background image extraction"""
89
  import re
90
 
91
- image_urls = set()
92
-
93
- # Multiple patterns for background images
94
- patterns = [
95
- r'background-image\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
96
- r'background\s*:\s*[^;]*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)',
97
- r'--[^:]*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', # CSS variables
98
- ]
99
 
100
- for pattern in patterns:
101
- matches = re.findall(pattern, css_content, re.IGNORECASE | re.MULTILINE)
102
- for match in matches:
103
- if match and not match.startswith('data:') and '.' in match:
104
- if match.startswith('//'):
105
- match = 'https:' + match
106
- elif not match.startswith('http'):
107
- match = urljoin(base_url, match)
108
- image_urls.add(match)
109
 
110
  return image_urls
111
 
112
  def get_image_urls(url):
113
- """Enhanced image extraction with aggressive Shopify support"""
114
  try:
115
  headers = {
116
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
117
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
118
- 'Accept-Language': 'en-US,en;q=0.9',
119
- 'Accept-Encoding': 'gzip, deflate, br',
120
- 'DNT': '1',
121
- 'Connection': 'keep-alive',
122
- 'Upgrade-Insecure-Requests': '1',
123
- 'Sec-Fetch-Dest': 'document',
124
- 'Sec-Fetch-Mode': 'navigate',
125
- 'Sec-Fetch-Site': 'none'
126
  }
127
 
128
- response = requests.get(url, headers=headers, timeout=20)
129
  response.raise_for_status()
130
 
131
  soup = BeautifulSoup(response.content, 'html.parser')
132
- image_urls = set()
133
-
134
- st.info("🔍 Scanning for images using multiple detection methods...")
135
-
136
- # 1. Shopify-specific extraction (most important for Shopify sites)
137
- shopify_images = extract_shopify_images(soup, url)
138
- image_urls.update(shopify_images)
139
- if shopify_images:
140
- st.success(f"📦 Found {len(shopify_images)} Shopify-specific images")
141
-
142
- # 2. All possible img tag attributes
143
- img_attributes = [
144
- 'src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
145
- 'data-image', 'data-bg', 'data-background', 'data-large-image',
146
- 'data-zoom-image', 'data-full-size', 'data-master', 'data-variant-img',
147
- 'data-high-res', 'data-retina', 'data-2x', 'data-1x'
148
- ]
149
 
 
150
  img_tags = soup.find_all('img')
151
  for img in img_tags:
152
- for attr in img_attributes:
 
 
153
  value = img.get(attr)
154
  if value and not value.startswith('data:'):
155
- if 'srcset' in attr and ',' in value:
156
- urls = [u.strip().split(' ')[0] for u in value.split(',')]
157
- for img_url in urls:
158
- if img_url:
159
- image_urls.add(urljoin(url, img_url))
 
 
 
160
  else:
161
- image_urls.add(urljoin(url, value))
 
162
 
163
- # 3. Enhanced CSS background extraction
164
- # Inline styles
165
- style_elements = soup.find_all(attrs={'style': True})
166
- for element in style_elements:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  style = element.get('style', '')
168
  if 'background' in style.lower() and 'url(' in style:
169
- css_images = extract_css_background_images(style, url)
170
- image_urls.update(css_images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Style tags
173
  style_tags = soup.find_all('style')
174
  for style_tag in style_tags:
175
  if style_tag.string:
176
  css_images = extract_css_background_images(style_tag.string, url)
177
  image_urls.update(css_images)
178
 
179
- # External CSS (increased limit for Shopify)
180
  link_tags = soup.find_all('link', {'rel': 'stylesheet'})
181
- css_count = 0
182
- for link in link_tags:
183
- if css_count >= 10: # Increased from 5 to 10
184
- break
185
  css_url = link.get('href')
186
  if css_url:
187
  try:
@@ -190,112 +126,77 @@ def get_image_urls(url):
190
  if css_response.status_code == 200:
191
  css_images = extract_css_background_images(css_response.text, url)
192
  image_urls.update(css_images)
193
- css_count += 1
194
  except:
195
- continue
196
 
197
- # 4. Picture and source tags
198
- media_tags = soup.find_all(['picture', 'source'])
199
- for tag in media_tags:
200
- for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
201
- value = tag.get(attr)
202
- if value and not value.startswith('data:'):
203
- if 'srcset' in attr and ',' in value:
204
- urls = [u.strip().split(' ')[0] for u in value.split(',')]
205
- for img_url in urls:
206
- if img_url:
207
- image_urls.add(urljoin(url, img_url))
208
- else:
209
- image_urls.add(urljoin(url, value))
210
-
211
- # 5. Data attributes on any element
212
- data_image_attrs = [
213
- 'data-background-image', 'data-bg-src', 'data-hero-image',
214
- 'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
215
- 'data-bgset', 'data-widths', 'data-sizes', 'data-aspect-ratio'
216
- ]
217
-
218
- for attr in data_image_attrs:
219
- elements = soup.find_all(attrs={attr: True})
220
- for element in elements:
221
- value = element.get(attr)
222
- if value and not value.startswith('data:'):
223
- # Handle complex Shopify data attributes
224
- if 'bgset' in attr or 'widths' in attr:
225
- import re
226
- urls = re.findall(r'(https?://[^\s,}]+\.(?:jpg|jpeg|png|gif|webp))', value, re.IGNORECASE)
227
- image_urls.update(urls)
228
- else:
229
- image_urls.add(urljoin(url, value))
230
-
231
- # 6. JavaScript and JSON data
232
- script_tags = soup.find_all('script')
233
- for script in script_tags:
234
- if script.string:
235
- script_content = script.string
236
-
237
- # Look for image URLs in JavaScript
238
- import re
239
- js_image_patterns = [
240
- r'"(https?://[^"]+\.(?:jpg|jpeg|png|gif|webp)[^"]*)"',
241
- r"'(https?://[^']+\.(?:jpg|jpeg|png|gif|webp)[^']*)'",
242
- r'url:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
243
- r'src:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']',
244
- r'image:\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|webp)[^"\']*)["\']'
245
- ]
246
-
247
- for pattern in js_image_patterns:
248
- matches = re.findall(pattern, script_content, re.IGNORECASE)
249
- for match in matches:
250
- if not match.startswith('data:'):
251
- if match.startswith('//'):
252
- match = 'https:' + match
253
- elif not match.startswith('http'):
254
- match = urljoin(url, match)
255
- image_urls.add(match)
256
-
257
- # 7. Meta tags (social media images)
258
  meta_tags = soup.find_all('meta')
259
  for meta in meta_tags:
260
- property_val = meta.get('property', '').lower()
261
- name_val = meta.get('name', '').lower()
262
- if any(prop in property_val + name_val for prop in ['image', 'photo', 'picture']):
263
- content = meta.get('content', '')
264
- if content and any(ext in content.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']):
265
- if content.startswith('http'):
266
- image_urls.add(content)
267
  else:
268
- image_urls.add(urljoin(url, content))
 
269
 
270
  # 8. SVG images
271
  svg_tags = soup.find_all('svg')
272
  for svg in svg_tags:
 
273
  image_elements = svg.find_all('image')
274
  for img in image_elements:
275
  href = img.get('href') or img.get('xlink:href')
276
  if href and not href.startswith('data:'):
277
- image_urls.add(urljoin(url, href))
 
 
 
 
 
 
 
278
 
279
- # Final filtering and validation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  valid_image_urls = []
281
  for img_url in image_urls:
282
- if (img_url and len(img_url) > 15 and
283
- not img_url.startswith('data:') and
284
- any(ext in img_url.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']) and
285
- ('http' in img_url or img_url.startswith('//'))):
286
-
287
- # Clean up URLs
288
- if img_url.startswith('//'):
289
- img_url = 'https:' + img_url
290
-
291
- # Remove query parameters that might break downloads (but keep Shopify transforms)
292
- if '?' in img_url and 'shopify.com' not in img_url:
293
- img_url = img_url.split('?')[0]
294
-
295
- valid_image_urls.append(img_url)
296
-
297
- # Sort by URL to group similar images together
298
- valid_image_urls.sort()
299
 
300
  return valid_image_urls
301
 
 
17
  except:
18
  return False
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def extract_css_background_images(css_content, base_url):
21
+ """Extract background image URLs from CSS content"""
22
  import re
23
 
24
+ image_urls = []
25
+ # Pattern to match background-image: url() declarations
26
+ bg_pattern = r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'
 
 
 
 
 
27
 
28
+ matches = re.findall(bg_pattern, css_content, re.IGNORECASE)
29
+ for match in matches:
30
+ if match and not match.startswith('data:'):
31
+ absolute_url = urljoin(base_url, match)
32
+ image_urls.append(absolute_url)
 
 
 
 
33
 
34
  return image_urls
35
 
36
  def get_image_urls(url):
37
+ """Extract all image URLs from the given webpage using comprehensive methods"""
38
  try:
39
  headers = {
40
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
 
 
 
 
 
 
 
 
 
41
  }
42
 
43
+ response = requests.get(url, headers=headers, timeout=15)
44
  response.raise_for_status()
45
 
46
  soup = BeautifulSoup(response.content, 'html.parser')
47
+ image_urls = set() # Use set to avoid duplicates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ # 1. Standard img tags with multiple attribute checks
50
  img_tags = soup.find_all('img')
51
  for img in img_tags:
52
+ # Check multiple possible attributes
53
+ for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset',
54
+ 'data-image', 'data-bg', 'data-background', 'data-large-image']:
55
  value = img.get(attr)
56
  if value and not value.startswith('data:'):
57
+ # Handle srcset (multiple images)
58
+ if 'srcset' in attr.lower() or ',' in value:
59
+ urls = value.split(',')
60
+ for url_part in urls:
61
+ clean_url = url_part.strip().split(' ')[0]
62
+ if clean_url:
63
+ absolute_url = urljoin(url, clean_url)
64
+ image_urls.add(absolute_url)
65
  else:
66
+ absolute_url = urljoin(url, value)
67
+ image_urls.add(absolute_url)
68
 
69
+ # 2. Picture and source tags
70
+ picture_tags = soup.find_all(['picture', 'source'])
71
+ for tag in picture_tags:
72
+ for attr in ['src', 'srcset', 'data-src', 'data-srcset']:
73
+ value = tag.get(attr)
74
+ if value and not value.startswith('data:'):
75
+ if 'srcset' in attr.lower() or ',' in value:
76
+ urls = value.split(',')
77
+ for url_part in urls:
78
+ clean_url = url_part.strip().split(' ')[0]
79
+ if clean_url:
80
+ absolute_url = urljoin(url, clean_url)
81
+ image_urls.add(absolute_url)
82
+ else:
83
+ absolute_url = urljoin(url, value)
84
+ image_urls.add(absolute_url)
85
+
86
+ # 3. Divs and other elements with background images in style attribute
87
+ all_elements = soup.find_all(attrs={'style': True})
88
+ for element in all_elements:
89
  style = element.get('style', '')
90
  if 'background' in style.lower() and 'url(' in style:
91
+ import re
92
+ bg_matches = re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE)
93
+ for match in bg_matches:
94
+ if match and not match.startswith('data:'):
95
+ absolute_url = urljoin(url, match)
96
+ image_urls.add(absolute_url)
97
+
98
+ # 4. Elements with data attributes that might contain image URLs
99
+ data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image',
100
+ 'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src',
101
+ 'data-full-size', 'data-zoom-image', 'data-lightbox']
102
+
103
+ for attr in data_attrs:
104
+ elements = soup.find_all(attrs={attr: True})
105
+ for element in elements:
106
+ value = element.get(attr)
107
+ if value and not value.startswith('data:'):
108
+ absolute_url = urljoin(url, value)
109
+ image_urls.add(absolute_url)
110
 
111
+ # 5. CSS background images from <style> tags
112
  style_tags = soup.find_all('style')
113
  for style_tag in style_tags:
114
  if style_tag.string:
115
  css_images = extract_css_background_images(style_tag.string, url)
116
  image_urls.update(css_images)
117
 
118
+ # 6. External CSS files
119
  link_tags = soup.find_all('link', {'rel': 'stylesheet'})
120
+ for link in link_tags[:5]: # Limit to first 5 CSS files to avoid overload
 
 
 
121
  css_url = link.get('href')
122
  if css_url:
123
  try:
 
126
  if css_response.status_code == 200:
127
  css_images = extract_css_background_images(css_response.text, url)
128
  image_urls.update(css_images)
 
129
  except:
130
+ continue # Skip if CSS file can't be loaded
131
 
132
+ # 7. Meta tags (Open Graph, Twitter Cards, etc.)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  meta_tags = soup.find_all('meta')
134
  for meta in meta_tags:
135
+ for attr in ['content', 'value']:
136
+ value = meta.get(attr, '')
137
+ if value and any(ext in value.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']):
138
+ if 'http' in value:
139
+ image_urls.add(value)
 
 
140
  else:
141
+ absolute_url = urljoin(url, value)
142
+ image_urls.add(absolute_url)
143
 
144
  # 8. SVG images
145
  svg_tags = soup.find_all('svg')
146
  for svg in svg_tags:
147
+ # Look for embedded images in SVG
148
  image_elements = svg.find_all('image')
149
  for img in image_elements:
150
  href = img.get('href') or img.get('xlink:href')
151
  if href and not href.startswith('data:'):
152
+ absolute_url = urljoin(url, href)
153
+ image_urls.add(absolute_url)
154
+
155
+ # 9. Shopify specific selectors
156
+ shopify_selectors = [
157
+ '[data-bgset]', '[data-widths]', '.hero__image', '.banner__media img',
158
+ '.card__media img', '.product__media img', '[data-shopify]'
159
+ ]
160
 
161
+ for selector in shopify_selectors:
162
+ try:
163
+ elements = soup.select(selector)
164
+ for element in elements:
165
+ for attr in ['src', 'data-src', 'data-bgset', 'data-widths', 'srcset']:
166
+ value = element.get(attr)
167
+ if value and not value.startswith('data:'):
168
+ if 'bgset' in attr or 'widths' in attr or 'srcset' in attr:
169
+ # Parse complex attribute formats
170
+ import re
171
+ urls = re.findall(r'https?://[^\s,]+', value)
172
+ for found_url in urls:
173
+ image_urls.add(found_url)
174
+ else:
175
+ absolute_url = urljoin(url, value)
176
+ image_urls.add(absolute_url)
177
+ except:
178
+ continue
179
+
180
+ # 10. Look for JSON-LD structured data
181
+ json_scripts = soup.find_all('script', {'type': 'application/ld+json'})
182
+ for script in json_scripts:
183
+ try:
184
+ import json
185
+ data = json.loads(script.string)
186
+ json_str = json.dumps(data)
187
+ import re
188
+ urls = re.findall(r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif|webp|svg)', json_str, re.IGNORECASE)
189
+ image_urls.update(urls)
190
+ except:
191
+ continue
192
+
193
+ # Filter out obviously invalid URLs and convert to list
194
  valid_image_urls = []
195
  for img_url in image_urls:
196
+ if img_url and len(img_url) > 10 and not img_url.startswith('data:'):
197
+ # Basic validation - must look like a URL
198
+ if '.' in img_url and ('http' in img_url or img_url.startswith('//')):
199
+ valid_image_urls.append(img_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  return valid_image_urls
202