import streamlit as st import requests from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse import os import zipfile from io import BytesIO import time from PIL import Image import hashlib def is_valid_url(url): """Check if the provided URL is valid""" try: result = urlparse(url) return all([result.scheme, result.netloc]) except: return False def extract_css_background_images(css_content, base_url): """Extract background image URLs from CSS content""" import re image_urls = [] # Pattern to match background-image: url() declarations bg_pattern = r'background(?:-image)?\s*:\s*url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)' matches = re.findall(bg_pattern, css_content, re.IGNORECASE) for match in matches: if match and not match.startswith('data:'): absolute_url = urljoin(base_url, match) image_urls.append(absolute_url) return image_urls def get_image_urls(url): """Extract all image URLs from the given webpage using comprehensive methods""" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } response = requests.get(url, headers=headers, timeout=15) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') image_urls = set() # Use set to avoid duplicates # 1. Standard img tags with multiple attribute checks img_tags = soup.find_all('img') for img in img_tags: # Check multiple possible attributes for attr in ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-srcset', 'data-image', 'data-bg', 'data-background', 'data-large-image']: value = img.get(attr) if value and not value.startswith('data:'): # Handle srcset (multiple images) if 'srcset' in attr.lower() or ',' in value: urls = value.split(',') for url_part in urls: clean_url = url_part.strip().split(' ')[0] if clean_url: absolute_url = urljoin(url, clean_url) image_urls.add(absolute_url) else: absolute_url = urljoin(url, value) image_urls.add(absolute_url) # 2. Picture and source tags picture_tags = soup.find_all(['picture', 'source']) for tag in picture_tags: for attr in ['src', 'srcset', 'data-src', 'data-srcset']: value = tag.get(attr) if value and not value.startswith('data:'): if 'srcset' in attr.lower() or ',' in value: urls = value.split(',') for url_part in urls: clean_url = url_part.strip().split(' ')[0] if clean_url: absolute_url = urljoin(url, clean_url) image_urls.add(absolute_url) else: absolute_url = urljoin(url, value) image_urls.add(absolute_url) # 3. Divs and other elements with background images in style attribute all_elements = soup.find_all(attrs={'style': True}) for element in all_elements: style = element.get('style', '') if 'background' in style.lower() and 'url(' in style: import re bg_matches = re.findall(r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', style, re.IGNORECASE) for match in bg_matches: if match and not match.startswith('data:'): absolute_url = urljoin(url, match) image_urls.add(absolute_url) # 4. Elements with data attributes that might contain image URLs data_attrs = ['data-background-image', 'data-bg-src', 'data-hero-image', 'data-banner', 'data-slide-img', 'data-thumb', 'data-image-src', 'data-full-size', 'data-zoom-image', 'data-lightbox'] for attr in data_attrs: elements = soup.find_all(attrs={attr: True}) for element in elements: value = element.get(attr) if value and not value.startswith('data:'): absolute_url = urljoin(url, value) image_urls.add(absolute_url) # 5. CSS background images from