Spaces:
Running
Running
| """ | |
| Media Handler Module | |
| This module determines what type of file a URL points to (image, video, GIF, Instagram link, etc.) | |
| and prepares it for processing. | |
| For Non-Technical Developers: | |
| - Figures out what kind of media the user gave us (image vs video vs Instagram link) | |
| - Downloads the media from the internet or extracts it from Instagram | |
| - Validates that it's something we can actually process | |
| - Returns the media in a format our face-swapping AI can use | |
| """ | |
| import os | |
| import io | |
| import requests | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| from urllib.parse import urlparse | |
| from src.config import ( | |
| DEFAULT_HEADERS, INSTAGRAM_HEADERS, DOWNLOAD_TIMEOUT, SUPPORTED_FORMATS, | |
| SUPPORTED_IMAGE_FORMATS, SUPPORTED_VIDEO_FORMATS, SUPPORTED_GIF_FORMATS, | |
| ERROR_MESSAGES | |
| ) | |
| from src.logger import debug_log | |
| # ==================== HELPER FUNCTIONS ==================== | |
def get_file_extension_from_url(url: str) -> str:
    """
    Return the lower-cased file extension found in the path of a URL.

    Only the path component is inspected, so query strings and fragments
    are ignored.
    Example: "https://example.com/image.jpg" -> ".jpg"

    Args:
        url: The web address we're downloading from

    Returns:
        The extension including the leading dot (like .jpg, .mp4, .gif),
        or an empty string when the last path segment has none
    """
    url_path = urlparse(url).path.lower()
    # splitext already yields '' when the final segment carries no extension.
    _, extension = os.path.splitext(url_path)
    return extension
def is_instagram_url(url: str) -> bool:
    """
    Check if a URL points to an Instagram page/post/reel URL.

    Only the exact Instagram page domains listed below match; CDN or media
    delivery hosts (for example *.cdninstagram.com) do not.

    The comparison uses the parsed hostname rather than the raw network
    location, so an explicit port ("www.instagram.com:443"), userinfo
    ("user@instagram.com") or a trailing root dot does not prevent a match.

    Args:
        url: The URL to inspect

    Returns:
        True if the URL's host is one of the known Instagram page domains,
        False otherwise (including when the URL has no host at all)
    """
    instagram_page_domains = frozenset({
        'instagram.com',
        'www.instagram.com',
        'm.instagram.com',
        'l.instagram.com',
        'instagr.am',
        'www.instagr.am',
        'ig.me',
        'www.ig.me',
    })
    # `hostname` is already lower-cased and stripped of port/credentials;
    # it is None when the URL has no network location.
    host = (urlparse(url).hostname or '').rstrip('.')
    return host in instagram_page_domains
def is_instagram_cdn_url(url: str) -> bool:
    """
    Detect Instagram CDN/media delivery hosts.

    This helps when the user supplies a direct Instagram CDN URL such as
    scontent-iad6-1.cdninstagram.com or cdninstagram.com. Hosts under
    instagram.com are matched as well, so Instagram page URLs also return
    True here.

    Matching is done on the parsed hostname at label boundaries instead of
    a substring search over the raw network location. That way credentials
    in the URL ("https://instagram.com@evil.example/") and look-alike
    domains ("notinstagram.com") are not mistaken for Instagram hosts.

    Args:
        url: The URL to inspect

    Returns:
        True if the host is cdninstagram.com / instagram.com (or a subdomain
        of either), or has a label beginning with "scontent"; False otherwise
    """
    host = (urlparse(url).hostname or '').rstrip('.')
    if not host:
        return False

    for base_domain in ('cdninstagram.com', 'instagram.com'):
        if host == base_domain or host.endswith('.' + base_domain):
            return True

    # "scontent..." labels are kept as a marker regardless of the parent
    # domain, mirroring the original's domain-agnostic 'scontent' check
    # (now anchored to the start of a label).
    return any(label.startswith('scontent') for label in host.split('.'))
def detect_media_type(url: str) -> str:
    """
    Determine what type of media a URL points to.

    This is like asking: "Is this a photo, a video, a GIF, or an Instagram link?"
    The decision is made in three steps:
      1. Instagram page URLs are reported as 'instagram' immediately.
      2. Otherwise the file extension in the URL path is categorized.
      3. If the extension is not recognised, a HEAD request is sent and the
         Content-Type response header is used as a fallback hint.

    Args:
        url: The web address or Instagram link

    Returns:
        One of: 'instagram', 'gif', 'webp', 'video', 'image', or 'unknown'
    """
    # Imported at call time, as in the original code.
    # NOTE(review): presumably so the current DEBUG_MODE value is read on
    # every call or to avoid an import cycle - confirm before hoisting.
    from src.config import DEBUG_MODE

    # Check if it's Instagram first
    if is_instagram_url(url):
        if DEBUG_MODE:
            debug_log("[detect_media_type] Detected Instagram URL")
        return 'instagram'

    # Get file extension
    ext = get_file_extension_from_url(url)
    if DEBUG_MODE:
        debug_log(f"[detect_media_type] URL extension: {ext}")

    # Categorize based on extension
    if ext in SUPPORTED_GIF_FORMATS:
        media_type = 'gif'
    elif ext == '.webp':
        media_type = 'webp'
    elif ext in SUPPORTED_VIDEO_FORMATS:
        media_type = 'video'
    elif ext in SUPPORTED_IMAGE_FORMATS:
        media_type = 'image'
    else:
        media_type = 'unknown'

    # If the extension is unknown, attempt to infer from the remote content type.
    # (Instagram URLs already returned above, so no second check is needed.)
    if media_type == 'unknown':
        try:
            response = requests.head(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DOWNLOAD_TIMEOUT,
                allow_redirects=True
            )
            content_type = response.headers.get('Content-Type', '').lower()
        except Exception as exc:
            # The probe is best-effort: a failure leaves the type 'unknown',
            # but it is surfaced in debug output instead of vanishing.
            if DEBUG_MODE:
                debug_log(f"[detect_media_type] Content-Type probe failed: {exc}")
        else:
            # Order matters: 'gif' and 'image/webp' must be tested before the
            # generic 'image' check, which would match them too.
            if 'video' in content_type:
                media_type = 'video'
            elif 'gif' in content_type:
                media_type = 'gif'
            elif 'image/webp' in content_type:
                media_type = 'webp'
            elif 'image' in content_type:
                media_type = 'image'

    if DEBUG_MODE:
        debug_log(f"[detect_media_type] Detected type: {media_type}")
    return media_type
| # ==================== MEDIA DOWNLOADING ==================== | |
def download_media_from_url(url: str) -> bytes:
    """
    Download a file from the internet.

    This function handles the boring stuff like headers, timeouts and
    redirects. It's like a smart downloader that pretends to be a browser.
    A single GET request is made; no retry is attempted on failure.
    Instagram-specific headers are used when the URL is recognised by
    is_instagram_cdn_url, the default headers otherwise.

    Args:
        url: The web address to download from

    Returns:
        The file contents as binary data

    Raises:
        ValueError: If the request times out, the server answers with an
            HTTP error status, or the request fails for any other reason.
            The underlying requests exception is attached as the cause.
    """
    try:
        # Make the internet request with browser headers
        response = requests.get(
            url,
            headers=INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=False  # Download the entire file at once
        )
        response.raise_for_status()  # Raise an error if server returned an error
        return response.content
    except requests.exceptions.Timeout as e:
        raise ValueError("Download timed out. The server took too long to respond.") from e
    except requests.exceptions.HTTPError as e:
        # `response` can be missing on an HTTPError that was not produced by
        # raise_for_status(); avoid leaking an AttributeError in that case.
        status_code = getattr(e.response, 'status_code', 'unknown')
        raise ValueError(f"Server returned error: {status_code}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Download failed: {ERROR_MESSAGES['download_failed']}") from e
def load_image_from_bytes(image_bytes: bytes) -> np.ndarray:
    """
    Decode raw image file bytes into an OpenCV image array.

    Args:
        image_bytes: The raw image file data

    Returns:
        The decoded image as produced by cv2.imdecode with IMREAD_COLOR
        (OpenCV's BGR channel layout)

    Raises:
        ValueError: If decoding fails or yields an empty image. Any error
            raised while decoding is reported through this exception type.
    """
    try:
        # View the bytes as a flat uint8 buffer so OpenCV can sniff the format.
        raw_buffer = np.frombuffer(image_bytes, np.uint8)
        decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)
        if decoded is not None and decoded.size != 0:
            return decoded
        # Raised inside the try on purpose: it is re-wrapped by the handler
        # below, exactly like any other decoding failure.
        raise ValueError("Image data is empty or invalid")
    except Exception as err:
        raise ValueError(f"Failed to load image: {str(err)}")
| # ==================== VALIDATION ==================== | |
def _is_reachable_via_get(url: str, headers: dict) -> bool:
    """Return True if a streamed GET request for *url* answers with HTTP 200."""
    try:
        # stream=True avoids downloading the body; the context manager closes
        # the response so the underlying connection is released.
        with requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=True
        ) as response:
            return response.status_code == 200
    except Exception:
        return False


def validate_url_accessibility(url: str) -> bool:
    """
    Check if we can actually reach the URL before processing.

    This is like knocking on the door before trying to download - saves time
    if the door is locked! A HEAD request is tried first; if it raises, or
    the server answers 400/403/405/429, a streamed GET is tried instead.

    Args:
        url: The URL to check

    Returns:
        True if URL is accessible (HTTP 200), False otherwise.
        Instagram page URLs always return True without any request.
    """
    if is_instagram_url(url):
        # Instagram may block HEAD requests, so skip direct accessibility checks.
        # Extraction will determine if the link is valid.
        return True

    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS

    try:
        response = requests.head(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True
        )
    except Exception:
        # Some hosts (including CDN links) reject HEAD requests but still allow GET.
        return _is_reachable_via_get(url, headers)

    if response.status_code == 200:
        return True

    # Some servers reject HEAD requests even though GET works.
    if response.status_code in {400, 403, 405, 429}:
        return _is_reachable_via_get(url, headers)

    return False
def validate_media_format(url: str) -> dict:
    """
    Report whether a media URL is of a kind we can process.

    Instagram page URLs are accepted directly; every other URL is classified
    with detect_media_type and rejected only when the result is 'unknown'.

    Args:
        url: The media URL to validate

    Returns:
        Dictionary with keys:
        - 'valid': True/False - can we process this?
        - 'media_type': What kind of media is it?
        - 'error': Error message if not valid, otherwise None
    """
    # Instagram links are a special case handled by a separate pipeline,
    # so they bypass type detection entirely.
    if is_instagram_url(url):
        kind = 'instagram'
    else:
        kind = detect_media_type(url)

    supported = kind != 'unknown'
    return {
        'valid': supported,
        'media_type': kind,
        'error': None if supported else ERROR_MESSAGES['unsupported_format'],
    }
| # ==================== PUBLIC API ==================== | |
def get_media_handler(url: str) -> dict:
    """
    Smart function that figures out what to do with a URL.

    This is your main entry point - give it a URL and it tells you what it is
    and what to do with it. The format is validated first, then the URL is
    probed for accessibility exactly once.

    Args:
        url: The media URL provided by the user

    Returns:
        Dictionary with:
        - 'accessible': Always True on return (unreachable URLs raise instead)
        - 'media_type': What kind of media? ('instagram' for Instagram links)
        - 'validation': Full validation results from validate_media_format

    Raises:
        ValueError: If the format is unsupported or the URL is unreachable
    """
    # First, validate the format
    validation = validate_media_format(url)
    instagram = is_instagram_url(url)
    if not validation['valid'] and not instagram:
        raise ValueError(validation['error'])

    # Check if URL is actually accessible. This is done a single time: each
    # check costs at least one network round trip, and a second probe adds
    # no information.
    if not validate_url_accessibility(url):
        raise ValueError(ERROR_MESSAGES['download_failed'])

    return {
        'accessible': True,
        'media_type': 'instagram' if instagram else validation['media_type'],
        'validation': validation
    }