"""
Media Handler Module

This module determines what type of file a URL points to (image, video, GIF,
Instagram link, etc.) and prepares it for processing.

For Non-Technical Developers:
- Figures out what kind of media the user gave us (image vs video vs Instagram link)
- Downloads the media from the internet or extracts it from Instagram
- Validates that it's something we can actually process
- Returns the media in a format our face-swapping AI can use
"""

import os
import io
import requests
import cv2
import numpy as np
from PIL import Image
from urllib.parse import urlparse

from src.config import (
    DEFAULT_HEADERS,
    INSTAGRAM_HEADERS,
    DOWNLOAD_TIMEOUT,
    SUPPORTED_FORMATS,
    SUPPORTED_IMAGE_FORMATS,
    SUPPORTED_VIDEO_FORMATS,
    SUPPORTED_GIF_FORMATS,
    ERROR_MESSAGES
)
from src.logger import debug_log


# ==================== HELPER FUNCTIONS ====================

def _hostname(url: str) -> str:
    """Return the lower-cased host of *url* without port or userinfo ('' if absent)."""
    return urlparse(url).hostname or ''


def get_file_extension_from_url(url: str) -> str:
    """
    Extract the file extension from a URL.

    Only the path component is inspected, so query strings and fragments are
    ignored. The result is lower-cased.

    Example: "https://example.com/image.jpg" -> ".jpg"

    Args:
        url: The web address we're downloading from

    Returns:
        The file extension (like .jpg, .mp4, .gif) or empty string if not found
    """
    # splitext already yields '' when the last path segment has no extension.
    return os.path.splitext(urlparse(url).path.lower())[1]


def is_instagram_url(url: str) -> bool:
    """
    Check if a URL points to an Instagram page/post/reel URL.

    This should only match Instagram page domains, not CDN or media delivery
    hosts. The comparison uses the host name only, so an explicit port
    (e.g. "www.instagram.com:443") does not prevent a match.

    Args:
        url: The URL to check

    Returns:
        True if the host is one of the known Instagram page domains
    """
    instagram_page_domains = {
        'instagram.com',
        'www.instagram.com',
        'm.instagram.com',
        'l.instagram.com',
        'instagr.am',
        'www.instagr.am',
        'ig.me',
        'www.ig.me',
    }
    return _hostname(url) in instagram_page_domains


def is_instagram_cdn_url(url: str) -> bool:
    """
    Detect Instagram CDN/media delivery hosts.

    This helps when the user supplies a direct Instagram CDN URL such as
    scontent-iad6-1.cdninstagram.com or cdninstagram.com.

    Args:
        url: The URL to check

    Returns:
        True if the host name contains one of the known CDN markers
    """
    domain = _hostname(url)
    # NOTE(review): this is a substring match, so it also returns True for
    # regular instagram.com page hosts and for any host that merely contains
    # "scontent". It is only used to choose request headers; tighten it if it
    # is ever used for a trust decision.
    cdn_markers = ('cdninstagram.com', 'scontent', 'instagram.com')
    return any(marker in domain for marker in cdn_markers)


def _media_type_from_extension(ext: str) -> str:
    """Map a lower-cased file extension to a media type, or 'unknown'."""
    if ext in SUPPORTED_GIF_FORMATS:
        return 'gif'
    if ext == '.webp':
        return 'webp'
    if ext in SUPPORTED_VIDEO_FORMATS:
        return 'video'
    if ext in SUPPORTED_IMAGE_FORMATS:
        return 'image'
    return 'unknown'


def _media_type_from_content_type(content_type: str) -> str:
    """Map a lower-cased Content-Type header value to a media type, or 'unknown'."""
    if 'video' in content_type:
        return 'video'
    if 'gif' in content_type:
        return 'gif'
    if 'image/webp' in content_type:
        return 'webp'
    if 'image' in content_type:
        return 'image'
    return 'unknown'


def detect_media_type(url: str) -> str:
    """
    Determine what type of media a URL points to.

    This is like asking: "Is this a photo, a video, a GIF, or an Instagram link?"
    The file extension is checked first; if it is not recognised, a HEAD
    request is made and the Content-Type header is used instead.

    Args:
        url: The web address or Instagram link

    Returns:
        One of: 'instagram', 'gif', 'webp', 'video', 'image', or 'unknown'
    """
    # Imported at call time so the current value of the flag is used.
    from src.config import DEBUG_MODE

    # Check if it's Instagram first
    if is_instagram_url(url):
        if DEBUG_MODE:
            debug_log("[detect_media_type] Detected Instagram URL")
        return 'instagram'

    # Get file extension
    ext = get_file_extension_from_url(url)
    if DEBUG_MODE:
        debug_log(f"[detect_media_type] URL extension: {ext}")

    # Categorize based on extension
    media_type = _media_type_from_extension(ext)

    # If the extension is unknown, attempt to infer from the remote content type.
    if media_type == 'unknown':
        try:
            response = requests.head(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DOWNLOAD_TIMEOUT,
                allow_redirects=True
            )
            content_type = response.headers.get('Content-Type', '').lower()
            media_type = _media_type_from_content_type(content_type)
        except Exception as exc:
            # Best effort only: a failed probe leaves the type as 'unknown',
            # but the reason is recorded instead of being silently dropped.
            if DEBUG_MODE:
                debug_log(f"[detect_media_type] Content-Type probe failed: {exc}")

    if DEBUG_MODE:
        debug_log(f"[detect_media_type] Detected type: {media_type}")
    return media_type


# ==================== MEDIA DOWNLOADING ====================

def download_media_from_url(url: str) -> bytes:
    """
    Download a file from the internet.

    This function handles the boring stuff like headers, timeouts and
    redirects. It's like a smart downloader that pretends to be a browser.

    Args:
        url: The web address to download from

    Returns:
        The file contents as binary data

    Raises:
        ValueError: If download fails (timeout, HTTP error status, or any
            other request failure); the original exception is chained
    """
    try:
        # Make the internet request with browser headers
        response = requests.get(
            url,
            headers=INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=False  # Download the entire file at once
        )
        response.raise_for_status()  # Raise an error if server returned an error
        return response.content
    except requests.exceptions.Timeout as e:
        raise ValueError("Download timed out. The server took too long to respond.") from e
    except requests.exceptions.HTTPError as e:
        raise ValueError(f"Server returned error: {e.response.status_code}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Download failed: {ERROR_MESSAGES['download_failed']}") from e


def load_image_from_bytes(image_bytes: bytes) -> np.ndarray:
    """
    Convert downloaded image bytes into a format the AI can understand
    (OpenCV BGR image).

    Args:
        image_bytes: The raw image file data

    Returns:
        The image as a BGR numpy array (that's how OpenCV likes images)

    Raises:
        ValueError: If image data is invalid
    """
    try:
        # Convert bytes to numpy array
        arr = np.frombuffer(image_bytes, np.uint8)
        # Decode as image (OpenCV will auto-detect the format)
        bgr_image = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    except Exception as e:
        raise ValueError(f"Failed to load image: {e}") from e

    # imdecode signals an undecodable buffer by returning None, not by raising.
    if bgr_image is None or bgr_image.size == 0:
        raise ValueError("Failed to load image: Image data is empty or invalid")
    return bgr_image


# ==================== VALIDATION ====================

def _get_returns_ok(url: str, headers: dict) -> bool:
    """Return True if a streamed GET of *url* answers 200; never raises."""
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=True
        )
    except Exception:
        return False
    try:
        return response.status_code == 200
    finally:
        # With stream=True the body is never read, so the connection is only
        # released when the response is closed explicitly.
        response.close()


def validate_url_accessibility(url: str) -> bool:
    """
    Check if we can actually reach the URL before processing.

    This is like knocking on the door before trying to download - saves time
    if the door is locked! A HEAD request is tried first; if it fails or is
    rejected with 400/403/405/429, a streamed GET is tried instead.

    Args:
        url: The URL to check

    Returns:
        True if URL is accessible, False otherwise
    """
    if is_instagram_url(url):
        # Instagram may block HEAD requests, so skip direct accessibility checks.
        # Extraction will determine if the link is valid.
        return True

    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS
    try:
        response = requests.head(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True
        )
    except Exception:
        # Some hosts (including CDN links) reject HEAD requests but still allow GET.
        return _get_returns_ok(url, headers)

    if response.status_code == 200:
        return True
    # Some servers reject HEAD requests even though GET works.
    if response.status_code in {400, 403, 405, 429}:
        return _get_returns_ok(url, headers)
    return False


def validate_media_format(url: str) -> dict:
    """
    Check if a media URL is in a format we support.

    Args:
        url: The media URL to validate

    Returns:
        Dictionary with keys:
        - 'valid': True/False - can we process this?
        - 'media_type': What kind of media is it?
        - 'error': Error message if not valid
    """
    # Check if it's Instagram (special case - we'll handle it separately)
    if is_instagram_url(url):
        return {
            'valid': True,
            'media_type': 'instagram',
            'error': None
        }

    # For other URLs, detect the type (file extension, then Content-Type)
    media_type = detect_media_type(url)
    if media_type == 'unknown':
        return {
            'valid': False,
            'media_type': 'unknown',
            'error': ERROR_MESSAGES['unsupported_format']
        }

    return {
        'valid': True,
        'media_type': media_type,
        'error': None
    }


# ==================== PUBLIC API ====================

def get_media_handler(url: str) -> dict:
    """
    Smart function that figures out what to do with a URL.

    This is your main entry point - give it a URL and it tells you what it is
    and what to do with it.

    Args:
        url: The media URL provided by the user

    Returns:
        Dictionary with:
        - 'accessible': Can we reach this URL?
        - 'media_type': What kind of media?
        - 'validation': Full validation results

    Raises:
        ValueError: If the format is unsupported or the URL is unreachable
    """
    # First, validate the format (Instagram links always validate as 'instagram')
    validation = validate_media_format(url)
    if not validation['valid']:
        raise ValueError(validation['error'])

    # Check if URL is actually accessible. This is done once: each check costs
    # at least one network round trip.
    if not validate_url_accessibility(url):
        raise ValueError(ERROR_MESSAGES['download_failed'])

    return {
        'accessible': True,
        'media_type': validation['media_type'],
        'validation': validation
    }