Spaces:

PlaceHolderOrg
/

SwapMe

Running

File size: 10,829 Bytes

"""
Media Handler Module

This module determines what type of file a URL points to (image, video, GIF, Instagram link, etc.)
and prepares it for processing.

For Non-Technical Developers:
- Figures out what kind of media the user gave us (image vs video vs Instagram link)
- Downloads the media from the internet or extracts it from Instagram
- Validates that it's something we can actually process
- Returns the media in a format our face-swapping AI can use
"""

import os
import io
import requests
import cv2
import numpy as np
from PIL import Image
from urllib.parse import urlparse
from src.config import (
    DEFAULT_HEADERS, INSTAGRAM_HEADERS, DOWNLOAD_TIMEOUT, SUPPORTED_FORMATS,
    SUPPORTED_IMAGE_FORMATS, SUPPORTED_VIDEO_FORMATS, SUPPORTED_GIF_FORMATS,
    ERROR_MESSAGES
)
from src.logger import debug_log

# ==================== HELPER FUNCTIONS ====================

def get_file_extension_from_url(url: str) -> str:
    """
    Return the lower-cased file extension found in a URL's path.

    Only the path component is inspected, so query strings and fragments
    never influence the result.

    Example: "https://example.com/Photo.JPG?size=large" -> ".jpg"

    Args:
        url: The web address to inspect

    Returns:
        The extension including its leading dot (like .jpg, .mp4, .gif),
        or an empty string when the path has no extension
    """
    url_path = urlparse(url).path.lower()

    # splitext already yields '' when the last path segment has no dot
    _, extension = os.path.splitext(url_path)
    return extension


def is_instagram_url(url: str) -> bool:
    """
    Check if a URL points to an Instagram page/post/reel URL.

    This should only match Instagram page domains, not CDN or media delivery hosts.

    Args:
        url: The web address to check

    Returns:
        True when the URL's host is one of the known Instagram page domains
        (an explicit port, user-info prefix or trailing dot is ignored),
        False otherwise - including when the URL has no host at all
    """
    instagram_page_domains = {
        'instagram.com',
        'www.instagram.com',
        'm.instagram.com',
        'l.instagram.com',
        'instagr.am',
        'www.instagr.am',
        'ig.me',
        'www.ig.me',
    }

    # Use `hostname` rather than `netloc`: netloc still carries "user@" and
    # ":port", so "www.instagram.com:443" would never match the set above.
    # `hostname` is already lower-cased and is None when no host is present.
    host = (urlparse(url).hostname or '').rstrip('.')

    return host in instagram_page_domains


def is_instagram_cdn_url(url: str) -> bool:
    """
    Detect Instagram CDN/media delivery hosts.

    This helps when the user supplies a direct Instagram CDN URL such as
    scontent-iad6-1.cdninstagram.com or cdninstagram.com.

    Hosts are matched on domain boundaries (the domain itself or one of its
    subdomains) instead of by substring, so look-alike hosts such as
    "notinstagram.com" or "instagram.com.evil.example" are NOT treated as
    Instagram and never receive Instagram-specific request headers.

    Args:
        url: The web address to check

    Returns:
        True when the host belongs to cdninstagram.com or instagram.com, or is
        a "scontent*" host under fbcdn.net; False otherwise
    """
    instagram_domains = ('cdninstagram.com', 'instagram.com')

    # `hostname` strips any port / user-info and is already lower-cased.
    host = (urlparse(url).hostname or '').rstrip('.')

    if any(host == domain or host.endswith('.' + domain)
           for domain in instagram_domains):
        return True

    # NOTE(review): "scontent*" edge hosts are assumed to live under
    # fbcdn.net when they are not on cdninstagram.com - confirm that no other
    # parent domain needs to be accepted here.
    return host.startswith('scontent') and host.endswith('.fbcdn.net')


def detect_media_type(url: str) -> str:
    """
    Determine what type of media a URL points to.

    This is like asking: "Is this a photo, a video, a GIF, or an Instagram link?"

    The file extension is checked first. Only when the extension is not
    recognised is a HEAD request sent so the type can be inferred from the
    server's Content-Type header. That probe is best-effort: if it fails for
    any reason the result simply stays 'unknown'.

    Args:
        url: The web address or Instagram link

    Returns:
        One of: 'instagram', 'gif', 'webp', 'video', 'image', or 'unknown'
    """
    # Imported per call, as in the original layout of this module.
    from src.config import DEBUG_MODE

    # Check if it's Instagram first
    if is_instagram_url(url):
        if DEBUG_MODE:
            debug_log("[detect_media_type] Detected Instagram URL")
        return 'instagram'

    # Get file extension
    ext = get_file_extension_from_url(url)

    if DEBUG_MODE:
        debug_log(f"[detect_media_type] URL extension: {ext}")

    # Categorize based on extension. '.webp' is tested before the generic
    # video/image lists so it always gets its own dedicated type.
    if ext in SUPPORTED_GIF_FORMATS:
        media_type = 'gif'
    elif ext == '.webp':
        media_type = 'webp'
    elif ext in SUPPORTED_VIDEO_FORMATS:
        media_type = 'video'
    elif ext in SUPPORTED_IMAGE_FORMATS:
        media_type = 'image'
    else:
        media_type = 'unknown'

    # If the extension is unknown, attempt to infer from the remote content type.
    # (Instagram URLs have already returned above, so no re-check is needed.)
    if media_type == 'unknown':
        try:
            response = requests.head(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DOWNLOAD_TIMEOUT,
                allow_redirects=True
            )
            content_type = response.headers.get('Content-Type', '').lower()
        except Exception as exc:
            # Best-effort probe: never fail detection because of it, but
            # leave a trace so the failure is visible when debugging.
            if DEBUG_MODE:
                debug_log(f"[detect_media_type] Content-Type probe failed: {exc}")
        else:
            # 'gif' and 'image/webp' must be tested before the generic
            # 'image' match, otherwise they would be reported as 'image'.
            if 'video' in content_type:
                media_type = 'video'
            elif 'gif' in content_type:
                media_type = 'gif'
            elif 'image/webp' in content_type:
                media_type = 'webp'
            elif 'image' in content_type:
                media_type = 'image'

    if DEBUG_MODE:
        debug_log(f"[detect_media_type] Detected type: {media_type}")

    return media_type


# ==================== MEDIA DOWNLOADING ====================

def download_media_from_url(url: str) -> bytes:
    """
    Download a file from the internet.

    This function handles the boring stuff like headers, timeouts and
    redirects. It's like a smart downloader that pretends to be a browser.
    Instagram-specific headers are used for Instagram CDN hosts, the default
    headers for everything else. A single attempt is made (no retries) and the
    whole body is held in memory.

    Args:
        url: The web address to download from

    Returns:
        The file contents as binary data

    Raises:
        ValueError: If the download times out, the server answers with an
            HTTP error status, or the request fails for any other reason.
            The underlying ``requests`` exception is attached as the cause.
    """
    try:
        # Make the internet request with browser headers
        response = requests.get(
            url,
            headers=INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=False  # Download the entire file at once
        )
        response.raise_for_status()  # Raise an error if server returned an error

        return response.content

    # Order matters: Timeout and HTTPError are subclasses of RequestException,
    # so the specific handlers must come before the generic one.
    except requests.exceptions.Timeout as e:
        raise ValueError("Download timed out. The server took too long to respond.") from e
    except requests.exceptions.HTTPError as e:
        raise ValueError(f"Server returned error: {e.response.status_code}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Download failed: {ERROR_MESSAGES['download_failed']}") from e


def load_image_from_bytes(image_bytes: bytes) -> np.ndarray:
    """
    Decode raw image file bytes into an OpenCV-style BGR image.

    Args:
        image_bytes: The encoded image file contents (JPEG, PNG, ...)

    Returns:
        The decoded picture as a BGR numpy array

    Raises:
        ValueError: If the bytes cannot be decoded into a non-empty image.
            Every failure, whatever its origin, is reported with the
            "Failed to load image: ..." prefix.
    """
    try:
        # View the bytes as a flat uint8 buffer, then let OpenCV work out
        # which image format it is and decode it as a colour image.
        raw_buffer = np.frombuffer(image_bytes, np.uint8)
        decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)

        # imdecode signals failure by returning None rather than raising
        decoding_failed = decoded is None or decoded.size == 0
        if decoding_failed:
            raise ValueError("Image data is empty or invalid")

        return decoded

    except Exception as exc:
        # Deliberately wraps the ValueError raised above as well, so callers
        # always see the same message prefix.
        raise ValueError(f"Failed to load image: {exc!s}")


# ==================== VALIDATION ====================

def _get_request_succeeds(url: str, headers: dict) -> bool:
    """Return True if a streamed GET to ``url`` answers 200; never raises."""
    try:
        # stream=True avoids downloading the body just to read the status.
        # The `with` block closes the response so the connection is released
        # instead of being left open with an unread body.
        with requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=True
        ) as response:
            return response.status_code == 200
    except Exception:
        return False


def validate_url_accessibility(url: str) -> bool:
    """
    Check if we can actually reach the URL before processing.

    This is like knocking on the door before trying to download - saves time if the door is locked!

    A HEAD request is tried first. If it raises, or the server answers with a
    status that commonly means "HEAD not allowed" (400, 403, 405, 429), a
    streamed GET is tried as a fallback. Only a final status of 200 counts as
    accessible.

    Args:
        url: The URL to check

    Returns:
        True if URL is accessible, False otherwise. Instagram page URLs always
        return True because they are validated later during extraction.
    """
    if is_instagram_url(url):
        # Instagram may block HEAD requests, so skip direct accessibility checks.
        # Extraction will determine if the link is valid.
        return True

    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS

    try:
        response = requests.head(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True
        )
    except Exception:
        # Some hosts (including CDN links) reject HEAD requests but still allow GET.
        return _get_request_succeeds(url, headers)

    if response.status_code == 200:
        return True

    # Some servers reject HEAD requests even though GET works.
    if response.status_code in {400, 403, 405, 429}:
        return _get_request_succeeds(url, headers)

    return False


def validate_media_format(url: str) -> dict:
    """
    Decide whether a media URL is something we know how to process.

    Instagram page links are accepted straight away (they are handled by a
    separate path); every other URL is classified with detect_media_type and
    rejected only when the type comes back as 'unknown'.

    Args:
        url: The media URL to validate

    Returns:
        Dictionary with keys:
        - 'valid': True/False - can we process this?
        - 'media_type': What kind of media is it?
        - 'error': Error message if not valid, otherwise None
    """
    # Instagram is a special case: no type detection is run for it
    if is_instagram_url(url):
        kind = 'instagram'
    else:
        kind = detect_media_type(url)

    supported = kind != 'unknown'

    return {
        'valid': supported,
        'media_type': kind,
        'error': None if supported else ERROR_MESSAGES['unsupported_format'],
    }


# ==================== PUBLIC API ====================

def get_media_handler(url: str) -> dict:
    """
    Smart function that figures out what to do with a URL.

    This is your main entry point - give it a URL and it tells you what it is
    and what to do with it.

    The format is validated first, then reachability is checked exactly once
    (each check can cost one or two network round-trips, so it is not repeated).

    Args:
        url: The media URL provided by the user

    Returns:
        Dictionary with:
        - 'accessible': Can we reach this URL? (always True on return)
        - 'media_type': What kind of media? ('instagram' for Instagram links)
        - 'validation': Full validation results

    Raises:
        ValueError: If the format is unsupported or the URL is unreachable
    """
    # First, validate the format. Instagram links always validate
    # successfully, so no separate Instagram exemption is needed here.
    validation = validate_media_format(url)

    if not validation['valid']:
        raise ValueError(validation['error'])

    # Check if URL is actually accessible (Instagram links are waved through
    # inside validate_url_accessibility and verified later during extraction)
    if not validate_url_accessibility(url):
        raise ValueError(ERROR_MESSAGES['download_failed'])

    # For Instagram links validation['media_type'] is already 'instagram',
    # so a single return covers both cases.
    return {
        'accessible': True,
        'media_type': validation['media_type'],
        'validation': validation
    }