SwapMe / src /media_handler.py
Help
fix webp generation and support direct ig links
27816c1
"""
Media Handler Module
This module determines what type of file a URL points to (image, video, GIF, Instagram link, etc.)
and prepares it for processing.
For Non-Technical Developers:
- Figures out what kind of media the user gave us (image vs video vs Instagram link)
- Downloads the media from the internet or extracts it from Instagram
- Validates that it's something we can actually process
- Returns the media in a format our face-swapping AI can use
"""
import os
import io
import requests
import cv2
import numpy as np
from PIL import Image
from urllib.parse import urlparse
from src.config import (
DEFAULT_HEADERS, INSTAGRAM_HEADERS, DOWNLOAD_TIMEOUT, SUPPORTED_FORMATS,
SUPPORTED_IMAGE_FORMATS, SUPPORTED_VIDEO_FORMATS, SUPPORTED_GIF_FORMATS,
ERROR_MESSAGES
)
from src.logger import debug_log
# ==================== HELPER FUNCTIONS ====================
def get_file_extension_from_url(url: str) -> str:
    """
    Return the lower-cased file extension found in the path part of a URL.

    Only the path component is inspected, so query strings and fragments
    never influence the result.
    Example: "https://example.com/Image.JPG?size=large" -> ".jpg"

    Args:
        url: The web address we're downloading from
    Returns:
        The extension including its leading dot (like .jpg, .mp4, .gif),
        or an empty string when the path has no extension
    """
    url_path = urlparse(url).path.lower()
    # splitext already yields '' when the last path segment has no dot,
    # so no separate "is there a dot?" check is required.
    _, extension = os.path.splitext(url_path)
    return extension
def is_instagram_url(url: str) -> bool:
    """
    Check if a URL points to an Instagram page/post/reel URL.

    This should only match Instagram page domains, not CDN or media
    delivery hosts. The comparison is made against the URL's host name
    only, so an explicit port ("www.instagram.com:443"), embedded
    credentials, or a trailing dot do not prevent a match.

    Args:
        url: The web address to classify
    Returns:
        True when the host is one of the known Instagram page domains
    """
    instagram_page_domains = {
        'instagram.com',
        'www.instagram.com',
        'm.instagram.com',
        'l.instagram.com',
        'instagr.am',
        'www.instagr.am',
        'ig.me',
        'www.ig.me',
    }
    # `hostname` is already lower-cased and excludes port/userinfo, unlike
    # `netloc`; it is None when the URL has no network location at all.
    host = urlparse(url).hostname or ''
    return host.rstrip('.') in instagram_page_domains
def is_instagram_cdn_url(url: str) -> bool:
    """
    Detect Instagram CDN/media delivery hosts.

    Useful when the user supplies a direct Instagram CDN URL such as
    scontent-iad6-1.cdninstagram.com or cdninstagram.com.

    The test is a plain substring match on the URL's network location, so
    any host containing one of the markers below is reported as a match
    (this includes regular instagram.com page hosts).

    Args:
        url: The web address to classify
    Returns:
        True when the network location contains one of the known markers
    """
    network_location = urlparse(url).netloc.lower()
    for marker in ('cdninstagram.com', 'scontent', 'instagram.com'):
        if marker in network_location:
            return True
    return False
def detect_media_type(url: str) -> str:
    """
    Determine what type of media a URL points to.

    This is like asking: "Is this a photo, a video, a GIF, or an Instagram link?"
    The decision is made from the URL's file extension first; when the
    extension is not recognised, a HEAD request is sent and the response's
    Content-Type header is used instead.

    Args:
        url: The web address or Instagram link
    Returns:
        One of: 'instagram', 'gif', 'webp', 'video', 'image', or 'unknown'
    """
    # Imported at call time, as in the original code
    # (presumably so the current setting is read on every call - TODO confirm).
    from src.config import DEBUG_MODE

    # Instagram page links are handled by a dedicated extractor, so they
    # short-circuit every other check.
    if is_instagram_url(url):
        if DEBUG_MODE:
            debug_log("[detect_media_type] Detected Instagram URL")
        return 'instagram'

    ext = get_file_extension_from_url(url)
    if DEBUG_MODE:
        debug_log(f"[detect_media_type] URL extension: {ext}")

    # Categorize based on extension. The checks are ordered: GIF first,
    # then the explicit '.webp' case, then video, then still images.
    if ext in SUPPORTED_GIF_FORMATS:
        media_type = 'gif'
    elif ext == '.webp':
        media_type = 'webp'
    elif ext in SUPPORTED_VIDEO_FORMATS:
        media_type = 'video'
    elif ext in SUPPORTED_IMAGE_FORMATS:
        media_type = 'image'
    else:
        media_type = 'unknown'

    # If the extension is unknown, attempt to infer from the remote content type.
    # (Instagram URLs have already returned above, so no re-check is needed.)
    if media_type == 'unknown':
        try:
            response = requests.head(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DOWNLOAD_TIMEOUT,
                allow_redirects=True
            )
            content_type = response.headers.get('Content-Type', '').lower()
            # 'gif' and 'image/webp' must be tested before the generic
            # 'image' match, otherwise they would be reported as 'image'.
            if 'video' in content_type:
                media_type = 'video'
            elif 'gif' in content_type:
                media_type = 'gif'
            elif 'image/webp' in content_type:
                media_type = 'webp'
            elif 'image' in content_type:
                media_type = 'image'
        except Exception as exc:
            # The probe is best-effort: a failure leaves the type as
            # 'unknown', but it is reported in debug mode instead of
            # disappearing silently.
            if DEBUG_MODE:
                debug_log(f"[detect_media_type] Content-Type probe failed: {exc}")

    if DEBUG_MODE:
        debug_log(f"[detect_media_type] Detected type: {media_type}")
    return media_type
# ==================== MEDIA DOWNLOADING ====================
def download_media_from_url(url: str) -> bytes:
    """
    Download a file from the internet.

    This function handles the boring stuff like headers, timeouts and
    redirects. It's like a smart downloader that pretends to be a browser.
    Instagram-specific headers are sent when the URL looks like an
    Instagram/CDN host; default headers are sent otherwise.

    Args:
        url: The web address to download from
    Returns:
        The file contents as binary data
    Raises:
        ValueError: If the download times out, the server answers with an
            HTTP error status, or the request fails for any other reason.
            The underlying requests exception is attached as __cause__.
    """
    try:
        # Make the internet request with browser headers
        response = requests.get(
            url,
            headers=INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=False  # Download the entire file at once
        )
        response.raise_for_status()  # Raise an error if server returned an error
        return response.content
    # Timeout and HTTPError are handled before the generic RequestException
    # clause so that each gets its own, more specific message.
    except requests.exceptions.Timeout as e:
        raise ValueError("Download timed out. The server took too long to respond.") from e
    except requests.exceptions.HTTPError as e:
        raise ValueError(f"Server returned error: {e.response.status_code}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Download failed: {ERROR_MESSAGES['download_failed']}") from e
def load_image_from_bytes(image_bytes: bytes) -> np.ndarray:
    """
    Convert downloaded image bytes into a format the AI can understand (OpenCV BGR image).

    Args:
        image_bytes: The raw image file data
    Returns:
        The image as a BGR numpy array (that's how OpenCV likes images)
    Raises:
        ValueError: If the data is empty, cannot be decoded as an image,
            or the decoder itself fails. Decoder failures keep the
            original exception attached as __cause__.
    """
    try:
        # Convert bytes to numpy array
        arr = np.frombuffer(image_bytes, np.uint8)
        # Skip the decoder for an empty buffer so the caller receives the
        # plain "empty or invalid" message below rather than a wrapped
        # low-level decoder error.
        # Decode as image (OpenCV will auto-detect the format)
        bgr_image = cv2.imdecode(arr, cv2.IMREAD_COLOR) if arr.size else None
    except Exception as e:
        raise ValueError(f"Failed to load image: {str(e)}") from e

    # Checked outside the try block so this error is not caught and
    # re-wrapped by the handler above.
    if bgr_image is None or bgr_image.size == 0:
        raise ValueError("Failed to load image: Image data is empty or invalid")
    return bgr_image
# ==================== VALIDATION ====================
def _get_request_succeeds(url: str, headers: dict) -> bool:
    """Return True when a streamed GET of *url* answers with HTTP 200."""
    try:
        # stream=True avoids downloading the body; the context manager
        # closes the response so the connection is released.
        with requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True,
            stream=True
        ) as response:
            return response.status_code == 200
    except Exception:
        return False


def validate_url_accessibility(url: str) -> bool:
    """
    Check if we can actually reach the URL before processing.

    This is like knocking on the door before trying to download - saves time if the door is locked!
    A HEAD request is tried first. If it raises, or is answered with a
    status that commonly means "HEAD not allowed here" (400, 403, 405, 429),
    a streamed GET is tried as a fallback.

    Args:
        url: The URL to check
    Returns:
        True if URL is accessible (HTTP 200), False otherwise.
        Instagram page URLs always return True without any request.
    """
    if is_instagram_url(url):
        # Instagram may block HEAD requests, so skip direct accessibility checks.
        # Extraction will determine if the link is valid.
        return True

    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS
    try:
        with requests.head(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            allow_redirects=True
        ) as response:
            status = response.status_code
    except Exception:
        # Some hosts (including CDN links) reject HEAD requests but still allow GET.
        return _get_request_succeeds(url, headers)

    if status == 200:
        return True
    # Some servers reject HEAD requests even though GET works.
    if status in {400, 403, 405, 429}:
        return _get_request_succeeds(url, headers)
    return False
def validate_media_format(url: str) -> dict:
    """
    Report whether a media URL is in a format we support.

    Instagram page links are always accepted (they are handled separately);
    every other URL is classified with detect_media_type and rejected only
    when the result is 'unknown'.

    Args:
        url: The media URL to validate
    Returns:
        Dictionary with keys:
        - 'valid': True/False - can we process this?
        - 'media_type': What kind of media is it?
        - 'error': Error message if not valid, otherwise None
    """
    media_type = 'instagram' if is_instagram_url(url) else detect_media_type(url)
    is_supported = media_type != 'unknown'
    return {
        'valid': is_supported,
        'media_type': media_type,
        # The error text is only looked up when it is actually needed.
        'error': None if is_supported else ERROR_MESSAGES['unsupported_format'],
    }
# ==================== PUBLIC API ====================
def get_media_handler(url: str) -> dict:
    """
    Smart function that figures out what to do with a URL.

    This is your main entry point - give it a URL and it tells you what it is
    and what to do with it. The format is validated first; the URL is then
    probed for accessibility exactly once.

    Args:
        url: The media URL provided by the user
    Returns:
        Dictionary with:
        - 'accessible': Can we reach this URL? (always True on return)
        - 'media_type': What kind of media?
        - 'validation': Full validation results
    Raises:
        ValueError: If URL is invalid or unreachable
    """
    # First, validate the format. Instagram links always validate as
    # 'instagram', so they never take this error path.
    validation = validate_media_format(url)
    if not validation['valid']:
        raise ValueError(validation['error'])

    # Check if URL is actually accessible. This is done a single time:
    # repeating the probe would only repeat the same network round-trip.
    if not validate_url_accessibility(url):
        raise ValueError(ERROR_MESSAGES['download_failed'])

    return {
        'accessible': True,
        'media_type': validation['media_type'],
        'validation': validation
    }