Spaces:

PlaceHolderOrg
/

SwapMe

Running

SwapMe / src /media_handler.py

Help

fix webp generation and support direct ig links

27816c1 10 days ago

10.8 kB

	"""
	Media Handler Module

	This module determines what type of file a URL points to (image, video, GIF, Instagram link, etc.)
	and prepares it for processing.

	For Non-Technical Developers:
	- Figures out what kind of media the user gave us (image vs video vs Instagram link)
	- Downloads the media from the internet or extracts it from Instagram
	- Validates that it's something we can actually process
	- Returns the media in a format our face-swapping AI can use
	"""

	import os
	import io
	import requests
	import cv2
	import numpy as np
	from PIL import Image
	from urllib.parse import urlparse
	from src.config import (
	DEFAULT_HEADERS, INSTAGRAM_HEADERS, DOWNLOAD_TIMEOUT, SUPPORTED_FORMATS,
	SUPPORTED_IMAGE_FORMATS, SUPPORTED_VIDEO_FORMATS, SUPPORTED_GIF_FORMATS,
	ERROR_MESSAGES
	)
	from src.logger import debug_log

	# ==================== HELPER FUNCTIONS ====================

	def get_file_extension_from_url(url: str) -> str:
	    """
	    Return the lower-cased file extension found in the path of a URL.

	    Only the path component is inspected, so query strings and fragments
	    never influence the result.

	    Example: "https://example.com/Image.JPG?size=large" -> ".jpg"

	    Args:
	        url: The web address we're downloading from

	    Returns:
	        The extension including its leading dot (like .jpg, .mp4, .gif),
	        or an empty string when the last path segment has none
	    """
	    lowered_path = urlparse(url).path.lower()

	    # splitext already yields '' when the final path segment has no dot,
	    # so no separate "does the path contain a dot?" test is required.
	    _stem, extension = os.path.splitext(lowered_path)
	    return extension


	def is_instagram_url(url: str) -> bool:
	    """
	    Check if a URL points to an Instagram page/post/reel URL.

	    Only Instagram page hosts match, not CDN or media delivery hosts.
	    The comparison is made against the parsed hostname rather than the raw
	    network location, so an explicit port ("www.instagram.com:443"),
	    userinfo, upper-case letters or a trailing dot do not prevent a match.

	    Args:
	        url: The URL to inspect

	    Returns:
	        True if the host is one of the known Instagram page domains,
	        False otherwise (including URLs that have no host at all)
	    """
	    instagram_page_domains = {
	        'instagram.com',
	        'www.instagram.com',
	        'm.instagram.com',
	        'l.instagram.com',
	        'instagr.am',
	        'www.instagr.am',
	        'ig.me',
	        'www.ig.me',
	    }

	    # .hostname is already lower-cased and excludes port/userinfo; it is
	    # None when the URL carries no host (e.g. a scheme-less string).
	    host = (urlparse(url).hostname or '').rstrip('.')

	    return host in instagram_page_domains


	def is_instagram_cdn_url(url: str) -> bool:
	    """
	    Detect Instagram CDN/media delivery hosts.

	    This helps when the user supplies a direct Instagram CDN URL such as
	    scontent-iad6-1.cdninstagram.com or cdninstagram.com.

	    A host matches when it is cdninstagram.com or instagram.com, is a
	    subdomain of either, or has a DNS label beginning with "scontent".
	    Matching is done on whole domain suffixes / label prefixes instead of raw
	    substrings, so look-alike hosts such as "instagram.com.evil.example" or
	    "notinstagram.com" are not mistaken for Instagram.

	    Args:
	        url: The URL to inspect

	    Returns:
	        True if the host looks like an Instagram media host, False otherwise
	        (including URLs that have no host at all)
	    """
	    instagram_domains = ('cdninstagram.com', 'instagram.com')

	    # .hostname is lower-cased and free of port/userinfo; None without a host.
	    host = (urlparse(url).hostname or '').rstrip('.')
	    if not host:
	        return False

	    if any(host == domain or host.endswith('.' + domain)
	           for domain in instagram_domains):
	        return True

	    # Media shards are named like "scontent-iad6-1" or plain "scontent".
	    return any(label.startswith('scontent') for label in host.split('.'))


	def _media_type_from_content_type(content_type: str) -> str:
	    """Map a lower-cased Content-Type header value to a media type label."""
	    # Order matters: "image/gif" and "image/webp" also contain "image", so the
	    # specific checks have to run before the generic image check.
	    if 'video' in content_type:
	        return 'video'
	    if 'gif' in content_type:
	        return 'gif'
	    if 'image/webp' in content_type:
	        return 'webp'
	    if 'image' in content_type:
	        return 'image'
	    return 'unknown'


	def detect_media_type(url: str) -> str:
	    """
	    Determine what type of media a URL points to.

	    This is like asking: "Is this a photo, a video, a GIF, or an Instagram link?"

	    Instagram page URLs are recognised first. Otherwise the file extension in
	    the URL path decides; only when the extension is not recognised is a HEAD
	    request sent so the type can be inferred from the Content-Type header.

	    Args:
	        url: The web address or Instagram link

	    Returns:
	        One of: 'instagram', 'gif', 'webp', 'video', 'image', or 'unknown'
	    """
	    from src.config import DEBUG_MODE

	    # Check if it's Instagram first
	    if is_instagram_url(url):
	        if DEBUG_MODE:
	            debug_log("[detect_media_type] Detected Instagram URL")
	        return 'instagram'

	    # Get file extension
	    ext = get_file_extension_from_url(url)

	    if DEBUG_MODE:
	        debug_log(f"[detect_media_type] URL extension: {ext}")

	    # Categorize based on extension
	    if ext in SUPPORTED_GIF_FORMATS:
	        media_type = 'gif'
	    elif ext == '.webp':
	        media_type = 'webp'
	    elif ext in SUPPORTED_VIDEO_FORMATS:
	        media_type = 'video'
	    elif ext in SUPPORTED_IMAGE_FORMATS:
	        media_type = 'image'
	    else:
	        media_type = 'unknown'

	    # If the extension is unknown, attempt to infer from the remote content type.
	    # (Instagram URLs have already returned above, so no re-check is needed.)
	    if media_type == 'unknown':
	        try:
	            response = requests.head(
	                url,
	                headers=DEFAULT_HEADERS,
	                timeout=DOWNLOAD_TIMEOUT,
	                allow_redirects=True
	            )
	            content_type = response.headers.get('Content-Type', '').lower()
	            media_type = _media_type_from_content_type(content_type)
	        except Exception as exc:
	            # Deliberately best-effort: a failed probe leaves the type as
	            # 'unknown' instead of raising, but the reason is no longer lost.
	            if DEBUG_MODE:
	                debug_log(f"[detect_media_type] Content-Type probe failed: {exc}")

	    if DEBUG_MODE:
	        debug_log(f"[detect_media_type] Detected type: {media_type}")

	    return media_type


	# ==================== MEDIA DOWNLOADING ====================

	def download_media_from_url(url: str) -> bytes:
	    """
	    Download a file from the internet in a single GET request.

	    The request uses INSTAGRAM_HEADERS for Instagram CDN hosts and
	    DEFAULT_HEADERS otherwise, follows redirects, and is bounded by
	    DOWNLOAD_TIMEOUT. The whole body is read into memory. No retries are
	    attempted here; a failure is reported to the caller immediately.

	    Args:
	        url: The web address to download from

	    Returns:
	        The file contents as binary data

	    Raises:
	        ValueError: If the request times out, the server answers with an
	            HTTP error status, or the request fails for any other reason.
	            The underlying requests exception is attached as the cause.
	    """
	    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS

	    try:
	        response = requests.get(
	            url,
	            headers=headers,
	            timeout=DOWNLOAD_TIMEOUT,
	            allow_redirects=True,
	            stream=False  # Download the entire file at once
	        )
	        response.raise_for_status()  # Raise an error if server returned an error
	    except requests.exceptions.Timeout as exc:
	        raise ValueError(
	            "Download timed out. The server took too long to respond."
	        ) from exc
	    except requests.exceptions.HTTPError as exc:
	        # HTTPError.response can be None; don't let that mask the real error.
	        status_code = getattr(exc.response, 'status_code', 'unknown')
	        raise ValueError(f"Server returned error: {status_code}") from exc
	    except requests.exceptions.RequestException as exc:
	        raise ValueError(
	            f"Download failed: {ERROR_MESSAGES['download_failed']}"
	        ) from exc

	    return response.content


	def load_image_from_bytes(image_bytes: bytes) -> np.ndarray:
	    """
	    Decode raw image file bytes into an OpenCV colour image.

	    Args:
	        image_bytes: The raw (still encoded) image file data

	    Returns:
	        The decoded image as a numpy array, as produced by
	        cv2.imdecode with cv2.IMREAD_COLOR (OpenCV's BGR channel order)

	    Raises:
	        ValueError: If the data cannot be decoded or decodes to an empty
	            image. Every failure, including the emptiness check below, is
	            reported with the "Failed to load image: ..." prefix.
	    """
	    try:
	        # View the bytes as a flat uint8 buffer, which is what imdecode expects.
	        raw_buffer = np.frombuffer(image_bytes, dtype=np.uint8)

	        # OpenCV detects the container format from the data itself.
	        decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)

	        if decoded is not None and decoded.size != 0:
	            return decoded

	        raise ValueError("Image data is empty or invalid")

	    except Exception as err:
	        raise ValueError("Failed to load image: " + str(err))


	# ==================== VALIDATION ====================

	def _is_reachable_via_get(url: str, headers: dict) -> bool:
	    """Return True if a streamed GET of the URL answers with HTTP 200."""
	    try:
	        # stream=True avoids downloading the body just to read the status.
	        # The context manager closes the response so the pooled connection
	        # is released instead of being left open.
	        with requests.get(
	            url,
	            headers=headers,
	            timeout=DOWNLOAD_TIMEOUT,
	            allow_redirects=True,
	            stream=True
	        ) as response:
	            return response.status_code == 200
	    except Exception:
	        return False


	def validate_url_accessibility(url: str) -> bool:
	    """
	    Check if we can actually reach the URL before processing.

	    This is like knocking on the door before trying to download - saves time if the door is locked!

	    Instagram page URLs are reported as accessible without any request.
	    Other URLs are probed with HEAD first; if HEAD raises, or answers with
	    400/403/405/429, a streamed GET is tried instead. Only a 200 status
	    counts as accessible.

	    Args:
	        url: The URL to check

	    Returns:
	        True if URL is accessible, False otherwise
	    """
	    if is_instagram_url(url):
	        # Instagram may block HEAD requests, so skip direct accessibility checks.
	        # Extraction will determine if the link is valid.
	        return True

	    headers = INSTAGRAM_HEADERS if is_instagram_cdn_url(url) else DEFAULT_HEADERS

	    try:
	        response = requests.head(
	            url,
	            headers=headers,
	            timeout=DOWNLOAD_TIMEOUT,
	            allow_redirects=True
	        )
	    except Exception:
	        # Some hosts (including CDN links) reject HEAD requests but still allow GET.
	        return _is_reachable_via_get(url, headers)

	    if response.status_code == 200:
	        return True

	    # Some servers reject HEAD requests even though GET works.
	    if response.status_code in {400, 403, 405, 429}:
	        return _is_reachable_via_get(url, headers)

	    return False


	def validate_media_format(url: str) -> dict:
	    """
	    Decide whether a media URL is in a format we can process.

	    Instagram page links are accepted as-is (they are handled separately);
	    every other URL is classified with detect_media_type.

	    Args:
	        url: The media URL to validate

	    Returns:
	        Dictionary with keys:
	        - 'valid': True/False - can we process this?
	        - 'media_type': What kind of media is it?
	        - 'error': Error message if not valid, otherwise None
	    """
	    # Instagram is a special case: no extension/content-type detection is run.
	    if is_instagram_url(url):
	        detected = 'instagram'
	    else:
	        detected = detect_media_type(url)

	    is_supported = detected != 'unknown'

	    return {
	        'valid': is_supported,
	        'media_type': detected,
	        'error': None if is_supported else ERROR_MESSAGES['unsupported_format']
	    }


	# ==================== PUBLIC API ====================

	def get_media_handler(url: str) -> dict:
	    """
	    Smart function that figures out what to do with a URL.

	    This is your main entry point - give it a URL and it tells you what it is
	    and what to do with it.

	    The URL is validated for format first and then probed for accessibility
	    exactly once (Instagram page links pass that probe without a request).

	    Args:
	        url: The media URL provided by the user

	    Returns:
	        Dictionary with:
	        - 'accessible': Can we reach this URL? (always True on return)
	        - 'media_type': What kind of media?
	        - 'validation': Full validation results

	    Raises:
	        ValueError: If the format is unsupported or the URL is unreachable
	    """
	    # First, validate the format. Instagram links always validate as
	    # 'instagram', so no separate Instagram guard is needed here.
	    validation = validate_media_format(url)

	    if not validation['valid']:
	        raise ValueError(validation['error'])

	    # Check if URL is actually accessible. This runs a network probe for
	    # non-Instagram URLs, so it is done once rather than repeated.
	    if not validate_url_accessibility(url):
	        raise ValueError(ERROR_MESSAGES['download_failed'])

	    return {
	        'accessible': True,
	        'media_type': validation['media_type'],
	        'validation': validation
	    }