Upload inference.py with huggingface_hub

6c973e0 verified about 20 hours ago

19.1 kB

	import os
	import sys
	import time
	import logging
	import datetime
	from pathlib import Path
	from typing import Optional, Tuple, List, Union
	import warnings
	warnings.filterwarnings("ignore")

	import numpy as np
	from PIL import Image, ImageDraw, ImageFont
	import imageio
	import imageio_ffmpeg

	# Root logging configuration plus the logger shared by everything in this module.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
	)
	logger = logging.getLogger("LegionVideo")

	# Every on-disk location is resolved relative to the directory holding this file.
	_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))

	# Generated videos are written here; the directory is created at import time.
	OUTPUT_DIR = os.path.join(_MODULE_DIR, "outputs")
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# Local model checkpoints: one sub-directory per pipeline flavour.
	MODEL_DIR = os.path.join(_MODULE_DIR, "models")
	T2V_MODEL_DIR = os.path.join(MODEL_DIR, "t2v")
	I2V_MODEL_DIR = os.path.join(MODEL_DIR, "i2v")

	# Default negative prompt used by the generation entry points (empty string).
	DEFAULT_NEGATIVE_PROMPT = ""


	class MockVideoGenerator:
	    """Stand-in generator that renders a synthetic test-pattern video on the CPU.

	    Each frame is a black canvas with a coloured vertical bar that travels from
	    left to right over the clip, the prompt drawn in the top-left corner and a
	    "LEGION AI | Frame i/N" caption near the bottom.
	    """

	    def __init__(self):
	        self.device = "cpu"
	        logger.info("MockVideoGenerator initialized - will create test pattern videos")

	    def generate_video(self, prompt: str, num_frames: int, width: int, height: int) -> np.ndarray:
	        """Render the test pattern.

	        Args:
	            prompt: Text drawn onto every frame.
	            num_frames: Number of frames to render; must be at least 1.
	            width: Frame width in pixels.
	            height: Frame height in pixels.

	        Returns:
	            uint8 array of shape (num_frames, height, width, 3).

	        Raises:
	            ValueError: If ``num_frames`` is smaller than 1.
	        """
	        if num_frames < 1:
	            # np.stack([]) would raise an opaque ValueError; fail early with context.
	            raise ValueError(f"num_frames must be at least 1, got {num_frames}")

	        # Loop invariants: bar width and the divisor that maps a frame index to [0, 1].
	        bar_width = width // 4
	        last_index = max(num_frames - 1, 1)

	        frames = []
	        for i in range(num_frames):
	            progress = i / last_index
	            frame = np.zeros((height, width, 3), dtype=np.uint8)

	            # Moving color bar; the three channels are phase-shifted sines of progress.
	            bar_x = int(progress * (width - bar_width))
	            frame[:, bar_x:bar_x + bar_width] = [
	                int(128 + 127 * np.sin(progress * 4)),
	                int(128 + 127 * np.sin(progress * 4 + 2)),
	                int(128 + 127 * np.sin(progress * 4 + 4)),
	            ]

	            # Text overlay with prompt and frame counter.
	            frame_pil = Image.fromarray(frame)
	            draw = ImageDraw.Draw(frame_pil)
	            draw.text((10, 10), prompt, fill=(255, 255, 255))
	            # A plain "|" separator: the former "\|" was an invalid escape sequence
	            # that rendered a literal backslash in the caption.
	            draw.text(
	                (10, height - 30),
	                f"LEGION AI | Frame {i + 1}/{num_frames}",
	                fill=(200, 200, 200),
	            )
	            frames.append(np.array(frame_pil))
	        return np.stack(frames)


	class LegionVideoGenerator:
	"""LEGION Video Generator - High-quality video generation system.

	Features:
	- Text-to-Video generation (``generate_from_text``)
	- Image-to-Video generation (``generate_from_image``)
	- Temporal enhancement for smooth frame transitions, applied on export
	- QWatermark system (configurable quality watermark overlay)
	- Mock test-pattern fallback (``MockVideoGenerator``) when the real
	  pipelines cannot be loaded

	Instance state set by ``__init__``: ``device`` ("cuda" or "cpu"),
	``pipe_t2v`` / ``pipe_i2v`` (loaded pipelines or None), ``mock_mode``
	(True when the fallback is active) and ``mock_gen``.
	"""

	def __init__(self, model_path: Optional[str] = None):
	    """Pick a device, then load the real pipelines or switch to the mock generator.

	    Args:
	        model_path: Optional directory forwarded to ``_load_models``.
	    """
	    self.device = self._detect_device()

	    # Start from the "nothing loaded, no fallback" state; _load_models fills the pipes.
	    self.pipe_t2v, self.pipe_i2v = None, None
	    self.mock_mode, self.mock_gen = False, None

	    logger.info(f"LEGION Video Generator initializing (device: {self.device})")

	    # Try loading real models; any failure flips the instance into mock mode.
	    models_loaded = self._load_models(model_path)
	    if not models_loaded:
	        logger.warning("Real model loading failed - using mock generator fallback")
	        self.mock_mode = True
	        self.mock_gen = MockVideoGenerator()

	    logger.info("LEGION Video Generator initialized successfully")

	def _detect_device(self) -> str:
	    """Return ``"cuda"`` when torch reports an available GPU, otherwise ``"cpu"``.

	    Any exception raised while importing or querying torch is swallowed on
	    purpose: detection is best effort and the CPU path is the fallback.
	    """
	    cuda_ready = False
	    try:
	        import torch

	        if torch.cuda.is_available():
	            logger.info(f"GPU detected: {torch.cuda.get_device_name(0)}")
	            cuda_ready = True
	    except Exception:
	        # torch missing or broken: treat exactly like "no GPU".
	        cuda_ready = False

	    if cuda_ready:
	        return "cuda"

	    logger.info("No GPU detected - using CPU")
	    return "cpu"

	def _check_memory_sufficient(self, required_gb: float = 20.0) -> bool:
	    """Tell whether enough system RAM is available to load the real model.

	    Args:
	        required_gb: Minimum available RAM in GiB; defaults to the 20 GB the
	            original check used.

	    Returns:
	        True when available RAM is at least ``required_gb``. False when it is
	        lower or when the amount cannot be determined.
	    """
	    available_gb = self._read_available_ram_gb()
	    if available_gb is None:
	        logger.warning("Cannot check RAM - defaulting to mock mode on CPU")
	        return False

	    logger.info(f"Available system RAM: {available_gb:.1f} GB")
	    if available_gb < required_gb:
	        logger.warning(
	            f"Insufficient RAM ({available_gb:.1f} GB < {required_gb:g} GB required) "
	            f"to load 8.3B parameter model - using mock fallback"
	        )
	        return False
	    return True

	@staticmethod
	def _read_available_ram_gb() -> Optional[float]:
	    """Return available system RAM in GiB, or None when it cannot be read.

	    psutil is preferred; when it is not installed the ``MemAvailable`` entry
	    of ``/proc/meminfo`` (reported in kB) is parsed instead.
	    """
	    try:
	        import psutil
	    except ImportError:
	        pass
	    else:
	        return psutil.virtual_memory().available / (1024 ** 3)

	    try:
	        with open('/proc/meminfo', 'r') as f:
	            for line in f:
	                if 'MemAvailable' in line:
	                    # Line looks like "MemAvailable:  12345678 kB".
	                    return int(line.split()[1]) / (1024 * 1024)
	    except (OSError, ValueError, IndexError) as e:
	        logger.warning(f"Cannot check RAM: {e}")
	    return None

	def _load_models(self, model_path: Optional[str] = None) -> bool:
	    """Load the T2V and I2V pipelines from local directories.

	    Args:
	        model_path: Directory of the T2V model; ``T2V_MODEL_DIR`` when omitted.
	            The I2V model is always looked up in ``I2V_MODEL_DIR``.

	    Returns:
	        True when at least one pipeline was loaded. False when torch/diffusers
	        cannot be imported, when the CPU memory check fails, when the T2V
	        directory has no ``model_index.json``, or when every load attempt failed.
	    """
	    try:
	        # torch must be imported here: torch.float32 is used below and torch is
	        # not imported by the visible file-level imports. Without it every load
	        # died with NameError inside the broad except and the class always fell
	        # back to mock.
	        import torch
	        from diffusers import HunyuanVideo15Pipeline
	    except ImportError as e:
	        logger.warning(f"Required modules not available: {e}")
	        return False

	    # On CPU, check if we have enough memory first
	    if self.device == "cpu" and not self._check_memory_sufficient():
	        return False

	    # Try T2V model from local path only
	    t2v_path = model_path or T2V_MODEL_DIR
	    try:
	        if not os.path.exists(os.path.join(t2v_path, "model_index.json")):
	            logger.warning(f"T2V model not found at {t2v_path}")
	            return False
	        logger.info(f"Loading T2V model from local path: {t2v_path}")
	        self.pipe_t2v = HunyuanVideo15Pipeline.from_pretrained(
	            t2v_path,
	            torch_dtype=torch.float32,
	        )
	        self._apply_memory_optimizations(self.pipe_t2v)
	    except Exception as e:
	        logger.warning(f"Could not load T2V model: {e}")

	    # Try I2V model from local path only
	    # NOTE(review): generate_from_image passes image= to this pipeline; confirm that
	    # HunyuanVideo15Pipeline accepts it or whether a dedicated image-to-video
	    # pipeline class should be loaded here instead.
	    try:
	        i2v_path = I2V_MODEL_DIR
	        if os.path.exists(os.path.join(i2v_path, "model_index.json")):
	            logger.info(f"Loading I2V model from local path: {i2v_path}")
	            self.pipe_i2v = HunyuanVideo15Pipeline.from_pretrained(
	                i2v_path,
	                torch_dtype=torch.float32,
	            )
	            self._apply_memory_optimizations(self.pipe_i2v)
	    except Exception as e:
	        logger.warning(f"Could not load I2V model: {e}")

	    return self.pipe_t2v is not None or self.pipe_i2v is not None

	def _apply_memory_optimizations(self, pipe) -> None:
	    """Enable the memory-saving switches on a freshly loaded pipeline."""
	    # Model CPU offload shuttles sub-models between CPU and an accelerator, so it
	    # only applies when a GPU was detected; calling it on a CPU-only host would
	    # abort the remaining optimisations.
	    if self.device == "cuda":
	        pipe.enable_model_cpu_offload()
	    vae = getattr(pipe, "vae", None)
	    if vae is not None and hasattr(vae, "enable_tiling"):
	        vae.enable_tiling()
	    pipe.enable_attention_slicing()

	def generate_from_text(
	    self,
	    prompt: str,
	    negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
	    num_frames: int = 49,
	    width: int = 480,
	    height: int = 480,
	    num_inference_steps: int = 50,
	    guidance_scale: float = 6.0,
	    watermark_strength: float = 0.0,
	    seed: Optional[int] = None,
	) -> str:
	    """Render a video for a text prompt and return the path of the saved MP4.

	    Args:
	        prompt: Text description of the video to generate.
	        negative_prompt: Things to avoid in the video.
	        num_frames: Number of frames to generate (documented range 1-129;
	            not validated here).
	        width: Video width in pixels.
	        height: Video height in pixels.
	        num_inference_steps: Diffusion inference steps.
	        guidance_scale: Classifier-free guidance scale.
	        watermark_strength: QWatermark strength; values <= 0 skip the watermark.
	        seed: Random seed for reproducibility; None leaves the generator unset.

	    Returns:
	        Path to the generated MP4 file.

	    Raises:
	        RuntimeError: If not in mock mode and no T2V pipeline is loaded.
	        Exception: Any pipeline/export error is logged and re-raised unchanged.
	    """
	    logger.info(f"T2V: '{prompt[:60]}...' ({num_frames}f, {width}x{height}, {num_inference_steps}steps)")

	    # Mock mode short-circuits before any pipeline is touched.
	    if self.mock_mode:
	        return self._generate_mock_video(prompt, num_frames, width, height, watermark_strength, "t2v")

	    if self.pipe_t2v is None:
	        raise RuntimeError("T2V pipeline not available")

	    try:
	        import torch

	        rng = None if seed is None else torch.Generator(device=self.device).manual_seed(seed)
	        pipeline_args = {
	            "prompt": prompt,
	            "negative_prompt": negative_prompt,
	            "num_frames": num_frames,
	            "width": width,
	            "height": height,
	            "num_inference_steps": num_inference_steps,
	            "guidance_scale": guidance_scale,
	            "generator": rng,
	        }
	        result = self.pipe_t2v(**pipeline_args)
	        # The first entry of .frames holds the clip for the single prompt.
	        return self._export_video(result.frames[0], prompt, watermark_strength, "t2v")
	    except Exception as e:
	        logger.error(f"T2V generation failed: {e}")
	        raise

	def generate_from_image(
	    self,
	    image_path: str,
	    prompt: str = "",
	    negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
	    num_frames: int = 49,
	    width: int = 480,
	    height: int = 480,
	    num_inference_steps: int = 50,
	    guidance_scale: float = 6.0,
	    watermark_strength: float = 0.0,
	    seed: Optional[int] = None,
	) -> str:
	    """Generate a video from an input image + text prompt.

	    The I2V pipeline is used when loaded. Otherwise the T2V pipeline runs with
	    the prompt suffixed by ", based on the provided image style" and the image
	    itself is not passed on. In mock mode the image is never read.

	    Args:
	        image_path: Path to the conditioning image.
	        prompt: Text description of motion/action.
	        negative_prompt: Things to avoid.
	        num_frames: Number of frames to generate.
	        width: Video width in pixels.
	        height: Video height in pixels.
	        num_inference_steps: Diffusion inference steps.
	        guidance_scale: Classifier-free guidance scale.
	        watermark_strength: QWatermark strength; values <= 0 skip the watermark.
	        seed: Random seed; None leaves the generator unset.

	    Returns:
	        Path to the generated MP4 file.

	    Raises:
	        FileNotFoundError: If ``image_path`` does not exist (non-mock mode).
	        RuntimeError: If neither pipeline is loaded (non-mock mode).
	        Exception: Any pipeline error is logged and re-raised unchanged.
	    """
	    logger.info(f"I2V from '{image_path}': '{prompt[:60]}...'")

	    if self.mock_mode:
	        return self._generate_mock_video(prompt, num_frames, width, height, watermark_strength, "i2v")

	    from PIL import Image as PILImage

	    if not os.path.exists(image_path):
	        raise FileNotFoundError(f"Image not found: {image_path}")

	    # convert() loads the pixel data and returns an independent image, so the
	    # file handle can be released right away instead of lingering until GC.
	    with PILImage.open(image_path) as source_image:
	        input_image = source_image.convert("RGB")

	    # Select the pipeline, its prompt/image arguments and the label used in errors.
	    if self.pipe_i2v is not None:
	        pipe = self.pipe_i2v
	        label = "I2V"
	        conditioning = {"image": input_image, "prompt": prompt}
	    elif self.pipe_t2v is not None:
	        # Use T2V pipeline as fallback
	        logger.warning("I2V pipeline not available, falling back to T2V with prompt style")
	        pipe = self.pipe_t2v
	        label = "T2V fallback"
	        conditioning = {"prompt": prompt + ", based on the provided image style"}
	    else:
	        raise RuntimeError("No video generation pipeline available")

	    try:
	        import torch

	        generator = None
	        if seed is not None:
	            generator = torch.Generator(device=self.device).manual_seed(seed)

	        output = pipe(
	            **conditioning,
	            negative_prompt=negative_prompt,
	            num_frames=num_frames,
	            width=width,
	            height=height,
	            num_inference_steps=num_inference_steps,
	            guidance_scale=guidance_scale,
	            generator=generator,
	        )
	        frames = output.frames[0]
	    except Exception as e:
	        logger.error(f"{label} generation failed: {e}")
	        raise

	    return self._export_video(frames, prompt, watermark_strength, "i2v")

	def _generate_mock_video(
	    self, prompt: str, num_frames: int, width: int, height: int,
	    watermark_strength: float, mode: str
	) -> str:
	    """Render the mock test pattern and export it like a real generation.

	    Returns:
	        Path of the MP4 written by ``_export_video``.
	    """
	    logger.info("Using mock generator (model unavailable)")
	    return self._export_video(
	        self.mock_gen.generate_video(prompt, num_frames, width, height),
	        prompt,
	        watermark_strength,
	        mode,
	    )

	def _temporal_enhancement(self, frames: np.ndarray, strength: float = 0.5) -> np.ndarray:
	"""Apply temporal smoothing to reduce frame-to-frame artifacts.

	Applies a lightweight Gaussian filter across the temporal dimension
	to smooth out flickering and jitter between consecutive frames.

	Args:
	frames: Video frames as numpy array (T, H, W, C)
	strength: Smoothing intensity (0.0 = none, 1.0 = maximum)

	Returns:
	Temporally smoothed frames
	"""
	if not isinstance(frames, np.ndarray):
	return frames

	T, H, W, C = frames.shape
	if T < 3:
	return frames # Not enough frames to smooth

	# Apply lightweight temporal smoothing
	kernel_size = max(3, int(5 * strength))
	if kernel_size % 2 == 0:
	kernel_size += 1

	# Simple temporal blur: average adjacent frames
	smoothed = frames.copy()
	half_k = min(kernel_size // 2, T // 2)

	for t in range(1, T - 1):
	left = max(0, t - half_k)
	right = min(T, t + half_k + 1)
	smoothed[t] = np.mean(frames[left:right], axis=0)

	return smoothed

	def _export_video(
	    self, frames, prompt: str, watermark_strength: float, mode: str, fps: int = 8
	) -> str:
	    """Post-process frames and write them to an MP4 inside ``OUTPUT_DIR``.

	    Steps: temporal smoothing, conversion to a uint8 array, optional
	    QWatermark, then H.264 encoding through imageio.

	    Args:
	        frames: numpy array with time on axis 0, or an iterable of PIL images /
	            array-likes. Float arrays are assumed to hold values in [0, 1].
	        prompt: Prompt text; its first 30 characters (sanitised) become part
	            of the file name.
	        watermark_strength: Values > 0 apply ``apply_qwatermark`` with that strength.
	        mode: Tag embedded in the file name (callers in this class pass
	            "t2v" or "i2v").
	        fps: Frames per second of the written video (default 8).

	    Returns:
	        Path of the written MP4 file.
	    """
	    # Apply temporal enhancement
	    frames = self._temporal_enhancement(frames)

	    # Convert to uint8 *before* watermarking: the watermark step builds PIL
	    # images, which cannot be created from float (H, W, 3) arrays.
	    frames = self._to_uint8_frames(frames)

	    # Apply QWatermark
	    if watermark_strength > 0:
	        frames = self.apply_qwatermark(frames, strength=watermark_strength)

	    # Generate filename
	    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
	    safe_prompt = "".join(c if c.isalnum() or c in " _-" else "_" for c in prompt[:30])
	    filename = f"legion_{mode}_{timestamp}_{safe_prompt}.mp4"
	    output_path = os.path.join(OUTPUT_DIR, filename)

	    # Export frames to MP4
	    imageio.mimsave(output_path, frames, fps=fps, codec='libx264',
	                    quality=8, pixelformat='yuv420p')

	    file_size = os.path.getsize(output_path)
	    logger.info(f"Video exported: {output_path} ({file_size / 1024:.1f} KB)")
	    return output_path

	@staticmethod
	def _to_uint8_frames(frames) -> np.ndarray:
	    """Return ``frames`` as one uint8 array with time on axis 0."""
	    if not isinstance(frames, np.ndarray):
	        # PIL images expose .mode and are forced to RGB; other items are taken as-is.
	        frames = np.stack([
	            np.array(f.convert("RGB")) if hasattr(f, 'mode') else np.array(f)
	            for f in frames
	        ])
	    if frames.dtype == np.uint8:
	        return frames
	    if np.issubdtype(frames.dtype, np.integer):
	        # Wider integer types already hold 0..255 values; only clamp and narrow.
	        return np.clip(frames, 0, 255).astype(np.uint8)
	    # Float frames are assumed to be normalised to [0, 1].
	    return (np.clip(frames, 0, 1) * 255).astype(np.uint8)

	def apply_qwatermark(
	    self,
	    frames,
	    strength: float = 0.3,
	    text: str = "LEGION",
	    position: str = "bottom-right",
	    font_size: int = 36,
	    opacity: float = 0.3,
	) -> np.ndarray:
	    """Apply LEGION QWatermark to video frames.

	    Two elements are composited onto every frame: ``text`` on a faint dark
	    box at ``position``, and a small "LEGION AI" badge in the top-left corner.

	    Args:
	        frames: Video frames (numpy array with time on axis 0, or an iterable
	            of PIL images / array-likes). Float arrays are assumed to hold
	            values in [0, 1].
	        strength: Overall watermark intensity (0.0-1.0); scales both the box
	            alpha and the text opacity.
	        text: Watermark text.
	        position: One of "top-left", "top-right", "bottom-left", "center",
	            "bottom-right"; unknown values fall back to "bottom-right".
	        font_size: Font size for watermark text.
	        opacity: Text opacity (0.0-1.0), multiplied by ``strength``.

	    Returns:
	        Watermarked frames as a uint8 RGB numpy array (T, H, W, 3).

	    Raises:
	        ValueError: If ``frames`` is empty.
	    """
	    dejavu_dir = "/usr/share/fonts/truetype/dejavu/"

	    def load_font(file_names, size):
	        # First font file that loads wins; PIL's built-in font is the last resort.
	        for file_name in file_names:
	            try:
	                return ImageFont.truetype(dejavu_dir + file_name, size)
	            except OSError:
	                continue
	        return ImageFont.load_default()

	    def clamp_alpha(value):
	        # Keep alpha inside the valid 8-bit range even for out-of-range inputs.
	        return max(0, min(255, int(value)))

	    def to_pil(frame):
	        if isinstance(frame, Image.Image):
	            return frame
	        pixels = np.asarray(frame)
	        if pixels.dtype != np.uint8:
	            if np.issubdtype(pixels.dtype, np.integer):
	                pixels = np.clip(pixels, 0, 255).astype(np.uint8)
	            else:
	                # Float frames are assumed to be normalised to [0, 1].
	                pixels = (np.clip(pixels, 0, 1) * 255).astype(np.uint8)
	        return Image.fromarray(pixels)

	    # Fonts and alpha values do not depend on the frame: compute them once.
	    font = load_font(("DejaVuSans-Bold.ttf", "DejaVuSans.ttf"), font_size)
	    small_font = load_font(("DejaVuSans.ttf",), 12)
	    alpha_bg = clamp_alpha(40 * strength)
	    alpha_text = clamp_alpha(255 * opacity * strength)
	    badge_text = "LEGION AI"
	    padding = 10
	    margin = 15

	    def build_overlay(size):
	        # Transparent layer holding the watermark and badge for one frame size.
	        overlay = Image.new("RGBA", size, (0, 0, 0, 0))
	        draw = ImageDraw.Draw(overlay)
	        w, h = size

	        bbox = draw.textbbox((0, 0), text, font=font)
	        text_w = bbox[2] - bbox[0]
	        text_h = bbox[3] - bbox[1]

	        pos_map = {
	            "top-left": (margin, margin),
	            "top-right": (w - text_w - margin, margin),
	            "bottom-left": (margin, h - text_h - margin),
	            "center": ((w - text_w) // 2, (h - text_h) // 2),
	            "bottom-right": (w - text_w - margin, h - text_h - margin),
	        }
	        x, y = pos_map.get(position, pos_map["bottom-right"])

	        draw.rectangle(
	            [x - padding, y - padding, x + text_w + padding, y + text_h + padding],
	            fill=(0, 0, 0, alpha_bg),
	        )
	        draw.text((x, y), text, font=font, fill=(255, 255, 255, alpha_text))

	        bbox_badge = draw.textbbox((0, 0), badge_text, font=small_font)
	        badge_w = bbox_badge[2] - bbox_badge[0]
	        badge_h = bbox_badge[3] - bbox_badge[1]
	        draw.rectangle([5, 5, 5 + badge_w + 8, 5 + badge_h + 4], fill=(0, 0, 0, alpha_bg))
	        draw.text((9, 7), badge_text, font=small_font, fill=(200, 200, 200, alpha_text))
	        return overlay

	    # The overlay only depends on the frame size, so it is built once per size
	    # and reused; Image.alpha_composite returns a new image and leaves it intact.
	    overlays = {}
	    watermarked = []
	    for raw_frame in frames:
	        frame = to_pil(raw_frame).convert("RGBA")
	        overlay = overlays.get(frame.size)
	        if overlay is None:
	            overlay = overlays[frame.size] = build_overlay(frame.size)
	        composited = Image.alpha_composite(frame, overlay)
	        watermarked.append(np.array(composited.convert("RGB")))

	    if not watermarked:
	        raise ValueError("apply_qwatermark requires at least one frame")
	    return np.stack(watermarked)