File size: 9,027 Bytes

3d8856d

"""
Inference script for TTV-1B Text-to-Video Model
Generate videos from text prompts
"""

import torch
import torch.nn as nn
from video_ttv_1b import VideoTTV1B, DDPMScheduler
from pathlib import Path
import numpy as np
from typing import Optional, List
from tqdm import tqdm
import json


class VideoGenerator:
    """Video generation from text prompts"""
    def __init__(
        self,
        model: nn.Module,
        noise_scheduler: DDPMScheduler,
        device: str = 'cuda',
    ):
        self.model = model.to(device)
        self.model.eval()
        self.noise_scheduler = noise_scheduler
        self.device = device
        
    def tokenize(self, text: str, max_length: int = 256) -> torch.Tensor:
        """Tokenize text (simple character-level tokenization)"""
        tokens = [ord(c) % 50257 for c in text[:max_length]]
        tokens = tokens + [0] * (max_length - len(tokens))
        return torch.tensor([tokens], dtype=torch.long)
    
    @torch.no_grad()
    def generate(
        self,
        prompt: str,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        seed: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Generate video from text prompt
        
        Args:
            prompt: Text description of the video
            num_inference_steps: Number of denoising steps
            guidance_scale: Classifier-free guidance scale
            seed: Random seed for reproducibility
            
        Returns:
            Generated video tensor (C, T, H, W)
        """
        if seed is not None:
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(seed)
        
        # Tokenize prompt
        text_tokens = self.tokenize(prompt).to(self.device)
        
        # Start from random noise
        shape = (1, 3, self.model.num_frames, *self.model.img_size)
        x = torch.randn(shape, device=self.device)
        
        # Prepare timesteps for inference
        timesteps = torch.linspace(
            self.noise_scheduler.num_steps - 1,
            0,
            num_inference_steps,
            dtype=torch.long,
            device=self.device
        )
        
        # Denoising loop
        for i, t in enumerate(tqdm(timesteps, desc="Generating video")):
            # Expand timestep to batch dimension
            t_batch = t.unsqueeze(0)
            
            # Predict noise
            noise_pred = self.model(x, t_batch, text_tokens)
            
            # Classifier-free guidance (requires training with unconditional dropout)
            if guidance_scale != 1.0:
                # Generate unconditional prediction
                uncond_tokens = torch.zeros_like(text_tokens)
                noise_pred_uncond = self.model(x, t_batch, uncond_tokens)
                
                # Apply guidance
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
            
            # Denoise step
            x = self.noise_scheduler.sample_step(
                lambda x_t, ts, txt: noise_pred,
                x,
                t.item(),
                text_tokens
            )
        
        # Denormalize from [-1, 1] to [0, 1]
        video = (x.squeeze(0) + 1) / 2
        video = torch.clamp(video, 0, 1)
        
        return video
    
    def save_video(self, video: torch.Tensor, output_path: str, fps: int = 8):
        """
        Save video tensor to file
        
        Args:
            video: Video tensor (C, T, H, W) in range [0, 1]
            output_path: Output file path
            fps: Frames per second
        """
        try:
            import torchvision
            from torchvision.io import write_video
            
            # Convert to (T, H, W, C) and scale to [0, 255]
            video = video.permute(1, 2, 3, 0).cpu()
            video = (video * 255).to(torch.uint8)
            
            # Save video
            write_video(output_path, video, fps=fps)
            print(f"Video saved to {output_path}")
            
        except ImportError:
            print("torchvision not available, saving as numpy array")
            video_np = video.cpu().numpy()
            np.save(output_path.replace('.mp4', '.npy'), video_np)
            print(f"Video saved as numpy array to {output_path.replace('.mp4', '.npy')}")


def load_model(checkpoint_path: str, device: str = 'cuda') -> VideoTTV1B:
    """Load model from checkpoint"""
    # Load config
    config_path = Path(checkpoint_path).parent / 'model_config.json'
    if config_path.exists():
        with open(config_path, 'r') as f:
            config = json.load(f)
        print(f"Loaded model config: {config}")
    
    # Create model
    model = VideoTTV1B(
        img_size=(256, 256),
        num_frames=16,
        patch_size=(2, 16, 16),
        in_channels=3,
        hidden_dim=1536,
        depth=24,
        num_heads=24,
        mlp_ratio=4.0,
    )
    
    # Load weights
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded checkpoint from {checkpoint_path}")
    print(f"Training step: {checkpoint.get('global_step', 'unknown')}")
    
    return model


def generate_video_from_prompt(
    prompt: str,
    checkpoint_path: str,
    output_path: str = "generated_video.mp4",
    num_steps: int = 50,
    guidance_scale: float = 7.5,
    seed: Optional[int] = None,
    device: str = 'cuda',
):
    """
    High-level function to generate video from text prompt
    
    Args:
        prompt: Text description
        checkpoint_path: Path to model checkpoint
        output_path: Where to save the video
        num_steps: Number of denoising steps
        guidance_scale: Guidance strength
        seed: Random seed
        device: Device to run on
    """
    print(f"Generating video for prompt: '{prompt}'")
    print(f"Using {num_steps} inference steps with guidance scale {guidance_scale}")
    
    # Load model
    model = load_model(checkpoint_path, device)
    
    # Create generator
    noise_scheduler = DDPMScheduler(num_steps=1000)
    generator = VideoGenerator(model, noise_scheduler, device)
    
    # Generate video
    video = generator.generate(
        prompt=prompt,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        seed=seed,
    )
    
    # Save video
    generator.save_video(video, output_path)
    
    return video


def batch_generate(
    prompts: List[str],
    checkpoint_path: str,
    output_dir: str = "./generated_videos",
    **kwargs
):
    """Generate multiple videos from a list of prompts"""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    for i, prompt in enumerate(prompts):
        print(f"\n[{i+1}/{len(prompts)}] Generating: {prompt}")
        output_path = output_dir / f"video_{i:04d}.mp4"
        
        try:
            generate_video_from_prompt(
                prompt=prompt,
                checkpoint_path=checkpoint_path,
                output_path=str(output_path),
                **kwargs
            )
        except Exception as e:
            print(f"Error generating video {i}: {e}")
            continue


def main():
    """Example usage"""
    import argparse
    
    parser = argparse.ArgumentParser(description="Generate videos from text prompts")
    parser.add_argument('--prompt', type=str, required=True, help='Text prompt')
    parser.add_argument('--checkpoint', type=str, required=True, help='Model checkpoint path')
    parser.add_argument('--output', type=str, default='generated_video.mp4', help='Output path')
    parser.add_argument('--steps', type=int, default=50, help='Number of inference steps')
    parser.add_argument('--guidance', type=float, default=7.5, help='Guidance scale')
    parser.add_argument('--seed', type=int, default=None, help='Random seed')
    parser.add_argument('--device', type=str, default='cuda', help='Device (cuda/cpu)')
    
    args = parser.parse_args()
    
    # Generate video
    generate_video_from_prompt(
        prompt=args.prompt,
        checkpoint_path=args.checkpoint,
        output_path=args.output,
        num_steps=args.steps,
        guidance_scale=args.guidance,
        seed=args.seed,
        device=args.device,
    )


if __name__ == "__main__":
    # Example prompts for testing
    example_prompts = [
        "A serene sunset over the ocean with gentle waves",
        "A cat playing with a ball of yarn in slow motion",
        "Time-lapse of a flower blooming in spring",
        "Aerial view of a city at night with twinkling lights",
        "Underwater scene with colorful fish swimming",
    ]
    
    print("Example prompts for video generation:")
    for i, prompt in enumerate(example_prompts, 1):
        print(f"{i}. {prompt}")
    
    print("\nRun with: python inference.py --prompt 'your prompt' --checkpoint path/to/checkpoint.pt")