File size: 6,959 Bytes
4edb0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57868fa
 
 
 
 
 
 
4edb0a5
 
 
 
 
 
 
 
 
f49ebce
 
 
4edb0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b15a26
 
 
1609c70
 
2b15a26
 
 
 
 
 
 
 
f6cc056
 
 
 
 
 
 
 
 
 
 
2b15a26
4edb0a5
2b15a26
 
f6cc056
2b15a26
 
7f6108f
f43a22f
4edb0a5
 
 
 
 
 
 
 
 
7f6108f
 
 
f43a22f
 
7c8e90d
7f6108f
f43a22f
 
7f6108f
 
 
 
4edb0a5
 
 
 
 
 
 
021e7d4
4edb0a5
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# ==================================================================================================
# ZERO-SHOT-VIDEO-GENERATION - app.py (Primary Application Interface)
# ==================================================================================================
# 
# πŸ“ DESCRIPTION
# This script serves as the main entry point and Gradio-based web interface for the Zero-Shot 
# Video Generation framework. It provisions the required neural network models and exposes a 
# user-friendly front-end for generating temporally consistent video content from textual prompts. 
# The interface adapts automatically to its execution environment, whether running locally
# or on cloud instances such as Hugging Face Spaces.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
#
# 🀝🏻 CREDITS
# Based directly on the foundational logic of Text2Video-Zero.
# Source Authors: Picsart AI Research (PAIR), UT Austin, U of Oregon, UIUC
# Reference: https://arxiv.org/abs/2303.13439
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION
# Live Demo: https://huggingface.co/spaces/ameythakur/Zero-Shot-Video-Generation
# Video Demo: https://youtu.be/za9hId6UPoY
#
# πŸ“… RELEASE DATE
# November 22, 2023
#
# πŸ“œ LICENSE
# Released under the MIT License
# ==================================================================================================

import warnings
# Suppress unavoidable third-party deprecation warnings (torch.distributed, timm, diffusers).
# These originate inside library internals and cannot be fixed from application code.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, message=".*deprecated.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*Mapping deprecated.*")

import gradio as gr
import torch

from model import Model, ModelType
from app_text_to_video import create_demo as create_demo_text_to_video
import argparse
import os

# --- ENVIRONMENT & HARDWARE INITIALIZATION ---
# Detect whether we are running inside a Hugging Face Space. The SPACE_ID
# environment variable is defined only on that platform, which makes its
# presence a reliable, platform-agnostic indicator.
on_huggingspace = "SPACE_ID" in os.environ

# Prefer GPU execution when CUDA is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision (float16) keeps GPU memory usage low; float32 is the safe
# fallback on CPU, where float16 arithmetic is poorly supported.
_dtype = torch.float16 if device == "cuda" else torch.float32
model = Model(device=device, dtype=_dtype)

# --- CLI ARGUMENTS PARSING ---
# Command-line options controlling how the interface is exposed. Public access
# tunnels localhost traffic through a temporary Gradio share URL, which is
# useful for short-lived external evaluations over the internet.
parser = argparse.ArgumentParser()
# NOTE: action='store_true' already implies a default of False, so no explicit
# default is needed. Help text typo fixed ("access" -> "accessed").
parser.add_argument('--public_access', action='store_true',
                    help="if enabled, the app can be accessed from a public url")
args = parser.parse_args()

# --- WEB INTERFACE ARCHITECTURE ---
# Assembles the Gradio Application Block layout, injecting structured HTML context and 
# encapsulating the discrete video synthesis module instance utilizing the neural pipeline.
with gr.Blocks() as demo:

    gr.HTML(
        """
        <style>
            .title-link {
                color: white !important;
                text-decoration: none !important;
                border-bottom: none !important;
                transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
                display: block;
            }
            .title-link:hover {
                transform: scale(1.01);
                text-shadow: 0 0 20px rgba(255,255,255,0.3);
                cursor: pointer;
            }
            @keyframes floating {
                0% { transform: translateY(0px) rotate(0deg); }
                25% { transform: translateY(-5px) rotate(-5deg); }
                75% { transform: translateY(5px) rotate(5deg); }
                100% { transform: translateY(0px) rotate(0deg); }
            }
            .camera-anim {
                display: inline-block;
                animation: floating 4s infinite ease-in-out;
                margin-right: 10px;
            }
        </style>
        <div style="background: linear-gradient(135deg, #4A00E0 0%, #8E2DE2 100%); padding: 3rem; border-radius: 20px; text-align: center; margin-bottom: 2rem; box-shadow: 0 10px 30px rgba(0,0,0,0.1);">
            <a href="https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION" target="_blank" class="title-link">
                <h1 style="color: white; font-size: 3.5rem; font-weight: 800; margin: 0; text-shadow: 2px 2px 4px rgba(0,0,0,0.2); letter-spacing: -1px;">
                    <span class="camera-anim">πŸŽ₯</span> Zero-Shot Video Generation
                </h1>
            </a>
            <p style="color: rgba(255,255,255,0.9); font-size: 1.3rem; margin-top: 1rem; font-weight: 500;">
                Text-to-Video Studio using Temporal Latent Warping & Cross-Frame Attention
            </p>
        </div>
        """
    )

    with gr.Tab('Zero-Shot Text2Video'):
        # Invoke the pre-defined layout specific to the Text-to-Video generative logic, passing 
        # the initialized main diffusion model capable of handling the temporal latent inference.
        create_demo_text_to_video(model)

    gr.HTML(
        """
        <div style="text-align: center; margin-top: 3rem; padding: 2.5rem; border-radius: 15px; background: rgba(142, 45, 226, 0.05); border: 1px solid rgba(142, 45, 226, 0.1);">
            <p style="color: #4A00E0; font-size: 1rem; font-weight: 600; margin: 0;">
                Β© 2023 <a href="https://github.com/Amey-Thakur" target="_blank" style="color: #8E2DE2; text-decoration: none !important; border-bottom: none !important; transition: all 0.3s ease;">Amey Thakur</a> | University of Windsor
            </p>
            <p style="color: #777; font-size: 0.85rem; margin-top: 0.75rem; max-width: 600px; margin-left: auto; margin-right: auto; line-height: 1.5;">
                <b>Research Foundation:</b> Based on foundational breakthroughs in zero-shot temporal consistency by Picsart AI Research (PAIR), UT Austin, U of Oregon, and UIUC.
            </p>
        </div>
        """
    )

# --- APPLICATION DEPLOYMENT ---
# Launch the assembled interface. queue() serializes concurrent generation
# requests so simultaneous users cannot oversaturate the worker threads.
#
# NOTE(review): the original code passed theme= (both branches) and css=
# (local branch) to launch(). Those are gr.Blocks() constructor parameters;
# Blocks.launch() does not accept them and raises TypeError on unknown
# keywords, so they are removed here. To style the app, pass theme/css to
# gr.Blocks(...) where the interface is constructed.
if on_huggingspace:
    demo.queue().launch(
        debug=True,
        ssr_mode=False,
    )
else:
    # launch() returns (app, local_url, share_url); share_url is None unless
    # a public share link was requested via --public_access.
    _, local_url, share_url = demo.queue().launch(
        allowed_paths=['temporal'],
        share=args.public_access,
    )
    # Print the public link when available, otherwise the local URL —
    # previously this printed "None" whenever --public_access was off.
    print(share_url or local_url)