File size: 6,959 Bytes
4edb0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57868fa
 
 
 
 
 
 
4edb0a5
 
 
 
 
 
 
 
 
f49ebce
 
 
4edb0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b15a26
 
 
1609c70
 
2b15a26
 
 
 
 
 
 
 
f6cc056
 
 
 
 
 
 
 
 
 
 
2b15a26
4edb0a5
2b15a26
 
f6cc056
2b15a26
 
7f6108f
f43a22f
4edb0a5
 
 
 
 
 
 
 
 
7f6108f
 
 
f43a22f
 
7c8e90d
7f6108f
f43a22f
 
7f6108f
 
 
 
4edb0a5
 
 
 
 
 
 
021e7d4
4edb0a5
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# ==================================================================================================
# ZERO-SHOT-VIDEO-GENERATION - app.py (Primary Application Interface)
# ==================================================================================================
# 
# πŸ“ DESCRIPTION
# This script serves as the main entry point and Gradio-based web interface for the Zero-Shot 
# Video Generation framework. It provisions the required neural network models and exposes a 
# user-friendly front-end for generating temporally consistent video content from textual prompts. 
# The interface adapts automatically to its execution environment, whether running locally
# or on cloud instances such as Hugging Face Spaces.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
#
# 🀝🏻 CREDITS
# Based directly on the foundational logic of Text2Video-Zero.
# Source Authors: Picsart AI Research (PAIR), UT Austin, U of Oregon, UIUC
# Reference: https://arxiv.org/abs/2303.13439
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION
# Live Demo: https://huggingface.co/spaces/ameythakur/Zero-Shot-Video-Generation
# Video Demo: https://youtu.be/za9hId6UPoY
#
# πŸ“… RELEASE DATE
# November 22, 2023
#
# πŸ“œ LICENSE
# Released under the MIT License
# ==================================================================================================

import warnings
# Suppress unavoidable third-party deprecation warnings (torch.distributed, timm, diffusers).
# These originate inside library internals and cannot be fixed from application code.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, message=".*deprecated.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*Mapping deprecated.*")

import gradio as gr
import torch

from model import Model, ModelType
from app_text_to_video import create_demo as create_demo_text_to_video
import argparse
import os

# --- ENVIRONMENT & HARDWARE INITIALIZATION ---
# Detect whether we are running inside a Hugging Face Space. The SPACE_ID
# environment variable is defined only on that platform, which makes its
# presence a reliable, platform-agnostic indicator.
on_huggingspace = "SPACE_ID" in os.environ

# Prefer GPU execution when CUDA is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision (float16) keeps GPU memory usage low; float32 is the safe
# fallback on CPU, where float16 arithmetic is poorly supported.
_dtype = torch.float16 if device == "cuda" else torch.float32
model = Model(device=device, dtype=_dtype)

# --- CLI ARGUMENTS PARSING ---
# Command-line options controlling how the interface is exposed. Public access
# tunnels localhost traffic through a temporary Gradio share URL, which is
# useful for short-lived external evaluations over the internet.
parser = argparse.ArgumentParser()
# NOTE: action='store_true' already implies a default of False, so no explicit
# default is needed. Help text typo fixed ("access" -> "accessed").
parser.add_argument('--public_access', action='store_true',
                    help="if enabled, the app can be accessed from a public url")
args = parser.parse_args()

# --- WEB INTERFACE ARCHITECTURE ---
# Assembles the Gradio Application Block layout, injecting structured HTML context and 
# encapsulating the discrete video synthesis module instance utilizing the neural pipeline.
with gr.Blocks() as demo:

    gr.HTML(
        """
        <style>
            .title-link {
                color: white !important;
                text-decoration: none !important;
                border-bottom: none !important;
                transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
                display: block;
            }
            .title-link:hover {
                transform: scale(1.01);
                text-shadow: 0 0 20px rgba(255,255,255,0.3);
                cursor: pointer;
            }
            @keyframes floating {
                0% { transform: translateY(0px) rotate(0deg); }
                25% { transform: translateY(-5px) rotate(-5deg); }
                75% { transform: translateY(5px) rotate(5deg); }
                100% { transform: translateY(0px) rotate(0deg); }
            }
            .camera-anim {
                display: inline-block;
                animation: floating 4s infinite ease-in-out;
                margin-right: 10px;
            }
        </style>
        <div style="background: linear-gradient(135deg, #4A00E0 0%, #8E2DE2 100%); padding: 3rem; border-radius: 20px; text-align: center; margin-bottom: 2rem; box-shadow: 0 10px 30px rgba(0,0,0,0.1);">
            <a href="https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION" target="_blank" class="title-link">
                <h1 style="color: white; font-size: 3.5rem; font-weight: 800; margin: 0; text-shadow: 2px 2px 4px rgba(0,0,0,0.2); letter-spacing: -1px;">
                    <span class="camera-anim">πŸŽ₯</span> Zero-Shot Video Generation
                </h1>
            </a>
            <p style="color: rgba(255,255,255,0.9); font-size: 1.3rem; margin-top: 1rem; font-weight: 500;">
                Text-to-Video Studio using Temporal Latent Warping & Cross-Frame Attention
            </p>
        </div>
        """
    )

    with gr.Tab('Zero-Shot Text2Video'):
        # Invoke the pre-defined layout specific to the Text-to-Video generative logic, passing 
        # the initialized main diffusion model capable of handling the temporal latent inference.
        create_demo_text_to_video(model)

    gr.HTML(
        """
        <div style="text-align: center; margin-top: 3rem; padding: 2.5rem; border-radius: 15px; background: rgba(142, 45, 226, 0.05); border: 1px solid rgba(142, 45, 226, 0.1);">
            <p style="color: #4A00E0; font-size: 1rem; font-weight: 600; margin: 0;">
                Β© 2023 <a href="https://github.com/Amey-Thakur" target="_blank" style="color: #8E2DE2; text-decoration: none !important; border-bottom: none !important; transition: all 0.3s ease;">Amey Thakur</a> | University of Windsor
            </p>
            <p style="color: #777; font-size: 0.85rem; margin-top: 0.75rem; max-width: 600px; margin-left: auto; margin-right: auto; line-height: 1.5;">
                <b>Research Foundation:</b> Based on foundational breakthroughs in zero-shot temporal consistency by Picsart AI Research (PAIR), UT Austin, U of Oregon, and UIUC.
            </p>
        </div>
        """
    )

# --- APPLICATION DEPLOYMENT ---
# Launch the assembled interface. queue() serializes concurrent generation
# requests so simultaneous users cannot oversaturate the worker threads.
#
# NOTE(review): the original code passed theme= (both branches) and css=
# (local branch) to launch(). Those are gr.Blocks() constructor parameters;
# Blocks.launch() does not accept them and raises TypeError on unknown
# keywords, so they are removed here. To style the app, pass theme/css to
# gr.Blocks(...) where the interface is constructed.
if on_huggingspace:
    demo.queue().launch(
        debug=True,
        ssr_mode=False,
    )
else:
    # launch() returns (app, local_url, share_url); share_url is None unless
    # a public share link was requested via --public_access.
    _, local_url, share_url = demo.queue().launch(
        allowed_paths=['temporal'],
        share=args.public_access,
    )
    # Print the public link when available, otherwise the local URL —
    # previously this printed "None" whenever --public_access was off.
    print(share_url or local_url)