# ================================================================================================== # ZERO-SHOT-VIDEO-GENERATION - app_text_to_video.py (Gradio UI Components) # ================================================================================================== # # 📝 DESCRIPTION # This module constructs the structural interface for the Text2Video-Zero generation task. It # defines the modular Gradio UI components, formulates the layout parameters, and specifies the # data bindings between visual controls (like sliders, dropdowns, and buttons) and the underlying # neural processing model. Designed for modularity, it manages state interactions specifically for # translating textual representations into dynamic video sequences. # # 👤 AUTHORS # - Amey Thakur (https://github.com/Amey-Thakur) # # 🤝🏻 CREDITS # Based directly on the foundational logic of Text2Video-Zero. # Source Authors: Picsart AI Research (PAIR), UT Austin, U of Oregon, UIUC # Reference: https://arxiv.org/abs/2303.13439 # # 🔗 PROJECT LINKS # Repository: https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION # Live Demo: https://huggingface.co/spaces/ameythakur/Zero-Shot-Video-Generation # Video Demo: https://youtu.be/za9hId6UPoY # # 📅 RELEASE DATE # November 22, 2023 # # 📜 LICENSE # Released under the MIT License # ================================================================================================== import gradio as gr from model import Model import os from hf_utils import get_model_list # Determine the operational execution context. on_huggingspace = os.environ.get("SPACE_ID") is not None # Predefined contextual exemplars establishing baseline structural validation inputs. # These prompts generate optimal temporal consistency in generated outputs utilizing the # latent diffusion methodology. Each example must provide values for ALL bound inputs # (prompt, model_name, video_length) to prevent NoneType errors during example caching. 
examples = [
    ["an astronaut waving the arm on the moon", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["a sloth surfing on a wakeboard", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["an astronaut walking on a street", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["a cute cat walking on grass", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["a horse is galloping on a street", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["an astronaut is skiing down the hill", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["a gorilla walking alone down the street", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["a gorilla dancing on times square", "dreamlike-art/dreamlike-photoreal-2.0", 2],
    ["A panda dancing dancing like crazy on Times Square", "dreamlike-art/dreamlike-photoreal-2.0", 2],
]


def create_demo(model: Model):
    """
    Constructs and returns the interactive elements of the Gradio interface for textual inputs.

    Binds the local inference 'model' context to user-facing input handlers to coordinate state
    between the UI framework and the PyTorch execution context.

    Parameters
    ----------
    model : Model
        Project inference wrapper exposing ``process_text2video``; invoked when the user
        clicks the generate button.

    Returns
    -------
    gr.Blocks
        The fully wired (but not yet launched) Gradio Blocks layout.
    """
    # Imported lazily so merely importing this module does not force torch initialization.
    import torch
    # CPU fallback detection drives reduced frame limits below.
    is_cpu = not torch.cuda.is_available()

    # Wrapper function ensuring correct keyword argument mapping between the Gradio UI
    # components and the model's process_text2video method. This prevents positional argument
    # misalignment (e.g., video_length being passed as motion_field_strength_x).
    def generate_video(prompt, model_name, video_length):
        # video_length arrives as a float from gr.Slider/gr.Number; coerce to int for the model.
        return model.process_text2video(
            prompt=prompt,
            model_name=model_name,
            video_length=int(video_length),
        )

    # Instantiate the declarative layout constructor.
    with gr.Blocks() as demo:
        # Banner: static HTML header describing the app.
        with gr.Row():
            gr.HTML(
                """

Zero-Shot Video Studio

Transform cinematic text prompts into dynamic, temporally consistent AI video. Choose a diffusion model, describe your vision, and generate instantly.

Neural Architecture Verified Research Notebook
Colab
"""
            )
        # Warn free-tier users that CPU execution forces reduced resolution/frame counts.
        if is_cpu:
            gr.HTML(
                """

⚡ CPU Mode. Running on free-tier hardware. Resolution and frames are reduced to fit this environment. Full resolution on T4 GPU works on Google Colab. Click the notebook link above to try it.

"""
            )
        with gr.Row(equal_height=False):
            # Left column: model selection, prompt entry, and advanced options.
            with gr.Column(scale=1, variant="panel"):
                gr.Markdown("### ✨ Model & Concept Configuration")
                # Configuration block controlling diffusion model weights and textual targets.
                model_name = gr.Dropdown(
                    label="Diffusion Strategy (Model)",
                    choices=get_model_list(),
                    value="dreamlike-art/dreamlike-photoreal-2.0",
                )
                prompt = gr.Textbox(
                    label='Cinematic Prompt',
                    placeholder="Describe the scene in detail (e.g. 'an astronaut waving the arm on the moon')...",
                    lines=3
                )
                run_button = gr.Button(value='Generate Sequence 🎬', variant='primary', size="lg")
                # Expandable execution variables defining trajectory lengths (temporal depth).
                with gr.Accordion('🛠️ Advanced Options', open=False):
                    # Adapting video constraints algorithmically based on the execution domain constraints:
                    # CPU gets 2-4 frames, hosted Spaces get a bounded 8-16 slider, and local GPU
                    # runs get an unbounded integer field.
                    if is_cpu:
                        video_length = gr.Slider(
                            label="Video Timeline (Frames)",
                            minimum=2, maximum=4, step=1, value=2)
                    elif on_huggingspace:
                        video_length = gr.Slider(
                            label="Video Timeline (Frames)",
                            minimum=8, maximum=16, step=1, value=8)
                    else:
                        video_length = gr.Number(
                            label="Video Timeline (Frames)",
                            value=8, precision=0)
            # Right column: rendered output.
            with gr.Column(scale=1):
                # Instantiation of the rendering element to visualize synthesized structures.
                gr.Markdown("### 🎞️ Output Stream")
                result = gr.Video(label="Synthesized Video Result", height=380)

        # Ordered input list; MUST match generate_video's parameter order
        # (prompt, model_name, video_length) and the column order of `examples`.
        inputs = [
            prompt,
            model_name,
            video_length,
        ]

        # Bind curated input permutations to expedite visualization pathways.
        # cache_examples is disabled: video generation is GPU-intensive and will timeout
        # on free-tier Spaces. Users click "Generate" to run inference on-demand instead.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=generate_video,
                    run_on_click=False,
                    cache_examples=False,
                    )

        # Trigger execution of the generative framework upon interactive activation.
        run_button.click(fn=generate_video,
                         inputs=inputs,
                         outputs=result,)

    return demo