# Zero-Shot-Video-Generation / Source Code / app_text_to_video.py
# (Hugging Face Hub page residue — uploader "ameythakur", commit message
# "text2video", revision 57868fa verified — kept here as provenance only;
# not part of the program.)
# ==================================================================================================
# ZERO-SHOT-VIDEO-GENERATION - app_text_to_video.py (Gradio UI Components)
# ==================================================================================================
#
# πŸ“ DESCRIPTION
# This module constructs the structural interface for the Text2Video-Zero generation task. It
# defines the modular Gradio UI components, formulates the layout parameters, and specifies the
# data bindings between visual controls (like sliders, dropdowns, and buttons) and the underlying
# neural processing model. Designed for modularity, it manages state interactions specifically for
# translating textual representations into dynamic video sequences.
#
# 👀 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
#
# 🤝🏻 CREDITS
# Based directly on the foundational logic of Text2Video-Zero.
# Source Authors: Picsart AI Research (PAIR), UT Austin, U of Oregon, UIUC
# Reference: https://arxiv.org/abs/2303.13439
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION
# Live Demo: https://huggingface.co/spaces/ameythakur/Zero-Shot-Video-Generation
# Video Demo: https://youtu.be/za9hId6UPoY
#
# 📅 RELEASE DATE
# November 22, 2023
#
# 📜 LICENSE
# Released under the MIT License
# ==================================================================================================
import gradio as gr
from model import Model
import os
from hf_utils import get_model_list
# Detect whether this process is running inside a Hugging Face Space
# (Spaces always export SPACE_ID into the environment).
on_huggingspace = "SPACE_ID" in os.environ
# Curated baseline prompts known to yield temporally consistent clips with
# the default photoreal checkpoint. Each row supplies a value for EVERY
# bound input (prompt, model_name, video_length) so that selecting an
# example never leaves a component unset (which would surface as NoneType
# errors during example handling).
_EXAMPLE_MODEL = "dreamlike-art/dreamlike-photoreal-2.0"
_EXAMPLE_FRAMES = 2
_EXAMPLE_PROMPTS = (
    "an astronaut waving the arm on the moon",
    "a sloth surfing on a wakeboard",
    "an astronaut walking on a street",
    "a cute cat walking on grass",
    "a horse is galloping on a street",
    "an astronaut is skiing down the hill",
    "a gorilla walking alone down the street",
    "a gorilla dancing on times square",
    "A panda dancing dancing like crazy on Times Square",
)
examples = [[text, _EXAMPLE_MODEL, _EXAMPLE_FRAMES] for text in _EXAMPLE_PROMPTS]
def create_demo(model: Model):
    """
    Construct the Gradio interface for the text-to-video generation task.

    Binds the user-facing controls (model dropdown, prompt box, frame-count
    control) to the supplied inference ``model`` and returns the assembled —
    but not yet launched — layout.

    Args:
        model: Inference wrapper whose ``process_text2video`` method performs
            the actual generation when the user clicks the button.

    Returns:
        The wired ``gr.Blocks`` demo object.
    """
    # Local import: torch is only needed here to probe for CUDA availability.
    import torch
    is_cpu = not torch.cuda.is_available()

    # Keyword bridge between the Gradio inputs and the model call. Explicit
    # keywords prevent positional-argument misalignment (e.g., video_length
    # being passed as motion_field_strength_x).
    def generate_video(prompt, model_name, video_length):
        return model.process_text2video(
            prompt=prompt,
            model_name=model_name,
            video_length=int(video_length),
        )

    with gr.Blocks() as demo:
        # Hero banner: project title plus a link to the companion Colab notebook.
        with gr.Row():
            gr.HTML(
                """
                <div style="background: rgba(142,45,226,0.1); padding: 1.5rem; border-left: 5px solid #8E2DE2; border-radius: 10px; margin-bottom: 1.5rem; display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                    <div style="flex: 1; min-width: 300px;">
                        <h2 style="font-weight: 700; font-size: 1.6rem; margin: 0; color: #4A00E0;">
                            Zero-Shot Video Studio
                        </h2>
                        <p style="margin-top: 0.5rem; color: #555; font-size: 1rem; font-weight: 500; line-height: 1.4;">
                            Transform cinematic text prompts into dynamic, temporally consistent AI video. Choose a diffusion model, describe your vision, and generate instantly.
                        </p>
                    </div>
                    <a href="https://colab.research.google.com/github/Amey-Thakur/ZERO-SHOT-VIDEO-GENERATION/blob/main/ZERO-SHOT-VIDEO-GENERATION.ipynb" target="_blank" style="text-decoration: none; transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); display: inline-flex; align-items: center; background: white; padding: 0.7rem 1.2rem; border-radius: 12px; border: 1px solid rgba(142,45,226,0.15); box-shadow: 0 4px 15px rgba(0,0,0,0.05);">
                        <div style="display: flex; flex-direction: column; align-items: flex-start; margin-right: 1rem;">
                            <span style="font-size: 0.7rem; font-weight: 800; color: #8E2DE2; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 2px;">Neural Architecture</span>
                            <span style="font-size: 0.95rem; font-weight: 600; color: #4A00E0;">Verified Research Notebook</span>
                        </div>
                        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="height: 22px;"/>
                    </a>
                </div>
                """
            )

        # Surface a capability notice when no CUDA device is available.
        if is_cpu:
            gr.HTML(
                """
                <div style="background: rgba(255,165,0,0.1); padding: 1rem; border-left: 4px solid #FFA500; border-radius: 8px; margin-bottom: 1rem;">
                    <p style="margin: 0; color: #856404; font-size: 0.9rem; line-height: 1.6;">
                        <strong>⚡ CPU Mode.</strong> Running on free-tier hardware. Resolution and frames are reduced to fit this environment.
                        Full resolution on T4 GPU works on Google Colab. Click the notebook link above to try it.
                    </p>
                </div>
                """
            )

        with gr.Row(equal_height=False):
            with gr.Column(scale=1, variant="panel"):
                gr.Markdown("### ✨ Model & Concept Configuration")
                # Diffusion checkpoint selector and free-form text prompt.
                model_name = gr.Dropdown(
                    label="Diffusion Strategy (Model)",
                    choices=get_model_list(),
                    value="dreamlike-art/dreamlike-photoreal-2.0",
                )
                prompt = gr.Textbox(
                    label='Cinematic Prompt',
                    placeholder="Describe the scene in detail (e.g. 'an astronaut waving the arm on the moon')...",
                    lines=3
                )
                run_button = gr.Button(value='Generate Sequence 🎬', variant='primary', size="lg")
                # Advanced options: the frame-count control is adapted to the
                # execution environment (free CPU < hosted Space < local run).
                with gr.Accordion('🛠️ Advanced Options', open=False):
                    if is_cpu:
                        # Free-tier CPU: keep clips tiny so inference finishes.
                        video_length = gr.Slider(
                            label="Video Timeline (Frames)", minimum=2, maximum=4, step=1, value=2)
                    elif on_huggingspace:
                        # Hosted Space: moderate clip lengths.
                        video_length = gr.Slider(
                            label="Video Timeline (Frames)", minimum=8, maximum=16, step=1, value=8)
                    else:
                        # Local run: unconstrained integer frame count.
                        video_length = gr.Number(
                            label="Video Timeline (Frames)", value=8, precision=0)

            with gr.Column(scale=1):
                # Output panel displaying the synthesized clip.
                gr.Markdown("### 🎞️ Output Stream")
                result = gr.Video(label="Synthesized Video Result", height=380)

        # Shared input ordering used by both the examples gallery and the
        # generate button; must match generate_video's parameter order.
        inputs = [
            prompt,
            model_name,
            video_length,
        ]

        # cache_examples is disabled: video generation is GPU-intensive and
        # will time out on free-tier Spaces. Users click "Generate" to run
        # inference on-demand instead.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=generate_video,
                    run_on_click=False,
                    cache_examples=False,
                    )

        # Wire the primary action button to the generation wrapper.
        run_button.click(fn=generate_video,
                         inputs=inputs,
                         outputs=result,)
    return demo