# veo / app.py
# citoreh's picture
# Update app.py
# 6142548 verified
import json
import os
import gradio as gr
import google.generativeai as genai
from pydantic import BaseModel, Field, AnyUrl
from typing import List, Optional
# The google.colab import was removed for Hugging Face Spaces compatibility.
# The API key is taken from the process environment (for Hugging Face Spaces).
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# Fail at import time when the key is missing or empty: every request needs it.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable is required")

# Register the key with the google.generativeai client.
genai.configure(api_key=GOOGLE_API_KEY)
class Shot(BaseModel):
    """Technical camera details for a specific clip."""

    # Required: framing and lens.
    composition: str = Field(
        ...,
        description="How the shot is framed and the lens used. Examples: 'Medium close-up, 35mm lens, deep focus, smooth gimbal', 'Extreme wide shot, 14mm lens, drone establishing shot with slow reveal', 'Dutch angle, 85mm portrait lens, handheld with intentional camera shake', 'Over-the-shoulder shot, 50mm lens, shallow depth of field'.",
    )
    # Optional: the default is None, so the annotation must admit None as well;
    # otherwise an explicit null in the input is rejected by validation.
    camera_motion: Optional[str] = Field(
        None,
        description="Describes the movement of the camera during the shot. Examples: 'slow dolly-in 60 cm', 'fast-paced tracking shot following the subject', 'static tripod shot with no movement', 'smooth jib arm crane movement from low to high', 'handheld push-in with slight wobble', 'circular dolly around subject'.",
    )
    # Defaults to "24 fps" when omitted.
    frame_rate: str = Field(
        "24 fps",
        description="Frames per second, defining the motion look (24fps is cinematic). Examples: '24 fps', '60 fps for slow-motion effect', '120 fps for extreme slow motion', '12 fps for vintage or stop-motion feel'.",
    )
    # Optional for the same reason as camera_motion (None default).
    film_grain: Optional[float] = Field(
        None,
        description="Adds a stylistic film grain effect (0=none, higher values=more grain). Examples: 0.05, 0.15, 0.0, 0.3.",
    )
    # Required: camera/equipment style.
    camera: str = Field(
        ...,
        description="Camera lens, shot type, and equipment style for this clip. Examples: 'smooth gimbal 35mm', 'handheld iPhone with anamorphic lens adapter', 'RED camera on Steadicam rig', 'vintage 16mm film camera with prime lens', 'GoPro on a helmet mount', 'Arri Alexa Mini with a zoom lens'.",
    )
class Subject(BaseModel):
    """Describes the character's appearance and wardrobe within a specific clip."""

    # Both fields are required (Ellipsis default).
    description: str = Field(
        default=...,
        description="A full, descriptive prompt of the character for this shot. Examples: 'Nyx Cipher β€” 27-year-old, 173 cm, toned-athletic build; deep-bronze skin glistening with water; jet-black slicked-back hair; almond hazel eyes behind mirrored sunglasses; small star tattoo behind right ear; wearing metallic-coral bikini and gold hoop earrings', 'Marcus Chen β€” 45-year-old chef, 180 cm, sturdy build; weathered hands from years of cooking; salt-and-pepper beard; warm brown eyes with laugh lines; wearing pristine white chef's coat with rolled sleeves', 'Luna-7 β€” ageless android appearing 25, 165 cm, sleek synthetic build; luminescent pale blue skin with circuit patterns; chrome-silver hair in geometric bob; violet LED eyes; wearing form-fitting matte black bodysuit with glowing accents'.",
    )
    wardrobe: str = Field(
        default=...,
        description="The specific outfit worn in this clip. This can be based on the character's default_outfit. Examples: 'metallic-coral bikini, mirrored sunglasses, gold hoop earrings', 'weathered leather jacket, ripped jeans, combat boots, fingerless gloves', 'flowing emerald silk gown with intricate beadwork, diamond tiara', 'tactical gear with kevlar vest, utility belt, night vision goggles'.",
    )
class Scene(BaseModel):
    """Describes the setting and environment of the clip."""

    # Required.
    location: str = Field(
        default=...,
        description="The physical place where the scene occurs. Examples: 'rooftop infinity pool overlooking a neon-tropic city skyline', 'abandoned Victorian mansion with overgrown ivy and broken windows', 'bustling Tokyo street market during cherry blossom season', 'underground speakeasy with dim lighting and jazz atmosphere'.",
    )
    # Falls back to "mid-day" when not supplied.
    time_of_day: str = Field(
        default="mid-day",
        description="The time of day, which heavily influences lighting. Examples: 'mid-day', 'golden hour just before sunset', 'blue hour twilight', 'dead of night with only moonlight', 'early morning with soft dawn light', 'overcast afternoon'.",
    )
    # Required.
    environment: str = Field(
        default=...,
        description="Specific details about the surroundings. Examples: 'sunlit pool water reflecting shifting patterns; floating dollar-sign inflatables; heavy rain creating puddles that reflect neon signs; steam rising from manholes', 'gentle snowfall accumulating on windowsills; warm light spilling from cozy windows', 'desert wind kicking up sand clouds; distant lightning illuminating cacti silhouettes'.",
    )
    # Optional; None when not supplied.
    weather: Optional[str] = Field(
        default=None,
        description="Specific weather conditions for the scene. Examples: 'heavy rain', 'gentle snowfall', 'sunny', 'overcast', 'foggy', 'windy'.",
    )
class VisualDetails(BaseModel):
    """Describes the actions and props within the clip."""

    # Required: the on-screen action.
    action: str = Field(
        ...,
        description="What the character is physically doing in the scene. Examples: 'Nyx leans on pool edge and, on beat four, fans her hand cheekily toward camera as droplets sparkle in the air', 'Marcus carefully plates microgreens with tweezers, each movement precise and deliberate', 'Luna-7 interfaces with a holographic display, her fingers dancing through floating data streams', 'character parkours across rooftops, leaping between buildings with fluid grace'.",
    )
    # Optional: the default is None, so the annotation admits None too and an
    # explicit null in the input validates instead of being rejected.
    props: Optional[str] = Field(
        None,
        description="Objects that appear or are interacted with in the scene. Examples: 'floating dollar-sign inflatables', 'antique brass telescope pointing toward star-filled sky', 'holographic chess set with pieces that glow and float', 'vintage motorcycle with chrome details and leather saddlebags'.",
    )
class Cinematography(BaseModel):
    """Defines the artistic visual style for this clip."""

    # All three fields are required (Ellipsis default).
    lighting: str = Field(
        default=...,
        description="Specific lighting direction for this shot. Examples: 'high-key mid-day sunlight with specular highlights on wet skin', 'dramatic chiaroscuro lighting with deep shadows and bright highlights', 'soft window light with gauzy curtains creating dappled patterns', 'neon-lit night scene with colorful reflections on wet pavement', 'candlelit interior with warm, flickering ambiance'.",
    )
    tone: str = Field(
        default=...,
        description="The intended mood and feeling of the clip. Examples: 'vibrant, playful, confident', 'dark, suspenseful, and mysterious', 'warm, nostalgic, and sentimental', 'ethereal, dreamlike, and surreal', 'gritty, intense, and raw'.",
    )
    color_grade: str = Field(
        default=...,
        description="The color correction and mood for this clip. Examples: 'hyper-saturated neon-tropic (hot-pink, aqua, tangerine)', 'desaturated, gritty, and cool-toned for a noir look', 'warm, golden tones to evoke nostalgia', 'high-contrast black and white with selective color pops', 'teal and orange blockbuster color scheme'.",
    )
class AudioTrack(BaseModel):
    """Defines the sound elements specific to this clip."""

    # -- Optional vocal/reference fields (all default to None) --
    lyrics: Optional[str] = Field(
        None,
        description="The lyrics to be lip-synced or heard. Examples: 'Splash-cash, bling-blapβ€”pool water pshh! Charts skrrt! like my wave, hot tropics whoosh!', 'In the silence of the ancient halls, whispers of forgotten souls call', 'Dancing through the neon lights, city never sleeps at night', 'Breaking chains of yesterday, finding strength to walk away'.",
    )
    emotion: Optional[str] = Field(
        None,
        description="The emotional tone of the vocal performance. Examples: 'confident, tongue-in-cheek', 'somber and melancholic', 'energetic and joyful', 'haunting and ethereal', 'aggressive and defiant', 'tender and vulnerable'.",
    )
    flow: Optional[str] = Field(
        None,
        description="The rhythm and cadence of the lyrical delivery (especially for rap). Examples: 'double-time for first bar, brief half-time tag', 'slow, spoken-word style with dramatic pauses', 'melodic and sing-song with flowing transitions', 'staccato rapid-fire delivery', 'syncopated rhythm with off-beat emphasis'.",
    )
    wave_download_url: Optional[AnyUrl] = Field(
        None,
        description="A URL to a pre-existing audio file for this clip (if available).",
    )
    youtube_reference: Optional[AnyUrl] = Field(
        None,
        description="A URL to a YouTube video as a reference for style or content.",
    )
    audio_base64: Optional[str] = Field(
        None,
        description="A base64 encoded string of the audio data, for embedding it directly.",
    )
    # -- Fields from former AudioDefaults --
    format: str = Field(
        "wav",
        description="The desired audio file format. Examples: 'wav', 'mp3', 'flac', 'aac'.",
    )
    sample_rate_hz: int = Field(
        48000,
        description="The audio quality in Hertz, affecting fidelity. Examples: 48000, 44100, 96000, 192000.",
    )
    channels: int = Field(
        2,
        description="The number of audio channels. Examples: 2 (stereo), 1 (mono), 6 (5.1 surround), 8 (7.1 surround).",
    )
    # Optional: the default is None, so the annotation admits None too and an
    # explicit null in the input validates instead of being rejected.
    style: Optional[str] = Field(
        None,
        description="Describes the musical genre, tempo, and elements for this track. Examples: 'trap-pop rap, 145 BPM, swung hats, sub-bass', 'orchestral score with sweeping strings and dramatic percussion, 60 BPM', 'lo-fi hip hop, 80 BPM, jazzy chords, vinyl crackle', 'synthwave with arpeggiated basslines and retro drums, 120 BPM'.",
    )
class Dialogue(BaseModel):
    """Defines the spoken lines and how they are presented."""

    # Required: who speaks.
    character: str = Field(
        default=...,
        description="The character who is speaking. Examples: 'Nyx Cipher', 'The Mysterious Stranger', 'AI System Voice', 'Narrator'.",
    )
    # Required: what is said.
    line: str = Field(
        default=...,
        description="The exact line of dialogue or lyrics. Examples: 'Splash-cash, bling-blapβ€”pool water pshh! Charts skrrt! like my wave, hot tropics whoosh!', 'The memories are all that remain of what we once were', 'Access granted. Welcome to the future', 'In a world where nothing is as it seems...'.",
    )
    # Defaults to False; the description instructs that it stay False.
    subtitles: bool = Field(
        default=False,
        description="A boolean to determine if subtitles should be rendered for this line. Subtitles should always be false. Never add subtitles to the video.",
    )
class Performance(BaseModel):
    """Controls for the character's animated performance in this clip."""

    # Both overrides are optional and default to None.
    mouth_shape_intensity: Optional[float] = Field(
        default=None,
        description="Clip-specific override for lip-sync exaggeration (0=subtle, 1=exaggerated). Examples: 0.85, 0.3, 1.0, 0.1.",
    )
    eye_contact_ratio: Optional[float] = Field(
        default=None,
        description="Clip-specific override for how often the character looks at the camera. Examples: 0.7, 0.1, 1.0, 0.5.",
    )
# -- Main Clip Model --
class Clip(BaseModel):
    """Defines a single video segment or shot."""

    # Required identifier for the clip.
    id: str = Field(
        default=...,
        description="A unique identifier for this specific clip. Examples: 'S1_SplashCash', 'Forest_Intro_001', 'Cyberpunk_Market_Scene_3B', 'Chase_Sequence_Final'.",
    )

    # Required nested sections, one sub-model each.
    shot: Shot
    subject: Subject
    scene: Scene
    visual_details: VisualDetails
    cinematography: Cinematography
    audio_track: AudioTrack
    dialogue: Dialogue

    # Optional per-clip performance overrides; None when absent.
    performance: Optional[Performance] = None

    # Required length of the clip, in whole seconds.
    duration_sec: int = Field(
        default=...,
        description="The exact duration of this clip in seconds. Examples: 8, 15, 3, 30, 45.",
    )
    # Falls back to "16:9" when not supplied.
    aspect_ratio: str = Field(
        default="16:9",
        description="The aspect ratio for this specific clip. Examples: '16:9' (standard widescreen), '9:16' (vertical/mobile), '2.35:1' (cinematic), '4:3' (classic), '1:1' (square).",
    )
class CharacterProfile(BaseModel):
    """A detailed, consistent profile of the character's core attributes."""

    # Every field is required except distinguishing_marks.
    name: str = Field(
        ...,
        description="The primary name of the character. Examples: 'Nyx Cipher', 'Kaelen the Shadowmancer', 'Unit 734', 'Dr. Sarah Chen'.",
    )
    age: int = Field(
        ...,
        description="Character's apparent age. Examples: 27, 350, 5, 72, 16.",
    )
    height: str = Field(
        ...,
        description="Character's height, can include multiple units. Examples: '5'8\" / 173 cm', '7'2\" / 218 cm', '4'11\" / 150 cm', '6'0\" / 183 cm'.",
    )
    build: str = Field(
        ...,
        description="Describes the character's body type and physique. Examples: 'lean, athletic, swimmer's shoulders', 'stocky and muscular', 'delicate and ethereal', 'tall and lanky with dancer's grace', 'compact and powerful'.",
    )
    skin_tone: str = Field(
        ...,
        description="Defines the color and texture of the character's skin. Examples: 'deep bronze with a subtle sun-kissed glow', 'pale porcelain with a dusting of freckles', 'rich ebony with natural luminescence', 'olive-toned with weathered texture', 'metallic, iridescent scales'.",
    )
    hair: str = Field(
        ...,
        description="Describes hair color, length, and style. Examples: 'jet-black, shoulder-length, slicked straight back and dripping', 'silver-white pixie cut with asymmetrical bangs', 'auburn curls cascading past the shoulders', 'buzz-cut platinum blonde', 'bald with intricate henna patterns'.",
    )
    eyes: str = Field(
        ...,
        description="Details the shape and color of the character's eyes. Examples: 'almond-shaped hazel with faint gold flecks', 'wide, ice-blue and piercing', 'deep brown with warm amber highlights', 'green eyes with heterochromia (one blue)', 'glowing crimson without pupils'.",
    )
    # Optional: the default is None, so the annotation admits None too and an
    # explicit null (a character with no marks) validates instead of failing.
    distinguishing_marks: Optional[str] = Field(
        None,
        description="Unique features like tattoos, scars, or piercings. Examples: 'tiny star tattoo tucked behind her right ear; gold stud in upper left helix', 'jagged lightning-bolt scar across the left temple', 'intricate sleeve tattoo depicting ocean waves', 'network of glowing cybernetic implants along the jawline'.",
    )
    demeanour: str = Field(
        ...,
        description="The character's typical personality, mood, and expression. Examples: 'playfully self-assured, almost dare-you smirk', 'stoic and world-weary with gentle eyes', 'manic energy with unpredictable mood swings', 'calm and collected with hidden intensity', 'warm and approachable with infectious laughter'.",
    )
    # -- Fields from former GlobalStyle --
    default_outfit: str = Field(
        ...,
        description="The character's default or primary outfit. Examples: 'metallic-coral bikini, mirrored sunglasses, gold hoop earrings', 'charcoal wool coat over vintage band t-shirt and distressed jeans', 'flowing white linen dress with delicate embroidery', 'tactical black jumpsuit with utility harness', 'three-piece pinstripe suit with pocket watch'.",
    )
    mouth_shape_intensity: float = Field(
        ...,
        description="Controls the exaggeration of mouth movements for lip-syncing (0=subtle, 1=exaggerated). Examples: 0.85, 0.5, 1.0, 0.25.",
    )
    eye_contact_ratio: float = Field(
        ...,
        description="The percentage of time the character should be looking directly at the camera. Examples: 0.7, 0.2, 0.9, 0.5.",
    )
class VideoSchema(BaseModel):
    """The root model, containing a list of characters to be generated."""

    # Required list of character profiles.
    characters: List[CharacterProfile] = Field(
        default=...,
        description="A detailed, consistent profile of the character's core attributes.",
    )
    # Required list of clip definitions.
    clips: List[Clip] = Field(
        default=...,
        description="An array containing definitions for each individual video segment or shot.",
    )
def generate_script_gradio(idea: str, progress=gr.Progress()) -> tuple[str, str, str]:
    """
    Generates a video script and returns the content, file path, and status.

    The idea is embedded in a prompt sent to the Gemini model, the JSON reply
    is parsed, written to ``scripts/<slug>.json`` and returned pretty-printed.

    Args:
        idea: The user's video idea
        progress: Gradio progress tracker

    Returns:
        tuple: (script_content, file_path, status_message). On any failure the
        first two items are empty strings and the status carries the error.
    """
    try:
        # Guard against None as well as blank/whitespace-only input.
        if not idea or not idea.strip():
            return "", "", "❌ Please enter a video idea"

        # Create output directory
        output_dir = "scripts"
        os.makedirs(output_dir, exist_ok=True)
        progress(0.1, desc="Initializing script generation...")

        # Generate script using Gemini
        progress(0.3, desc="Generating script with AI...")
        model = genai.GenerativeModel("gemini-1.5-pro")
        # Doubled braces are literal braces in an f-string. The height example
        # uses \\" so the prompt shows a correctly escaped quote (valid JSON).
        prompt = f"""
        Create a detailed video script for this idea: {idea}
        The video can only be a maximum of 8 seconds. Make sure to always include a dialogue.
        Return a JSON object with this exact structure:
        {{
            "characters": [
                {{
                    "name": "character name",
                    "age": 25,
                    "height": "5'8\\" / 173 cm",
                    "build": "athletic build description",
                    "skin_tone": "skin color and texture",
                    "hair": "hair color and style",
                    "eyes": "eye color and shape",
                    "distinguishing_marks": "tattoos, scars, etc",
                    "demeanour": "personality and mood",
                    "default_outfit": "clothing description",
                    "mouth_shape_intensity": 0.7,
                    "eye_contact_ratio": 0.6
                }}
            ],
            "clips": [
                {{
                    "id": "unique_clip_id",
                    "shot": {{
                        "composition": "camera framing and lens",
                        "camera_motion": "camera movement",
                        "frame_rate": "24 fps",
                        "film_grain": 0.1,
                        "camera": "camera type and setup"
                    }},
                    "subject": {{
                        "description": "full character description for this shot",
                        "wardrobe": "specific outfit for this clip"
                    }},
                    "scene": {{
                        "location": "where the scene takes place",
                        "time_of_day": "lighting time",
                        "environment": "environmental details",
                        "weather": "weather conditions"
                    }},
                    "visual_details": {{
                        "action": "what the character is doing",
                        "props": "objects in the scene"
                    }},
                    "cinematography": {{
                        "lighting": "lighting setup and mood",
                        "tone": "emotional tone of the clip",
                        "color_grade": "color correction style"
                    }},
                    "audio_track": {{
                        "lyrics": "dialogue or lyrics",
                        "emotion": "emotional tone",
                        "flow": "rhythm and cadence",
                        "format": "wav",
                        "sample_rate_hz": 48000,
                        "channels": 2,
                        "style": "audio style description"
                    }},
                    "dialogue": {{
                        "character": "character name",
                        "line": "exact dialogue line",
                        "subtitles": false
                    }},
                    "duration_sec": 8,
                    "aspect_ratio": "16:9"
                }}
            ]
        }}
        """
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                response_mime_type="application/json",
                temperature=0.7,
            ),
        )
        progress(0.7, desc="Processing generated script...")

        # Parse the response
        video_schema = json.loads(response.text)

        # Create filename: first 30 characters, lowercased, separators turned
        # into dashes, then anything that is not alphanumeric/'-'/'_' dropped.
        safe_filename = idea[:30].lower().replace(' ', '-').replace('/', '-').replace('\\', '-')
        safe_filename = ''.join(c for c in safe_filename if c.isalnum() or c in '-_')
        # An idea made only of punctuation would otherwise yield ".json".
        if not safe_filename.strip('-_'):
            safe_filename = "script"
        filename = f"{safe_filename}.json"
        filepath = os.path.join(output_dir, filename)
        progress(0.9, desc="Saving script to file...")

        # Serialize once; the same text is saved and shown in the UI.
        script_content = json.dumps(video_schema, indent=2, ensure_ascii=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(script_content)

        progress(1.0, desc="Script generation completed!")
        status_message = f"✅ Script generated successfully!\n📄 Saved to: {filepath}"
        return script_content, filepath, status_message
    except json.JSONDecodeError as e:
        # The model reply was not parseable JSON; say so explicitly.
        return "", "", f"❌ Error generating script: model returned invalid JSON ({e})"
    except Exception as e:
        # UI boundary: report any other failure in the status box instead of raising.
        error_message = f"❌ Error generating script: {str(e)}"
        return "", "", error_message
def download_script(file_path: str):
    """Return ``file_path`` when it is non-empty and exists on disk, else None."""
    is_available = bool(file_path) and os.path.exists(file_path)
    return file_path if is_available else None
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the script generator.

    Layout: an idea textbox, a generate button and a status box on the left;
    the generated JSON and a (initially hidden) download file on the right;
    example ideas underneath. Clicking the button or pressing Enter in the
    textbox both run the same generation handler.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(
        title="Video Script Generator",
        theme=gr.themes.Soft(),
        css="""
        .container { max-width: 1200px; margin: auto; }
        .header { text-align: center; margin-bottom: 2rem; }
        .script-output { max-height: 500px; overflow-y: auto; }
        """
    ) as demo:
        gr.HTML("""
        <div class="header">
        <h1>🎬 AI Video Script Generator</h1>
        <p>Generate detailed video scripts with character profiles, scenes, and cinematography</p>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                idea_input = gr.Textbox(
                    label="💡 Video Idea",
                    placeholder="Enter your video concept (e.g., 'A mysterious detective solving a case in a neon-lit cyberpunk city')",
                    lines=4,
                    max_lines=8
                )
                generate_btn = gr.Button(
                    "🎯 Generate Script",
                    variant="primary",
                    size="lg"
                )
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2
                )
            with gr.Column(scale=2):
                script_output = gr.Code(
                    label="📜 Generated Script (JSON)",
                    language="json",
                    lines=20,
                    elem_classes=["script-output"]
                )
                file_output = gr.File(
                    label="💾 Download Script",
                    visible=False
                )
        # Examples
        gr.Examples(
            examples=[
                ["A cyberpunk hacker infiltrating a corporate database"],
                ["A chef preparing a magical dish in an enchanted kitchen"],
                ["A space explorer discovering an alien artifact"],
                ["A detective questioning a suspect in a film noir setting"],
                ["A musician performing on a rooftop at sunset"]
            ],
            inputs=idea_input,
            label="💭 Example Ideas"
        )

        # Event handlers
        def handle_generation(idea, progress=gr.Progress()):
            # Gradio only injects a live tracker into the event handler itself,
            # so it is accepted here and forwarded; otherwise the progress(...)
            # calls inside generate_script_gradio would do nothing.
            script_content, file_path, status = generate_script_gradio(idea, progress)
            # On failure file_path is "", so pass None rather than an empty
            # path and keep the download component hidden.
            file_component = gr.File(value=file_path or None, visible=bool(file_path))
            return script_content, status, file_component

        # The button click and Enter in the textbox share one handler and one
        # set of inputs/outputs.
        for register in (generate_btn.click, idea_input.submit):
            register(
                fn=handle_generation,
                inputs=[idea_input],
                outputs=[script_output, status_output, file_output]
            )
    return demo
# Launch the app when this file is run as a script.
if __name__ == "__main__":
    launch_options = {
        "share": True,
        "inbrowser": True,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)