import gradio as gr
import os
from huggingface_hub import InferenceClient
import tempfile
import shutil
from pathlib import Path
from typing import Optional
import time
# -------------------------
# Utilities
# -------------------------
def cleanup_temp_files():
"""Clean up old temporary video files"""
try:
temp_dir = tempfile.gettempdir()
for file_path in Path(temp_dir).glob("*.mp4"):
try:
                # Remove temp MP4s more than five minutes (300 s) old
                if file_path.stat().st_mtime < (time.time() - 300):
file_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as e:
print(f"Cleanup error: {e}")
def _client_from_token(token: Optional[str]) -> InferenceClient:
"""Create InferenceClient from user's OAuth token"""
if not token:
raise gr.Error("Please sign in first. This app requires your Hugging Face login.")
# IMPORTANT: do not set bill_to when using user OAuth tokens
# This ensures the user is billed, not Hugging Face
return InferenceClient(
provider="fal-ai",
api_key=token,
)
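
# Note: Gradio injects the signed-in user's credentials into event handlers whose
# signatures are annotated with gr.OAuthProfile / gr.OAuthToken; the token object's
# .token attribute is the string expected by _client_from_token() above.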
def _save_bytes_as_temp_mp4(data: bytes) -> str:
"""Save video bytes to temporary MP4 file"""
temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
try:
temp_file.write(data)
temp_file.flush()
return temp_file.name
finally:
temp_file.close()
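
# Temp files written here are left in place for Gradio to serve; they are
# reclaimed by cleanup_temp_files() once they are more than five minutes old.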
def generate_video_with_auth(
    image,
    prompt,
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None = None,
    progress=gr.Progress(),
):
    """
    Generate a video from an image using the Ovi model, after checking authentication.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        prompt: Text prompt describing the desired motion/animation
        profile: OAuth profile of the signed-in user (injected by Gradio)
        oauth_token: OAuth token of the signed-in user (injected by Gradio)
        progress: Gradio progress tracker

    Returns:
        Tuple of (video_path, status_message)
    """
try:
        # Check authentication: require both a signed-in profile and a usable OAuth token
        if profile is None or oauth_token is None:
            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
if image is None:
return None, "❌ Please upload an image first!"
if not prompt or prompt.strip() == "":
return None, "❌ Please enter a prompt describing the desired motion!"
progress(0.2, desc="Processing image...")
cleanup_temp_files()
# Read the image file
if isinstance(image, str):
# If image is a file path
with open(image, "rb") as image_file:
input_image = image_file.read()
else:
# If image is PIL Image or array
import io
from PIL import Image as PILImage
if isinstance(image, PILImage.Image):
buffer = io.BytesIO()
image.save(buffer, format='PNG')
input_image = buffer.getvalue()
else:
# Assume it's a numpy array
pil_image = PILImage.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format='PNG')
input_image = buffer.getvalue()
progress(0.4, desc="Generating video with AI...")
        # Create the client with the user's OAuth token (never the Space's HF_TOKEN)
        # and without bill_to, so generations are billed to the signed-in user.
        client = _client_from_token(oauth_token.token)
# Generate video using the inference client
try:
video = client.image_to_video(
input_image,
prompt=prompt,
model="chetwinlow1/Ovi",
)
        except Exception as e:
            # fal-ai responds with 403 when the account lacks credits or permission;
            # huggingface_hub surfaces this as an HTTPError with a .response attached.
            import requests
            if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'chetwinlow1/Ovi'."
            raise
progress(0.9, desc="Finalizing video...")
# Save the video to a temporary file
video_path = _save_bytes_as_temp_mp4(video)
progress(1.0, desc="Complete!")
        suffix = "..." if len(prompt) > 60 else ""
        return video_path, f"✅ Video generated successfully! Prompt: '{prompt[:60]}{suffix}'"
except gr.Error as e:
return None, f"❌ {str(e)}"
except Exception as e:
return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later. Error: {str(e)}"
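
# A minimal local smoke test of the same call path (hypothetical: assumes a personal
# token in the HF_TOKEN env var with fal-ai access and the bundled 5.png example):
#
#   client = _client_from_token(os.environ["HF_TOKEN"])
#   with open("5.png", "rb") as f:
#       video_bytes = client.image_to_video(f.read(), prompt="The man smiles.", model="chetwinlow1/Ovi")
#   print(_save_bytes_as_temp_mp4(video_bytes))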
def clear_all():
"""Clear all inputs and outputs"""
return None, "", None, ""
# Custom CSS for better styling
custom_css = """
.container {
max-width: 1200px;
margin: auto;
}
.header-link {
text-decoration: none;
color: #2196F3;
font-weight: bold;
}
.header-link:hover {
text-decoration: underline;
}
.status-box {
padding: 10px;
border-radius: 5px;
margin-top: 10px;
}
.notice {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 14px 16px;
border-radius: 12px;
margin: 18px auto 6px;
max-width: 860px;
text-align: center;
font-size: 0.98rem;
}
.info-box {
background-color: #f0f7ff;
border-left: 4px solid #4285f4;
padding: 1em;
margin: 1em 0;
border-radius: 4px;
}
.special-tokens-box {
background: linear-gradient(135deg, #ffeaa7 0%, #fdcb6e 100%);
padding: 1em;
margin: 1em 0;
border-radius: 8px;
border-left: 4px solid #e17055;
}
"""
# Create the Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Ovi Image-to-Video Generator (Paid)") as demo:
# Header with payment notice
gr.HTML(
"""
<div style="text-align:center; padding:2em 1em 1em;">
<h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Ovi: Image-to-Video with Audio</h1>
<p style="color:#777; margin:0 0 8px;">Generate synchronized video and audio from images</p>
<div class="notice">
<b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
            Free users get <b>$0.10 in included credits</b>; <b>PRO users</b> get <b>$2 in included credits</b>
            and can keep generating beyond that with usage-based billing.
            <a href='https://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
            for more credits. Please sign in with your Hugging Face account to continue.
</div>
<p style="font-size: 0.9em; color: #999; margin-top: 10px;">
Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
</p>
</div>
"""
)
gr.Markdown(
"""
### Transform your static images into dynamic videos with synchronized audio using AI!
        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via [Hugging Face Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference)
"""
)
# Add login button - required for OAuth
login_btn = gr.LoginButton("Sign in with Hugging Face")
gr.HTML(
"""
<div class="info-box">
            <strong>💡 Tips for best results:</strong>
<ul>
<li>Use clear, well-lit images with a single main subject</li>
<li>Write specific prompts describing the desired motion or action</li>
<li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Generated videos are 5 seconds long at 24 FPS with synchronized audio</li>
<li>Processing may take 30-60 seconds depending on server load</li>
</ul>
</div>
"""
)
gr.HTML(
"""
<div class="special-tokens-box">
<strong>✨ Special Tokens for Enhanced Control:</strong>
<ul>
<li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
<li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
</ul>
<br>
<strong>πŸ“ Example Prompt:</strong><br>
<code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
</div>
"""
)
with gr.Row():
with gr.Column(scale=1):
image_input = gr.Image(
label="πŸ“Έ Upload Image",
type="pil",
sources=["upload", "clipboard"],
height=400,
)
prompt_input = gr.Textbox(
label="✍️ Text Prompt",
placeholder="Describe the motion and audio you want... (e.g., 'A person walking forward while talking')",
lines=4,
max_lines=6
)
with gr.Row():
generate_btn = gr.Button(
"🎬 Generate Video",
variant="primary",
scale=2
)
clear_btn = gr.Button(
"πŸ—‘οΈ Clear",
variant="secondary",
scale=1
)
status_output = gr.Textbox(
label="Status",
interactive=False,
visible=True,
elem_classes=["status-box"]
)
gr.Examples(
examples=[
[
"5.png",
'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
]
],
inputs=[image_input, prompt_input],
label="Example Prompts",
)
with gr.Column(scale=1):
video_output = gr.Video(
label="πŸŽ₯ Generated Video",
height=400,
autoplay=True,
show_download_button=True
)
gr.Markdown(
"""
### About Ovi Model
**Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**
Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)
🌟 **Key Features:**
                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 📝 **Flexible Input**: Supports text-only or text+image conditioning
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports a 720×720 area at various ratios (9:16, 16:9, 1:1, etc.)

                Ovi is a Veo-3-style model that simultaneously generates both video and audio content from text or text+image inputs.
                ---
                ### 💳 Pricing Information
                This app uses Hugging Face Inference Providers (provider: fal-ai), which charge based on usage:
                - **Free users**: $0.10 in included credits
                - **PRO users**: $2 in included credits, plus the ability to continue with billing
                [Subscribe to PRO](https://huggingface.co/subscribe/pro?source=ovi) for more credits and features!
"""
)
# How to Use section
with gr.Accordion("πŸ“– How to Use", open=False):
gr.Markdown(
"""
### Getting Started:
1. **Sign in** with your Hugging Face account using the button above
2. **Upload** your image - any photo or illustration
3. **Describe** the motion and audio you want in the prompt
4. **Use special tokens** for speech and audio descriptions (optional but recommended)
5. **Generate** and watch your image come to life with synchronized audio!
### Special Tokens Guide:
**Speech Token**: `<S>text<E>`
- Use this to add spoken dialogue to your video
- Example: `The person says <S>Hello, how are you?<E>`
**Audio Description Token**: `<AUDCAP>description<ENDAUDCAP>`
- Use this to describe background sounds and audio effects
- Example: `<AUDCAP>Birds chirping, gentle wind blowing<ENDAUDCAP>`
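
        Putting both tokens together, a complete prompt might look like:
        `A dog runs to the door. Its owner says <S>Welcome home!<E>. <AUDCAP>Paws tapping on wood, cheerful voice, door creaking<ENDAUDCAP>`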
### Tips for Better Results:
- Be specific and descriptive in your prompts
- Combine visual motion descriptions with audio elements
- Use high-quality input images for better results
- Experiment with different prompts and special tokens
- Processing takes 30-60 seconds per generation
### ⚠️ Important Notes:
- This is a **paid app** that uses your inference provider credits
- Each generation consumes credits based on processing time
- Free accounts have limited credits ($0.10)
- PRO accounts get more credits ($2) and can continue with billing
- Videos are 5 seconds long at 24 FPS
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc.)
"""
)
gr.Markdown(
"""
---
        ### 🔗 Resources
- [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
- [Character AI](https://character.ai)
- [Hugging Face Inference API Docs](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Subscribe to PRO](https://huggingface.co/subscribe/pro?source=ovi)
        ### 📊 Model Specifications
- **Provider**: fal-ai
- **Model**: chetwinlow1/Ovi
- **Output**: 5-second videos at 24 FPS with audio
- **Input**: Image + Text prompt
        - **Resolution**: 720×720 area (various aspect ratios)
"""
)
# Event handlers with authentication
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output, status_output],
        show_progress="full",
        # The queue must stay enabled for this event: gr.Progress updates
        # only reach the UI through the queue.
        api_name=False,
        show_api=False,
    )
clear_btn.click(
fn=clear_all,
inputs=[],
outputs=[image_input, prompt_input, video_output, status_output],
queue=False,
)
# Launch the app
if __name__ == "__main__":
try:
cleanup_temp_files()
if os.path.exists("gradio_cached_examples"):
shutil.rmtree("gradio_cached_examples", ignore_errors=True)
except Exception as e:
print(f"Initial cleanup error: {e}")
demo.queue(status_update_rate="auto", api_open=False, default_concurrency_limit=None)
demo.launch(
show_api=False,
share=False,
show_error=True,
enable_monitoring=False,
quiet=True,
)