|
|
import gradio as gr |
|
|
import os |
|
|
from huggingface_hub import InferenceClient |
|
|
import tempfile |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
from typing import Optional |
|
|
import time |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cleanup_temp_files(max_age_seconds: float = 300) -> None:
    """Delete stale ``*.mp4`` files from the system temporary directory.

    This is best-effort housekeeping: a file that cannot be inspected or
    removed is skipped, and any other failure is printed rather than raised,
    so calling this never interrupts the caller.

    NOTE: the glob matches every ``*.mp4`` directly inside
    ``tempfile.gettempdir()``, not only files written by this module.

    Args:
        max_age_seconds: Files whose modification time lies more than this
            many seconds in the past are removed. Defaults to 300, the value
            that used to be hard-coded.
    """
    try:
        temp_dir = Path(tempfile.gettempdir())
        # Computed once so every file is compared against the same instant.
        cutoff = time.time() - max_age_seconds
        for file_path in temp_dir.glob("*.mp4"):
            try:
                if file_path.stat().st_mtime < cutoff:
                    file_path.unlink(missing_ok=True)
            except OSError:
                # File vanished, is locked, or is not ours to delete: skip it.
                continue
    except Exception as e:
        print(f"Cleanup error: {e}")
|
|
|
|
|
def _client_from_token(token: Optional[str]) -> InferenceClient:
    """Return an ``InferenceClient`` for provider ``fal-ai`` keyed with *token*.

    Args:
        token: The signed-in user's access token, or ``None``/empty when no
            token is available.

    Raises:
        gr.Error: If *token* is ``None`` or empty.
    """
    if token:
        return InferenceClient(provider="fal-ai", api_key=token)
    raise gr.Error("Please sign in first. This app requires your Hugging Face login.")
|
|
|
|
|
def _save_bytes_as_temp_mp4(data: bytes) -> str:
    """Write *data* to a new ``.mp4`` file in the temp directory.

    The file is created with ``delete=False`` so it outlives this call; the
    caller owns it afterwards.

    Args:
        data: Raw video bytes to store.

    Returns:
        Filesystem path of the file that was written.

    Raises:
        Exception: Whatever the write raised (e.g. ``TypeError`` for non-bytes
            input, ``OSError`` for a full disk). The partially written file is
            removed before the error propagates.
    """
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    try:
        # Leaving the ``with`` block flushes and closes the handle.
        with temp_file:
            temp_file.write(data)
    except Exception:
        # delete=False means nobody else will clean this up; do not leave an
        # orphaned, partial file behind when the write fails.
        try:
            Path(temp_file.name).unlink(missing_ok=True)
        except OSError:
            pass
        raise
    return temp_file.name
|
|
|
|
|
def _image_to_bytes(image) -> bytes:
    """Return the encoded bytes of *image* (a path, a PIL image, or an array)."""
    if isinstance(image, (str, os.PathLike)):
        # A file path: send the file's bytes unchanged.
        with open(image, "rb") as image_file:
            return image_file.read()

    import io

    from PIL import Image as PILImage

    # Anything that is not already a PIL image is handed to fromarray().
    pil_image = image if isinstance(image, PILImage.Image) else PILImage.fromarray(image)
    buffer = io.BytesIO()
    pil_image.save(buffer, format="PNG")
    return buffer.getvalue()


def generate_video_with_auth(
    image,
    prompt,
    profile: gr.OAuthProfile | None,
    progress=gr.Progress(),
    oauth_token: gr.OAuthToken | None = None,
):
    """Generate a video from an image with the Ovi model, for a signed-in user.

    Every failure is reported through the returned status message instead of
    being raised, so the UI always receives a ``(video, status)`` pair.

    Args:
        image: Input image as a file path, a PIL image, or an array accepted
            by ``PIL.Image.fromarray``.
        prompt: Text prompt describing the desired motion/animation.
        profile: OAuth profile of the signed-in user; ``None`` when nobody is
            signed in.
        progress: Gradio progress tracker.
        oauth_token: OAuth token of the signed-in user. Per the Gradio OAuth
            documentation this is injected from the type annotation, like
            ``profile``. When it is absent, the legacy
            ``profile.oauth_info.access_token`` attribute path is tried.

    Returns:
        Tuple of ``(video_path, status_message)``; ``video_path`` is ``None``
        whenever generation did not succeed.
    """
    try:
        # Validate the request before spending any provider credits.
        if profile is None:
            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
        if image is None:
            return None, "❌ Please upload an image first!"
        if not prompt or not prompt.strip():
            return None, "❌ Please enter a prompt describing the desired motion!"

        progress(0.2, desc="Processing image...")

        cleanup_temp_files()
        input_image = _image_to_bytes(image)

        progress(0.4, desc="Generating video with AI...")

        token = oauth_token.token if oauth_token is not None else None
        if not token:
            # Fallback for callers that still attach the token to the profile.
            legacy_info = getattr(profile, "oauth_info", None)
            token = getattr(legacy_info, "access_token", None)
        # Raises gr.Error when no token could be found (handled below).
        client = _client_from_token(token)

        try:
            video = client.image_to_video(
                input_image,
                prompt=prompt,
                model="chetwinlow1/Ovi",
            )
        except Exception as e:
            # Duck-typed status check: works for any HTTP error type that
            # carries a response, without importing a specific HTTP library
            # (an import failure here would mask the real error).
            response = getattr(e, "response", None)
            if getattr(response, "status_code", None) == 403:
                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'chetwinlow1/Ovi'."
            raise

        progress(0.9, desc="Finalizing video...")

        video_path = _save_bytes_as_temp_mp4(video)

        progress(1.0, desc="Complete!")

        # Only add an ellipsis when the prompt was actually shortened.
        shown_prompt = prompt if len(prompt) <= 60 else f"{prompt[:60]}..."
        return video_path, f"✅ Video generated successfully! Prompt: '{shown_prompt}'"

    except gr.Error as e:
        return None, f"❌ {e}"
    except Exception as e:
        return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later. Error: {e}"
|
|
|
|
|
def clear_all():
    """Return blank values for the image, prompt, video and status components."""
    empty_media = None
    empty_text = ""
    return empty_media, empty_text, empty_media, empty_text
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=custom_css) below.
#   .status-box          - applied to the status textbox via elem_classes
#   .notice              - gradient banner in the header HTML
#   .info-box            - "tips" panel
#   .special-tokens-box  - special-tokens panel
# .container and .header-link are not referenced elsewhere in this file.
custom_css = """
.container {
    max-width: 1200px;
    margin: auto;
}
.header-link {
    text-decoration: none;
    color: #2196F3;
    font-weight: bold;
}
.header-link:hover {
    text-decoration: underline;
}
.status-box {
    padding: 10px;
    border-radius: 5px;
    margin-top: 10px;
}
.notice {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 14px 16px;
    border-radius: 12px;
    margin: 18px auto 6px;
    max-width: 860px;
    text-align: center;
    font-size: 0.98rem;
}
.info-box {
    background-color: #f0f7ff;
    border-left: 4px solid #4285f4;
    padding: 1em;
    margin: 1em 0;
    border-radius: 4px;
}
.special-tokens-box {
    background: linear-gradient(135deg, #ffeaa7 0%, #fdcb6e 100%);
    padding: 1em;
    margin: 1em 0;
    border-radius: 8px;
    border-left: 4px solid #e17055;
}
"""
|
|
|
|
|
|
|
|
# Gradio UI definition: header, sign-in button, help panels, the input/output
# columns, reference sections, and the event wiring for the two buttons.
# NOTE(review): this copy of the file had lost its indentation, so the layout
# nesting below (which components sit in which Row/Column) was reconstructed
# from statement order - confirm against the running app.
# NOTE(review): several emoji inside the UI strings appear mis-encoded in this
# copy; they are left exactly as found because they cannot be recovered
# reliably.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Ovi Image-to-Video Generator (Paid)") as demo:

    # Header with the paid-usage notice.
    gr.HTML(
        """
        <div style="text-align:center; padding:2em 1em 1em;">
            <h1 style="font-size:2.2em; margin-bottom:6px;">π¬ Ovi: Image-to-Video with Audio</h1>
            <p style="color:#777; margin:0 0 8px;">Generate synchronized video and audio from images</p>
            <div class="notice">
                <b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
                Free users get <b>$0.10 in included credits</b>. <b>PRO users</b> get <b>$2 in included credits</b>
                and can continue using beyond that (with billing).
                <a href='http://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
                for more credits. Please sign in with your Hugging Face account to continue.
            </div>
            <p style="font-size: 0.9em; color: #999; margin-top: 10px;">
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
            </p>
        </div>
        """
    )

    gr.Markdown(
        """
        ### Transform your static images into dynamic videos with synchronized audio using AI!

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference)
        """
    )

    # Sign-in button; the generate handler refuses to run without a login.
    login_btn = gr.LoginButton("Sign in with Hugging Face")

    gr.HTML(
        """
        <div class="info-box">
            <strong>π‘ Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # The token markers are written as HTML entities: raw <S>, <E>, <AUDCAP>
    # would be parsed by the browser as tags and never shown to the user.
    gr.HTML(
        """
        <div class="special-tokens-box">
            <strong>β¨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>π Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    with gr.Row():
        # Left column: inputs, action buttons, status and examples.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="πΈ Upload Image",
                type="pil",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="βοΈ Text Prompt",
                placeholder="Describe the motion and audio you want... (e.g., 'A person walking forward while talking')",
                lines=4,
                max_lines=6
            )

            with gr.Row():
                generate_btn = gr.Button(
                    "π¬ Generate Video",
                    variant="primary",
                    scale=2
                )

                clear_btn = gr.Button(
                    "ποΈ Clear",
                    variant="secondary",
                    scale=1
                )

            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                visible=True,
                elem_classes=["status-box"]
            )

            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example Prompts",
            )

        # Right column: generated video and model information.
        with gr.Column(scale=1):
            video_output = gr.Video(
                label="π₯ Generated Video",
                height=400,
                autoplay=True,
                show_download_button=True
            )

            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                π **Key Features:**
                - π¬ **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - π **Flexible Input**: Supports text-only or text+image conditioning
                - β±οΈ **5-second Videos**: Generates 5-second videos at 24 FPS
                - π **Multiple Aspect Ratios**: Supports 720Γ720 area at various ratios (9:16, 16:9, 1:1, etc)

                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.

                ---

                ### π³ Pricing Information

                This app uses the Hugging Face Inference API (provider: fal-ai) which charges based on usage:
                - **Free users**: $0.10 in included credits
                - **PRO users**: $2 in included credits + ability to continue with billing

                [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi) for more credits and features!
                """
            )

    with gr.Accordion("π How to Use", open=False):
        gr.Markdown(
            """
            ### Getting Started:
            1. **Sign in** with your Hugging Face account using the button above
            2. **Upload** your image - any photo or illustration
            3. **Describe** the motion and audio you want in the prompt
            4. **Use special tokens** for speech and audio descriptions (optional but recommended)
            5. **Generate** and watch your image come to life with synchronized audio!

            ### Special Tokens Guide:

            **Speech Token**: `<S>text<E>`
            - Use this to add spoken dialogue to your video
            - Example: `The person says <S>Hello, how are you?<E>`

            **Audio Description Token**: `<AUDCAP>description<ENDAUDCAP>`
            - Use this to describe background sounds and audio effects
            - Example: `<AUDCAP>Birds chirping, gentle wind blowing<ENDAUDCAP>`

            ### Tips for Better Results:
            - Be specific and descriptive in your prompts
            - Combine visual motion descriptions with audio elements
            - Use high-quality input images for better results
            - Experiment with different prompts and special tokens
            - Processing takes 30-60 seconds per generation

            ### β οΈ Important Notes:
            - This is a **paid app** that uses your inference provider credits
            - Each generation consumes credits based on processing time
            - Free accounts have limited credits ($0.10)
            - PRO accounts get more credits ($2) and can continue with billing
            - Videos are 5 seconds long at 24 FPS
            - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc)
            """
        )

    gr.Markdown(
        """
        ---

        ### π Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [Character AI](https://character.ai)
        - [Hugging Face Inference API Docs](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi)

        ### π Model Specifications

        - **Provider**: fal-ai
        - **Model**: chetwinlow1/Ovi
        - **Output**: 5-second videos at 24 FPS with audio
        - **Input**: Image + Text prompt
        - **Resolution**: 720Γ720 area (various aspect ratios)
        """
    )

    # Generation is long-running and reports progress through gr.Progress,
    # so this event must go through the queue (no queue=False here).
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output, status_output],
        show_progress="full",
        api_name=False,
        show_api=False,
    )

    # Clearing is instantaneous, so it can skip the queue.
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[image_input, prompt_input, video_output, status_output],
        queue=False,
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Startup housekeeping is best-effort: report problems, then launch anyway.
    try:
        cleanup_temp_files()
        # ignore_errors=True makes this a no-op when the directory is absent.
        shutil.rmtree("gradio_cached_examples", ignore_errors=True)
    except Exception as e:
        print(f"Initial cleanup error: {e}")

    demo.queue(status_update_rate="auto", api_open=False, default_concurrency_limit=None)

    launch_options = {
        "show_api": False,
        "share": False,
        "show_error": True,
        "enable_monitoring": False,
        "quiet": True,
    }
    demo.launch(**launch_options)