| import gradio as gr |
| import os |
| from huggingface_hub import InferenceClient |
| from pathlib import Path |
| import tempfile |
|
|
| |
# Shared inference client used by the video-generation callback.
# The API key is read from the HF_TOKEN environment variable (None when unset);
# requests go through the "fal-ai" provider and are billed to "huggingface".
client = InferenceClient(
    provider="fal-ai",
    api_key=os.getenv("HF_TOKEN"),
    bill_to="huggingface",
)
|
|
def _read_image_bytes(image):
    """Return the encoded bytes of *image* (a file path or an object with ``save(path)``)."""
    if isinstance(image, str):
        with open(image, "rb") as image_file:
            return image_file.read()

    # Non-path inputs (e.g. a PIL image) are serialised through a temporary
    # PNG. The handle is closed before saving so the path can be reopened on
    # every platform, and the file is always removed afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
        temp_path = temp_image.name
    try:
        image.save(temp_path)
        with open(temp_path, "rb") as image_file:
            return image_file.read()
    finally:
        os.remove(temp_path)


def _store_video(video):
    """Write *video* (raw bytes or a path to an existing file) to a new temp ``.mp4``; return its path."""
    import shutil

    # delete=False: the file must outlive this function so the caller can
    # serve it; only the handle is closed here.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as output_file:
        output_path = output_file.name
    try:
        if isinstance(video, str) and os.path.exists(video):
            shutil.copy(video, output_path)
        else:
            with open(output_path, "wb") as f:
                f.write(video)
    except Exception:
        # Do not leave an empty or partial video file behind on failure.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise
    return output_path


def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model with authentication check.

    Args:
        image: Input image (file path, or an image object exposing ``save(path)``)
        prompt: Text prompt describing the desired motion/animation
        profile: OAuth profile of the signed-in user, or None when not signed in
        progress: Gradio progress tracker

    Returns:
        Path to the generated video file (a temporary ``.mp4``)

    Raises:
        gr.Error: If the user is not signed in, the image or prompt is missing,
            or any step of the generation fails.
    """
    if profile is None:
        raise gr.Error("Click Sign in with Hugging Face button to use this app for free")

    if image is None:
        raise gr.Error("Please upload an image first!")

    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")
        input_image = _read_image_bytes(image)

        progress(0.4, desc="Generating video with AI...")
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")
        output_path = _store_video(video)

        progress(1.0, desc="Complete!")
        return output_path
    except Exception as e:
        # Surface every failure as a Gradio error; keep the original
        # exception chained for server-side debugging.
        raise gr.Error(f"Error generating video: {e}") from e
|
|
| |
# Build the UI: header and help text, sign-in button, input/output columns,
# then the event wiring for the two buttons.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    .auth-warning {
        color: #ff6b00;
        font-weight: bold;
        text-align: center;
        margin: 1em 0;
    }
    """,
    title="Image to Video Generator with Ovi",
) as demo:

    gr.HTML(
        """
        <div class="main-header">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
                Built with anycoder ✨
            </a>
        </div>
        """
    )

    gr.Markdown(
        """
        # 🎬 Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via HuggingFace Inference API.
        """
    )

    gr.HTML(
        """
        <div class="auth-warning">
            ⚠️ You must Sign in with Hugging Face using the button below to use this app.
        </div>
        """
    )

    # Sign-in button; generate_video_with_auth rejects calls made without a profile.
    gr.LoginButton()

    gr.HTML(
        """
        <div class="info-box">
            <strong>💡 Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # The special tokens are written as HTML entities so the browser displays
    # them literally; raw "<S>" would be parsed as a (strikethrough) element.
    gr.HTML(
        """
        <div class="info-box">
            <strong>✨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>📝 Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    with gr.Row():
        # Left column: inputs, action buttons and a ready-made example.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="📸 Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="✍️ Text Prompt",
                lines=3,
            )

            generate_btn = gr.Button(
                "🎬 Generate Video",
                variant="primary",
                size="lg",
            )

            clear_btn = gr.Button(
                "🗑️ Clear",
                variant="secondary",
            )

            # The example prompt is plain text for the Textbox, so its
            # special tokens stay unescaped here.
            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example",
            )

        # Right column: generated video and model background.
        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
            )

            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                🌟 **Key Features:**
                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 📝 **Flexible Input**: Supports text-only or text+image conditioning
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)

                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
                """
            )

    # Only the image and prompt are listed as inputs; the `profile` and
    # `progress` parameters of the callback are not bound to components.
    # NOTE(review): Gradio's docs tie progress display to the queue, so with
    # queue=False the gr.Progress updates may not be shown - confirm intended.
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        queue=False,
        api_name=False,
        show_api=False,
    )

    # Reset the image, prompt and video components to their empty state.
    clear_btn.click(
        fn=lambda: (None, "", None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
        queue=False,
    )

    gr.Markdown(
        """
        ---

        ### 🚀 How it works

        1. **Sign in** with your Hugging Face account
        2. **Upload** your image - any photo or illustration
        3. **Describe** the motion you want to see in the prompt
        4. **Generate** and watch your image come to life!

        ### ⚠️ Notes

        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
        - Requires a valid HuggingFace token with Inference API access
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions

        ### 🔗 Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [HuggingFace Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Character AI](https://character.ai)
        """
    )
|
|
| |
# Start the app only when this file is executed as a script; importing the
# module just builds `demo`.
if __name__ == "__main__":
    launch_options = {
        "show_api": False,
        "enable_monitoring": False,
        "quiet": True,
    }
    demo.launch(**launch_options)