import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
from huggingface_hub import InferenceClient
| |
|
| | |
# Shared Inference API client used by all requests in this app.
# Requests are routed through the "fal-ai" provider and billed to the
# "huggingface" organization account.
client = InferenceClient(
    provider="fal-ai",
    # NOTE(review): None when HF_TOKEN is unset — inference calls will then
    # fail with an authentication error; confirm the Space sets this secret.
    api_key=os.environ.get("HF_TOKEN"),
    bill_to="huggingface",
)
| |
|
def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model with authentication check.

    Args:
        image: Input image (PIL Image or file path).
        prompt: Text prompt describing the desired motion/animation.
        profile: OAuth profile for authentication; None when the user is not
            signed in with Hugging Face.
        progress: Gradio progress tracker.

    Returns:
        Path to the generated video file (a temporary .mp4).

    Raises:
        gr.Error: when the user is not signed in, an input is missing, or the
            inference call fails.
    """
    if profile is None:
        raise gr.Error("Click Sign in with Hugging Face button to use this app for free")

    if image is None:
        raise gr.Error("Please upload an image first!")

    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt describing the desired motion!")

    temp_image_path = None
    try:
        progress(0.2, desc="Processing image...")

        # Obtain raw image bytes: either straight from the given path
        # (gr.Image uses type="filepath") or by round-tripping a PIL image
        # through a temporary PNG file.
        if isinstance(image, str):
            with open(image, "rb") as image_file:
                input_image = image_file.read()
        else:
            # Close the handle before writing to the path so this also works
            # on Windows, where an open NamedTemporaryFile cannot be reopened.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                temp_image_path = tmp.name
            image.save(temp_image_path)
            with open(temp_image_path, "rb") as image_file:
                input_image = image_file.read()

        progress(0.4, desc="Generating video with AI...")

        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")

        # Reserve a temp .mp4 path that Gradio can serve back to the client.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as out:
            output_path = out.name

        if isinstance(video, str) and os.path.exists(video):
            # Provider returned a local file path: copy it into place.
            shutil.copy(video, output_path)
        else:
            # Provider returned raw bytes (or a bytes-like object): the
            # original duplicated this write in two branches; one suffices.
            with open(output_path, "wb") as f:
                f.write(video)

        progress(1.0, desc="Complete!")

        return output_path

    except gr.Error:
        # Let user-facing errors propagate untouched instead of being
        # re-wrapped in the generic message below.
        raise
    except Exception as e:
        raise gr.Error(f"Error generating video: {str(e)}")
    finally:
        # Bug fix: the original leaked the intermediate PNG on every
        # PIL-image invocation. Best-effort cleanup here.
        if temp_image_path is not None:
            try:
                os.unlink(temp_image_path)
            except OSError:
                pass
| |
|
| | |
# --- UI definition -------------------------------------------------------
# Single-page Gradio Blocks app: OAuth login gate, image + prompt inputs on
# the left, generated video on the right, wired to generate_video_with_auth.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    .auth-warning {
        color: #ff6b00;
        font-weight: bold;
        text-align: center;
        margin: 1em 0;
    }
    """,
    title="Image to Video Generator with Ovi",
) as demo:

    # Attribution banner.
    gr.HTML(
        """
        <div class="main-header">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
                Built with anycoder β¨
            </a>
        </div>
        """
    )

    # Title and short description.
    gr.Markdown(
        """
        # π¬ Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via HuggingFace Inference API.
        """
    )

    # Sign-in requirement notice shown above the login button.
    gr.HTML(
        """
        <div class="auth-warning">
            β οΈ You must Sign in with Hugging Face using the button below to use this app.
        </div>
        """
    )

    # OAuth login; supplies the gr.OAuthProfile consumed by
    # generate_video_with_auth.
    gr.LoginButton()

    # Usage tips.
    gr.HTML(
        """
        <div class="info-box">
            <strong>π‘ Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # Documentation of the Ovi prompt special tokens (speech / audio caption).
    gr.HTML(
        """
        <div class="info-box">
            <strong>β¨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>π Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    with gr.Row():
        # Left column: inputs and action buttons.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="πΈ Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="βοΈ Text Prompt",
                lines=3,
            )

            generate_btn = gr.Button(
                "π¬ Generate Video",
                variant="primary",
                size="lg",
            )

            clear_btn = gr.Button(
                "ποΈ Clear",
                variant="secondary",
            )

            # Canned example; assumes "5.png" exists next to this script —
            # TODO confirm the asset is shipped with the Space.
            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example",
            )

        # Right column: output video and model information.
        with gr.Column(scale=1):
            video_output = gr.Video(
                label="π₯ Generated Video",
                height=400,
                autoplay=True,
            )

            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                π **Key Features:**
                - π¬ **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - π **Flexible Input**: Supports text-only or text+image conditioning
                - β±οΈ **5-second Videos**: Generates 5-second videos at 24 FPS
                - π **Multiple Aspect Ratios**: Supports 720Γ720 area at various ratios (9:16, 16:9, 1:1, etc)

                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
                """
            )

    # Wire the generate button to the inference function. queue=False and
    # api_name=False keep this endpoint off the public API surface.
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        queue=False,
        api_name=False,
        show_api=False,
    )

    # Reset all three components to their empty states.
    clear_btn.click(
        fn=lambda: (None, "", None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
        queue=False,
    )

    # Footer: how-to, caveats, and external links.
    gr.Markdown(
        """
        ---

        ### π How it works

        1. **Sign in** with your Hugging Face account
        2. **Upload** your image - any photo or illustration
        3. **Describe** the motion you want to see in the prompt
        4. **Generate** and watch your image come to life!

        ### β οΈ Notes

        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720Γ720 area
        - Requires a valid HuggingFace token with Inference API access
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions

        ### π Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [HuggingFace Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Character AI](https://character.ai)
        """
    )
| |
|
| | |
# Launch the app only when this file is executed directly (not on import).
if __name__ == "__main__":
    demo.launch(
        show_api=False,           # hide the auto-generated API docs page
        enable_monitoring=False,  # disable Gradio usage monitoring
        quiet=True,               # suppress startup logging
    )