Sahil committed on
Commit
71ad6f4
Β·
verified Β·
1 Parent(s): 66865aa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +296 -0
app.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
from huggingface_hub import InferenceClient
6
+
7
# Initialize the shared inference client used by every generation request.
# - provider="fal-ai": routes inference through the fal.ai provider.
# - api_key: read from the HF_TOKEN env var; None if unset, which defers
#   failure to request time rather than import time — TODO confirm that is
#   the intended behavior for local runs without a token.
# - bill_to="huggingface": presumably charges usage to the "huggingface"
#   org account — verify against the Space's billing setup.
client = InferenceClient(
    provider="fal-ai",
    api_key=os.environ.get("HF_TOKEN"),
    bill_to="huggingface",
)
13
+
14
def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model, gated behind
    Hugging Face OAuth sign-in.

    Args:
        image: Input image — a file path (str) or a PIL Image object.
        prompt: Text prompt describing the desired motion/animation.
        profile: OAuth profile injected by Gradio; None when not signed in.
        progress: Gradio progress tracker (default instance is the Gradio
            convention for enabling progress reporting).

    Returns:
        Path to the generated .mp4 video file.

    Raises:
        gr.Error: when the user is not signed in, inputs are missing, or
            video generation fails for any reason.
    """
    if profile is None:
        raise gr.Error("Click Sign in with Hugging Face button to use this app for free")

    if image is None:
        raise gr.Error("Please upload an image first!")

    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")

        # Obtain the raw image bytes for the inference call.
        if isinstance(image, str):
            input_image = Path(image).read_bytes()
        else:
            # PIL Image: encode to PNG entirely in memory.  The original
            # implementation wrote a delete=False temp file here and never
            # removed it, leaking one file per request.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            input_image = buffer.getvalue()

        progress(0.4, desc="Generating video with AI...")

        # Generate video using the shared inference client.
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")

        # Persist the result to a temp .mp4 for Gradio to serve.  The file
        # is intentionally left on disk (delete=False) because Gradio reads
        # it after this function returns.  Close the handle before writing
        # by name so this also works on Windows.
        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_file.close()

        if isinstance(video, bytes):
            Path(output_file.name).write_bytes(video)
        elif isinstance(video, str) and os.path.exists(video):
            # Provider returned a local file path: copy it into place.
            shutil.copy(video, output_file.name)
        else:
            # Unknown return type: assume it is bytes-like and write it out.
            with open(output_file.name, "wb") as f:
                f.write(video)

        progress(1.0, desc="Complete!")

        return output_file.name

    except Exception as e:
        # Chain the original exception so the real cause is preserved in logs.
        raise gr.Error(f"Error generating video: {str(e)}") from e
83
+
84
# ---------------------------------------------------------------------------
# Gradio interface.  Everything inside the `with gr.Blocks(...)` context is
# page layout, declared top-to-bottom; event wiring is done near the bottom.
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    # Custom CSS classes referenced by the gr.HTML snippets below.
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    .auth-warning {
        color: #ff6b00;
        font-weight: bold;
        text-align: center;
        margin: 1em 0;
    }
    """,
    title="Image to Video Generator with Ovi",
) as demo:

    # Attribution banner.
    gr.HTML(
        """
        <div class="main-header">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
                Built with anycoder ✨
            </a>
        </div>
        """
    )

    # App title and short description.
    gr.Markdown(
        """
        # 🎬 Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via HuggingFace Inference API.
        """
    )

    # Sign-in requirement notice; generate_video_with_auth enforces it.
    gr.HTML(
        """
        <div class="auth-warning">
            ⚠️ You must Sign in with Hugging Face using the button below to use this app.
        </div>
        """
    )

    # OAuth login button — required for the gr.OAuthProfile parameter to be
    # populated in the click handler.
    gr.LoginButton()

    # Usage tips.
    gr.HTML(
        """
        <div class="info-box">
            <strong>💡 Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # Documentation of the Ovi prompt grammar (speech / audio caption tags).
    gr.HTML(
        """
        <div class="info-box">
            <strong>✨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>📝 Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    # Two-column layout: inputs on the left, output on the right.
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" means generate_video_with_auth receives a str
            # path (its isinstance(image, str) branch).
            image_input = gr.Image(
                label="📸 Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="✍️ Text Prompt",
                lines=3,
            )

            generate_btn = gr.Button(
                "🎬 Generate Video",
                variant="primary",
                size="lg",
            )

            clear_btn = gr.Button(
                "🗑️ Clear",
                variant="secondary",
            )

            # One worked example; "5.png" is assumed to exist in the Space
            # repo next to app.py — TODO confirm the asset is present.
            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example",
            )

        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
            )

            # Model credits / feature summary.
            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                🌟 **Key Features:**
                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 📝 **Flexible Input**: Supports text-only or text+image conditioning
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)

                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
                """
            )

    # Event wiring.  api_name=False / show_api=False hide the endpoint from
    # the public API page.
    # NOTE(review): queue=False with a gr.Progress tracker — Gradio progress
    # updates generally require the queue; confirm progress bars actually
    # render with queuing disabled.
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        queue=False,
        api_name=False,
        show_api=False,
    )

    # Reset all three components (image, prompt, video) to empty.
    clear_btn.click(
        fn=lambda: (None, "", None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
        queue=False,
    )

    # Footer: usage walkthrough, caveats, and external links.
    gr.Markdown(
        """
        ---

        ### 🚀 How it works

        1. **Sign in** with your Hugging Face account
        2. **Upload** your image - any photo or illustration
        3. **Describe** the motion you want to see in the prompt
        4. **Generate** and watch your image come to life!

        ### ⚠️ Notes

        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
        - Requires a valid HuggingFace token with Inference API access
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions

        ### 🔗 Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [HuggingFace Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Character AI](https://character.ai)
        """
    )

# Launch the app (script entry point).
if __name__ == "__main__":
    demo.launch(
        show_api=False,
        enable_monitoring=False,
        quiet=True,
    )