hkchengrex commited on
Commit
7d49f3e
·
verified ·
1 Parent(s): 778c2d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -0
app.py CHANGED
@@ -205,3 +205,153 @@ def text_to_audio(
205
  log.info(f'Saved audio to {audio_save_path}')
206
 
207
  return audio_save_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  log.info(f'Saved audio to {audio_save_path}')
206
 
207
  return audio_save_path
208
+
209
# --- Video-to-Audio tab --------------------------------------------------
# Each example row follows the input order:
# (video URL, prompt, negative prompt, seed, num steps, guidance, duration).
_video_examples = [
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
     'waves, seagulls', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
     '', 'music', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
     'bubbles', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
     'Indian holy music', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
     'galloping', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
     'waves, storm', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
     '', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
     'storm', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
     '', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
     'typing', '', 0, 25, 4.5, 10],
    ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
     '', '', 0, 25, 4.5, 10],
]

# Gradio tab wrapping video_to_audio(); output is the input video muxed with
# the generated soundtrack ('playable_video').
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>

    Ho Kei Cheng, Masato Ishii, Akio Hayakawa, Takashi Shibuya, Alexander Schwing, Yuki Mitsufuji

    University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation

    CVPR 2025

    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
    Doing so does not improve results.

    The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
    """,
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio — Video-to-Audio Synthesis',
    examples=_video_examples)
339
+
340
# --- Text-to-Audio tab ---------------------------------------------------
# Same sampling controls as the video tab, minus the video input and the
# negative-prompt default; output is a bare audio clip.
_tta_inputs = [
    gr.Text(label='Prompt'),
    gr.Text(label='Negative prompt'),
    gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
    gr.Number(label='Num steps', value=25, precision=0, minimum=1),
    gr.Number(label='Guidance Strength', value=4.5, minimum=1),
    gr.Number(label='Duration (sec)', value=8, minimum=1),
]

text_to_audio_tab = gr.Interface(
    fn=text_to_audio,
    inputs=_tta_inputs,
    outputs='audio',
    cache_examples=False,
    title='MMAudio — Text-to-Audio Synthesis',
)
354
+
355
if __name__ == "__main__":
    # Combine both tabs into one app; allowed_paths lets Gradio serve the
    # generated files written under output_dir.
    demo = gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
                              ['Video-to-Audio', 'Text-to-Audio'])
    demo.launch(allowed_paths=[output_dir])