cbsjtu01 committed on
Commit
aa81c96
·
1 Parent(s): 21025b7

Add examples using Git LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
37
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ *.png filter=lfs diff=lfs merge=lfs -text
39
+ *.jpg filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -424,39 +424,130 @@ def fn_video_driven(source_image, driving_video, crop, progress=gr.Progress()):
424
  traceback.print_exc()
425
  raise gr.Error(f"Error: {e}")
426
 
427
- # Gradio 4.x 语法:去除了 css,使用 sources=["upload"]
428
- with gr.Blocks(title="IMTalker Demo") as demo:
429
  gr.Markdown("# 🗣️ IMTalker: Efficient Audio-driven Talking Face Generation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  with gr.Tabs():
 
 
 
431
  with gr.TabItem("Audio Driven"):
432
  with gr.Row():
433
  with gr.Column():
434
- # 固定分辨率为 512x512
435
  a_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  a_aud = gr.Audio(label="Driving Audio", type="filepath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  with gr.Accordion("Settings", open=True):
438
  a_crop = gr.Checkbox(label="Auto Crop Face", value=True)
439
  a_seed = gr.Number(label="Seed", value=42)
440
  a_nfe = gr.Slider(5, 50, value=10, step=1, label="Steps (NFE)")
441
  a_cfg = gr.Slider(1.0, 5.0, value=3.0, label="CFG Scale")
 
442
  a_btn = gr.Button("Generate (Audio Driven)", variant="primary")
 
443
  with gr.Column():
444
- # 固定分辨率为 512x512
445
  a_out = gr.Video(label="Result", height=512, width=512)
 
446
  a_btn.click(fn_audio_driven, [a_img, a_aud, a_crop, a_seed, a_nfe, a_cfg], a_out)
447
 
 
 
 
448
  with gr.TabItem("Video Driven"):
449
  with gr.Row():
450
  with gr.Column():
451
- # 固定分辨率为 512x512
452
  v_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
453
- # Gradio 4.x 语法, 固定分辨率
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  v_vid = gr.Video(label="Driving Video", sources=["upload"], height=512, width=512)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  v_crop = gr.Checkbox(label="Auto Crop (Both Source & Driving)", value=True)
456
  v_btn = gr.Button("Generate (Video Driven)", variant="primary")
 
457
  with gr.Column():
458
- # 固定分辨率为 512x512
459
  v_out = gr.Video(label="Result", height=512, width=512)
 
460
  v_btn.click(fn_video_driven, [v_img, v_vid, v_crop], v_out)
461
 
462
  if __name__ == "__main__":
 
424
  traceback.print_exc()
425
  raise gr.Error(f"Error: {e}")
426
 
427
+ with gr.Blocks(title="IMTalker Demo", theme=gr.themes.Base()) as demo:
 
428
  gr.Markdown("# 🗣️ IMTalker: Efficient Audio-driven Talking Face Generation")
429
+
430
+ # Best-practice guidance for users
431
+ with gr.Accordion("💡 Best Practices (Click to read)", open=False):
432
+ gr.Markdown("""
433
+ To obtain the highest quality generation results, we recommend following these guidelines:
434
+
435
+ 1. **Input Image Composition**:
436
+ Please ensure the input image features the person's head as the primary subject. Since our model is explicitly trained on facial data, it does not support full-body video generation.
437
+ * The inference pipeline automatically **crops the input image** to focus on the face by default.
438
+ * **Note on Resolution**: The model generates video at a fixed resolution of **512×512**. Using extremely high-resolution inputs will result in downscaling, so prioritize facial clarity over raw image dimensions.
439
+
440
+ 2. **Audio Selection**:
441
+ Our model was trained primarily on **English datasets**. Consequently, we recommend using **English audio** inputs to achieve the best lip-synchronization performance and naturalness.
442
+
443
+ 3. **Background Quality**:
444
+ We strongly recommend using source images with **solid colored** or **blurred (bokeh)** backgrounds. Complex or highly detailed backgrounds may lead to visual artifacts or jitter in the generated video.
445
+ """)
446
+
447
  with gr.Tabs():
448
+ # ==========================
449
+ # Tab 1: Audio Driven
450
+ # ==========================
451
  with gr.TabItem("Audio Driven"):
452
  with gr.Row():
453
  with gr.Column():
454
+ # 1. Image input
455
  a_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
456
+
457
+ # --- Image examples (standalone) ---
458
+ # Make sure the examples folder contains the matching source_x image files (.png or .jpg)
459
+ gr.Examples(
460
+ examples=[
461
+ ["examples/source_1.png"],
462
+ ["examples/source_2.png"],
463
+ ["examples/source_3.jpg"],
464
+ ["examples/source_4.png"],
465
+ ["examples/source_5.png"],
466
+ ["examples/source_6.png"],
467
+ ],
468
+ inputs=[a_img],
469
+ label="Example Images",
470
+ cache_examples=False,
471
+ )
472
+
473
+ # 2. Audio input
474
  a_aud = gr.Audio(label="Driving Audio", type="filepath")
475
+
476
+ # --- Audio examples (standalone) ---
477
+ # Make sure the examples folder contains the matching audio_x.wav files
478
+ gr.Examples(
479
+ examples=[
480
+ ["examples/audio_1.wav"],
481
+ ["examples/audio_2.wav"],
482
+ ["examples/audio_3.wav"],
483
+ ["examples/audio_4.wav"],
484
+ ["examples/audio_5.wav"],
485
+ ],
486
+ inputs=[a_aud],
487
+ label="Example Audios",
488
+ cache_examples=False,
489
+ )
490
+
491
  with gr.Accordion("Settings", open=True):
492
  a_crop = gr.Checkbox(label="Auto Crop Face", value=True)
493
  a_seed = gr.Number(label="Seed", value=42)
494
  a_nfe = gr.Slider(5, 50, value=10, step=1, label="Steps (NFE)")
495
  a_cfg = gr.Slider(1.0, 5.0, value=3.0, label="CFG Scale")
496
+
497
  a_btn = gr.Button("Generate (Audio Driven)", variant="primary")
498
+
499
  with gr.Column():
 
500
  a_out = gr.Video(label="Result", height=512, width=512)
501
+
502
  a_btn.click(fn_audio_driven, [a_img, a_aud, a_crop, a_seed, a_nfe, a_cfg], a_out)
503
 
504
+ # ==========================
505
+ # Tab 2: Video Driven
506
+ # ==========================
507
  with gr.TabItem("Video Driven"):
508
  with gr.Row():
509
  with gr.Column():
510
+ # 1. Image input
511
  v_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
512
+
513
+ # --- Image examples (standalone) ---
514
+ gr.Examples(
515
+ examples=[
516
+ ["examples/source_1.png"],
517
+ ["examples/source_2.png"],
518
+ ["examples/source_3.jpg"],
519
+ ["examples/source_4.png"],
520
+ ["examples/source_5.png"],
521
+ ["examples/source_6.png"],
522
+ ],
523
+ inputs=[v_img],
524
+ label="Example Images",
525
+ cache_examples=False,
526
+ )
527
+
528
+ # 2. Video input
529
  v_vid = gr.Video(label="Driving Video", sources=["upload"], height=512, width=512)
530
+
531
+ # --- Video examples (standalone) ---
532
+ # Make sure the examples folder contains the matching driving_x.mp4 files
533
+ gr.Examples(
534
+ examples=[
535
+ ["examples/driving_1.mp4"],
536
+ ["examples/driving_2.mp4"],
537
+ ["examples/driving_3.mp4"],
538
+ ["examples/driving_4.mp4"],
539
+ ],
540
+ inputs=[v_vid],
541
+ label="Example Videos",
542
+ cache_examples=False,
543
+ )
544
+
545
  v_crop = gr.Checkbox(label="Auto Crop (Both Source & Driving)", value=True)
546
  v_btn = gr.Button("Generate (Video Driven)", variant="primary")
547
+
548
  with gr.Column():
 
549
  v_out = gr.Video(label="Result", height=512, width=512)
550
+
551
  v_btn.click(fn_video_driven, [v_img, v_vid, v_crop], v_out)
552
 
553
  if __name__ == "__main__":
examples/audio_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0339acacf4d18602f14552f514a18d29cafe7f61c5323488d40180d40504d79a
3
+ size 318842
examples/audio_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
3
+ size 1355854
examples/audio_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c44fd843576c6fcc0f6b957b08745020f3cc9a0dc8700cad2774306b4c5dea1e
3
+ size 2588750
examples/audio_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df9e722e8904dc6e09840a1c79b0af77e75cd9f3092b5fc445aa29f9553ee07
3
+ size 1896526
examples/audio_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
3
+ size 1355854
examples/driving_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58316e577e572ee02cf867c56258a9b4dc7a39ba239818dbdcce58042c740491
3
+ size 379764
examples/driving_2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4ac9e2b357ced719302835fcab786603afba9fcff83e07c2fabb6a4b2da66cd
3
+ size 358120
examples/driving_3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0e9737dfd38d4fa90378a317640f00a3879b11403745033df052066ed8e168b
3
+ size 946304
examples/driving_4.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef5c86e49b1b43dcb1449b499eb5a7f0cbae2f78aec08b5598193be1e4257099
3
+ size 1430968
examples/source_1.png ADDED

Git LFS Details

  • SHA256: 1d114c4b3b912da01b357b347bd3ee7631f53c18812cd9d33a560879659802b9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.53 MB
examples/source_2.png ADDED

Git LFS Details

  • SHA256: bf53606c234f43ddcbea16614aade50645f48850f9c4210523a8d1c09f873868
  • Pointer size: 131 Bytes
  • Size of remote file: 569 kB
examples/source_3.jpg ADDED

Git LFS Details

  • SHA256: e9f13b994be3d08c9489759cacbb8225b3bf277f1c91e696e955aec0a55e1785
  • Pointer size: 130 Bytes
  • Size of remote file: 53.2 kB
examples/source_4.png ADDED

Git LFS Details

  • SHA256: 51aac944f9accb66bfd6c8f8886a826fe6ea3766abbe7ff9275ba2d22f09f9f2
  • Pointer size: 130 Bytes
  • Size of remote file: 54.6 kB
examples/source_5.png ADDED

Git LFS Details

  • SHA256: 9d43949b7a464743728423e9ea79cb81d12354cf09b9c219f13adab32424a617
  • Pointer size: 131 Bytes
  • Size of remote file: 349 kB
examples/source_6.png ADDED

Git LFS Details

  • SHA256: aa1ca796971f9504727fefe2ef6001ccac99f106be073d212022b6abbb3afb25
  • Pointer size: 131 Bytes
  • Size of remote file: 787 kB