Add examples using Git LFS
- .gitattributes +4 -0
- app.py +98 -7
- examples/audio_1.wav +3 -0
- examples/audio_2.wav +3 -0
- examples/audio_3.wav +3 -0
- examples/audio_4.wav +3 -0
- examples/audio_5.wav +3 -0
- examples/driving_1.mp4 +3 -0
- examples/driving_2.mp4 +3 -0
- examples/driving_3.mp4 +3 -0
- examples/driving_4.mp4 +3 -0
- examples/source_1.png +3 -0
- examples/source_2.png +3 -0
- examples/source_3.jpg +3 -0
- examples/source_4.png +3 -0
- examples/source_5.png +3 -0
- examples/source_6.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
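These attribute lines are what route the new example media through Git LFS (they are the lines `git lfs track "*.wav"` and friends would append). As a quick sanity check, here is a minimal Python sketch, not part of the commit, that reads `.gitattributes` and lists the LFS-tracked patterns; after this change the output should include `*.wav`, `*.mp4`, `*.png`, and `*.jpg`.

```python
# Minimal sketch: list which path patterns .gitattributes routes through Git LFS.
# Assumes it is run from the repo root, next to the .gitattributes edited above.
from pathlib import Path

def lfs_patterns(path: str = ".gitattributes") -> list[str]:
    patterns = []
    for line in Path(path).read_text().splitlines():
        fields = line.split()
        # An LFS-tracked pattern carries the "filter=lfs" attribute.
        if len(fields) > 1 and "filter=lfs" in fields[1:]:
            patterns.append(fields[0])
    return patterns

if __name__ == "__main__":
    print(lfs_patterns())  # expected to include *.wav, *.mp4, *.png, *.jpg after this commit
```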
app.py
CHANGED
@@ -424,39 +424,130 @@ def fn_video_driven(source_image, driving_video, crop, progress=gr.Progress()):
         traceback.print_exc()
         raise gr.Error(f"Error: {e}")

-
-with gr.Blocks(title="IMTalker Demo") as demo:
+with gr.Blocks(title="IMTalker Demo", theme=gr.themes.Base()) as demo:
     gr.Markdown("# 🗣️ IMTalker: Efficient Audio-driven Talking Face Generation")
+
+    # Best-practices notes
+    with gr.Accordion("💡 Best Practices (Click to read)", open=False):
+        gr.Markdown("""
+        To obtain the highest quality generation results, we recommend following these guidelines:
+
+        1. **Input Image Composition**:
+           Please ensure the input image features the person's head as the primary subject. Since our model is explicitly trained on facial data, it does not support full-body video generation.
+            * The inference pipeline automatically **crops the input image** to focus on the face by default.
+            * **Note on Resolution**: The model generates video at a fixed resolution of **512×512**. Using extremely high-resolution inputs will result in downscaling, so prioritize facial clarity over raw image dimensions.
+
+        2. **Audio Selection**:
+           Our model was trained primarily on **English datasets**. Consequently, we recommend using **English audio** inputs to achieve the best lip-synchronization performance and naturalness.
+
+        3. **Background Quality**:
+           We strongly recommend using source images with **solid colored** or **blurred (bokeh)** backgrounds. Complex or highly detailed backgrounds may lead to visual artifacts or jitter in the generated video.
+        """)
+
     with gr.Tabs():
+        # ==========================
+        # Tab 1: Audio Driven
+        # ==========================
         with gr.TabItem("Audio Driven"):
             with gr.Row():
                 with gr.Column():
-                    #
+                    # 1. Image input
                     a_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
+
+                    # --- Image examples (standalone) ---
+                    # Make sure the examples folder contains the matching source_x.png files
+                    gr.Examples(
+                        examples=[
+                            ["examples/source_1.png"],
+                            ["examples/source_2.png"],
+                            ["examples/source_3.jpg"],
+                            ["examples/source_4.png"],
+                            ["examples/source_5.png"],
+                            ["examples/source_6.png"],
+                        ],
+                        inputs=[a_img],
+                        label="Example Images",
+                        cache_examples=False,
+                    )
+
+                    # 2. Audio input
                     a_aud = gr.Audio(label="Driving Audio", type="filepath")
+
+                    # --- Audio examples (standalone) ---
+                    # Make sure the examples folder contains the matching audio_x.wav files
+                    gr.Examples(
+                        examples=[
+                            ["examples/audio_1.wav"],
+                            ["examples/audio_2.wav"],
+                            ["examples/audio_3.wav"],
+                            ["examples/audio_4.wav"],
+                            ["examples/audio_5.wav"],
+                        ],
+                        inputs=[a_aud],
+                        label="Example Audios",
+                        cache_examples=False,
+                    )
+
                     with gr.Accordion("Settings", open=True):
                         a_crop = gr.Checkbox(label="Auto Crop Face", value=True)
                        a_seed = gr.Number(label="Seed", value=42)
                         a_nfe = gr.Slider(5, 50, value=10, step=1, label="Steps (NFE)")
                         a_cfg = gr.Slider(1.0, 5.0, value=3.0, label="CFG Scale")
+
                     a_btn = gr.Button("Generate (Audio Driven)", variant="primary")
+
                 with gr.Column():
-                    # Fixed resolution of 512x512
                     a_out = gr.Video(label="Result", height=512, width=512)
+
             a_btn.click(fn_audio_driven, [a_img, a_aud, a_crop, a_seed, a_nfe, a_cfg], a_out)

+        # ==========================
+        # Tab 2: Video Driven
+        # ==========================
         with gr.TabItem("Video Driven"):
             with gr.Row():
                 with gr.Column():
-                    #
+                    # 1. Image input
                     v_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
-
+
+                    # --- Image examples (standalone) ---
+                    gr.Examples(
+                        examples=[
+                            ["examples/source_1.png"],
+                            ["examples/source_2.png"],
+                            ["examples/source_3.jpg"],
+                            ["examples/source_4.png"],
+                            ["examples/source_5.png"],
+                            ["examples/source_6.png"],
+                        ],
+                        inputs=[v_img],
+                        label="Example Images",
+                        cache_examples=False,
+                    )
+
+                    # 2. Video input
                     v_vid = gr.Video(label="Driving Video", sources=["upload"], height=512, width=512)
+
+                    # --- Video examples (standalone) ---
+                    # Make sure the examples folder contains the matching driving_x.mp4 files
+                    gr.Examples(
+                        examples=[
+                            ["examples/driving_1.mp4"],
+                            ["examples/driving_2.mp4"],
+                            ["examples/driving_3.mp4"],
+                            ["examples/driving_4.mp4"],
+                        ],
+                        inputs=[v_vid],
+                        label="Example Videos",
+                        cache_examples=False,
+                    )
+
                     v_crop = gr.Checkbox(label="Auto Crop (Both Source & Driving)", value=True)
                     v_btn = gr.Button("Generate (Video Driven)", variant="primary")
+
                 with gr.Column():
-                    # Fixed resolution of 512x512
                     v_out = gr.Video(label="Result", height=512, width=512)
+
             v_btn.click(fn_video_driven, [v_img, v_vid, v_crop], v_out)

 if __name__ == "__main__":
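The new `gr.Examples` blocks are deliberately attached to a single input each, with `cache_examples=False`, so a user can pair any example image with any example audio clip or driving video and no outputs are precomputed at startup. A minimal standalone sketch of that pattern, not part of the commit (the `echo_paths` callback is illustrative only):

```python
# Minimal sketch of the pattern above: one gr.Examples block per input,
# so an example image and an example audio clip can be mixed freely.
# Uses the example assets added in this commit; gradio is assumed to be installed.
import gradio as gr

def echo_paths(image_path, audio_path):
    # Placeholder callback: just report which example files were selected.
    return f"image={image_path}, audio={audio_path}"

with gr.Blocks() as sketch:
    img = gr.Image(label="Source Image", type="filepath")
    gr.Examples(examples=[["examples/source_1.png"]], inputs=[img], cache_examples=False)
    aud = gr.Audio(label="Driving Audio", type="filepath")
    gr.Examples(examples=[["examples/audio_1.wav"]], inputs=[aud], cache_examples=False)
    out = gr.Textbox(label="Selected inputs")
    gr.Button("Show").click(echo_paths, [img, aud], out)

if __name__ == "__main__":
    sketch.launch()
```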
examples/audio_1.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0339acacf4d18602f14552f514a18d29cafe7f61c5323488d40180d40504d79a
+size 318842

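Each example file is committed as a Git LFS pointer rather than the raw media: a three-line stub recording the spec version, the SHA-256 of the actual content, and its size in bytes. A minimal sketch, not part of the commit, that parses the pointer committed above as `examples/audio_1.wav`:

```python
# Minimal sketch: split a Git LFS pointer file into its three fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

# Pointer text copied verbatim from examples/audio_1.wav above.
pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:0339acacf4d18602f14552f514a18d29cafe7f61c5323488d40180d40504d79a\n"
    "size 318842\n"
)
print(parse_lfs_pointer(pointer))  # size_bytes == 318842, i.e. roughly 311 KiB of audio
```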
examples/audio_2.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
+size 1355854

examples/audio_3.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c44fd843576c6fcc0f6b957b08745020f3cc9a0dc8700cad2774306b4c5dea1e
+size 2588750

examples/audio_4.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df9e722e8904dc6e09840a1c79b0af77e75cd9f3092b5fc445aa29f9553ee07
+size 1896526

examples/audio_5.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
+size 1355854

examples/driving_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58316e577e572ee02cf867c56258a9b4dc7a39ba239818dbdcce58042c740491
+size 379764

examples/driving_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4ac9e2b357ced719302835fcab786603afba9fcff83e07c2fabb6a4b2da66cd
+size 358120

examples/driving_3.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0e9737dfd38d4fa90378a317640f00a3879b11403745033df052066ed8e168b
+size 946304

examples/driving_4.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef5c86e49b1b43dcb1449b499eb5a7f0cbae2f78aec08b5598193be1e4257099
+size 1430968

examples/source_1.png
ADDED
Git LFS Details

examples/source_2.png
ADDED
Git LFS Details

examples/source_3.jpg
ADDED
Git LFS Details

examples/source_4.png
ADDED
Git LFS Details

examples/source_5.png
ADDED
Git LFS Details

examples/source_6.png
ADDED
Git LFS Details