Add examples using Git LFS
- .gitattributes +4 -0
- app.py +98 -7
- examples/audio_1.wav +3 -0
- examples/audio_2.wav +3 -0
- examples/audio_3.wav +3 -0
- examples/audio_4.wav +3 -0
- examples/audio_5.wav +3 -0
- examples/driving_1.mp4 +3 -0
- examples/driving_2.mp4 +3 -0
- examples/driving_3.mp4 +3 -0
- examples/driving_4.mp4 +3 -0
- examples/source_1.png +3 -0
- examples/source_2.png +3 -0
- examples/source_3.jpg +3 -0
- examples/source_4.png +3 -0
- examples/source_5.png +3 -0
- examples/source_6.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
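These attribute lines are what route the new example media through Git LFS (they are the lines `git lfs track "*.wav"` and friends would append). As a quick sanity check, here is a minimal Python sketch, not part of the commit, that reads `.gitattributes` and lists the LFS-tracked patterns; after this change the output should include `*.wav`, `*.mp4`, `*.png`, and `*.jpg`.

```python
# Minimal sketch: list which path patterns .gitattributes routes through Git LFS.
# Assumes it is run from the repo root, next to the .gitattributes edited above.
from pathlib import Path

def lfs_patterns(path: str = ".gitattributes") -> list[str]:
    patterns = []
    for line in Path(path).read_text().splitlines():
        fields = line.split()
        # An LFS-tracked pattern carries the "filter=lfs" attribute.
        if len(fields) > 1 and "filter=lfs" in fields[1:]:
            patterns.append(fields[0])
    return patterns

if __name__ == "__main__":
    print(lfs_patterns())  # expected to include *.wav, *.mp4, *.png, *.jpg after this commit
```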
app.py
CHANGED
@@ -424,39 +424,130 @@ def fn_video_driven(source_image, driving_video, crop, progress=gr.Progress()):
         traceback.print_exc()
         raise gr.Error(f"Error: {e}")

-
-with gr.Blocks(title="IMTalker Demo") as demo:
+with gr.Blocks(title="IMTalker Demo", theme=gr.themes.Base()) as demo:
     gr.Markdown("# 🗣️ IMTalker: Efficient Audio-driven Talking Face Generation")
+
+    # Best-practices notes
+    with gr.Accordion("💡 Best Practices (Click to read)", open=False):
+        gr.Markdown("""
+        To obtain the highest quality generation results, we recommend following these guidelines:
+
+        1. **Input Image Composition**:
+           Please ensure the input image features the person's head as the primary subject. Since our model is explicitly trained on facial data, it does not support full-body video generation.
+            * The inference pipeline automatically **crops the input image** to focus on the face by default.
+            * **Note on Resolution**: The model generates video at a fixed resolution of **512×512**. Using extremely high-resolution inputs will result in downscaling, so prioritize facial clarity over raw image dimensions.
+
+        2. **Audio Selection**:
+           Our model was trained primarily on **English datasets**. Consequently, we recommend using **English audio** inputs to achieve the best lip-synchronization performance and naturalness.
+
+        3. **Background Quality**:
+           We strongly recommend using source images with **solid colored** or **blurred (bokeh)** backgrounds. Complex or highly detailed backgrounds may lead to visual artifacts or jitter in the generated video.
+        """)
+
     with gr.Tabs():
+        # ==========================
+        # Tab 1: Audio Driven
+        # ==========================
         with gr.TabItem("Audio Driven"):
             with gr.Row():
                 with gr.Column():
-                    #
+                    # 1. Image input
                     a_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
+
+                    # --- Image examples (standalone) ---
+                    # Make sure the examples folder contains the matching source_x.png files
+                    gr.Examples(
+                        examples=[
+                            ["examples/source_1.png"],
+                            ["examples/source_2.png"],
+                            ["examples/source_3.jpg"],
+                            ["examples/source_4.png"],
+                            ["examples/source_5.png"],
+                            ["examples/source_6.png"],
+                        ],
+                        inputs=[a_img],
+                        label="Example Images",
+                        cache_examples=False,
+                    )
+
+                    # 2. Audio input
                     a_aud = gr.Audio(label="Driving Audio", type="filepath")
+
+                    # --- Audio examples (standalone) ---
+                    # Make sure the examples folder contains the matching audio_x.wav files
+                    gr.Examples(
+                        examples=[
+                            ["examples/audio_1.wav"],
+                            ["examples/audio_2.wav"],
+                            ["examples/audio_3.wav"],
+                            ["examples/audio_4.wav"],
+                            ["examples/audio_5.wav"],
+                        ],
+                        inputs=[a_aud],
+                        label="Example Audios",
+                        cache_examples=False,
+                    )
+
                     with gr.Accordion("Settings", open=True):
                         a_crop = gr.Checkbox(label="Auto Crop Face", value=True)
                        a_seed = gr.Number(label="Seed", value=42)
                         a_nfe = gr.Slider(5, 50, value=10, step=1, label="Steps (NFE)")
                         a_cfg = gr.Slider(1.0, 5.0, value=3.0, label="CFG Scale")
+
                     a_btn = gr.Button("Generate (Audio Driven)", variant="primary")
+
                 with gr.Column():
-                    # Fixed resolution of 512x512
                     a_out = gr.Video(label="Result", height=512, width=512)
+
             a_btn.click(fn_audio_driven, [a_img, a_aud, a_crop, a_seed, a_nfe, a_cfg], a_out)

+        # ==========================
+        # Tab 2: Video Driven
+        # ==========================
         with gr.TabItem("Video Driven"):
             with gr.Row():
                 with gr.Column():
-                    #
+                    # 1. Image input
                     v_img = gr.Image(label="Source Image", type="numpy", height=512, width=512)
-
+
+                    # --- Image examples (standalone) ---
+                    gr.Examples(
+                        examples=[
+                            ["examples/source_1.png"],
+                            ["examples/source_2.png"],
+                            ["examples/source_3.jpg"],
+                            ["examples/source_4.png"],
+                            ["examples/source_5.png"],
+                            ["examples/source_6.png"],
+                        ],
+                        inputs=[v_img],
+                        label="Example Images",
+                        cache_examples=False,
+                    )
+
+                    # 2. Video input
                     v_vid = gr.Video(label="Driving Video", sources=["upload"], height=512, width=512)
+
+                    # --- Video examples (standalone) ---
+                    # Make sure the examples folder contains the matching driving_x.mp4 files
+                    gr.Examples(
+                        examples=[
+                            ["examples/driving_1.mp4"],
+                            ["examples/driving_2.mp4"],
+                            ["examples/driving_3.mp4"],
+                            ["examples/driving_4.mp4"],
+                        ],
+                        inputs=[v_vid],
+                        label="Example Videos",
+                        cache_examples=False,
+                    )
+
                     v_crop = gr.Checkbox(label="Auto Crop (Both Source & Driving)", value=True)
                     v_btn = gr.Button("Generate (Video Driven)", variant="primary")
+
                 with gr.Column():
-                    # Fixed resolution of 512x512
                     v_out = gr.Video(label="Result", height=512, width=512)
+
             v_btn.click(fn_video_driven, [v_img, v_vid, v_crop], v_out)

 if __name__ == "__main__":
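The new `gr.Examples` blocks are deliberately attached to a single input each, with `cache_examples=False`, so a user can pair any example image with any example audio clip or driving video and no outputs are precomputed at startup. A minimal standalone sketch of that pattern, not part of the commit (the `echo_paths` callback is illustrative only):

```python
# Minimal sketch of the pattern above: one gr.Examples block per input,
# so an example image and an example audio clip can be mixed freely.
# Uses the example assets added in this commit; gradio is assumed to be installed.
import gradio as gr

def echo_paths(image_path, audio_path):
    # Placeholder callback: just report which example files were selected.
    return f"image={image_path}, audio={audio_path}"

with gr.Blocks() as sketch:
    img = gr.Image(label="Source Image", type="filepath")
    gr.Examples(examples=[["examples/source_1.png"]], inputs=[img], cache_examples=False)
    aud = gr.Audio(label="Driving Audio", type="filepath")
    gr.Examples(examples=[["examples/audio_1.wav"]], inputs=[aud], cache_examples=False)
    out = gr.Textbox(label="Selected inputs")
    gr.Button("Show").click(echo_paths, [img, aud], out)

if __name__ == "__main__":
    sketch.launch()
```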
examples/audio_1.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0339acacf4d18602f14552f514a18d29cafe7f61c5323488d40180d40504d79a
+size 318842

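Each example file is committed as a Git LFS pointer rather than the raw media: a three-line stub recording the spec version, the SHA-256 of the actual content, and its size in bytes. A minimal sketch, not part of the commit, that parses the pointer committed above as `examples/audio_1.wav`:

```python
# Minimal sketch: split a Git LFS pointer file into its three fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

# Pointer text copied verbatim from examples/audio_1.wav above.
pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:0339acacf4d18602f14552f514a18d29cafe7f61c5323488d40180d40504d79a\n"
    "size 318842\n"
)
print(parse_lfs_pointer(pointer))  # size_bytes == 318842, i.e. roughly 311 KiB of audio
```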
examples/audio_2.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
+size 1355854

examples/audio_3.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c44fd843576c6fcc0f6b957b08745020f3cc9a0dc8700cad2774306b4c5dea1e
+size 2588750

examples/audio_4.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df9e722e8904dc6e09840a1c79b0af77e75cd9f3092b5fc445aa29f9553ee07
+size 1896526

examples/audio_5.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d79d560070feee7360ea4fb2167096f60363a211bb4cc553d8c0762953707cb
+size 1355854

examples/driving_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58316e577e572ee02cf867c56258a9b4dc7a39ba239818dbdcce58042c740491
+size 379764

examples/driving_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4ac9e2b357ced719302835fcab786603afba9fcff83e07c2fabb6a4b2da66cd
+size 358120

examples/driving_3.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0e9737dfd38d4fa90378a317640f00a3879b11403745033df052066ed8e168b
+size 946304

examples/driving_4.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef5c86e49b1b43dcb1449b499eb5a7f0cbae2f78aec08b5598193be1e4257099
+size 1430968

examples/source_1.png
ADDED
Git LFS Details

examples/source_2.png
ADDED
Git LFS Details

examples/source_3.jpg
ADDED
Git LFS Details

examples/source_4.png
ADDED
Git LFS Details

examples/source_5.png
ADDED
Git LFS Details

examples/source_6.png
ADDED
Git LFS Details