| | import spaces |
| | import os |
| | import shutil |
| | from huggingface_hub import snapshot_download |
| | import gradio as gr |
# Make the script's own directory the working directory so the relative
# "pretrained_models" paths below resolve next to this file.
_app_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(_app_dir)

# Fetch the required model snapshots from the Hugging Face Hub. The Hallo
# snapshot goes into "pretrained_models" itself; the other two go into
# sub-directories of it.
hallo_dir = snapshot_download(
    repo_id="fudan-generative-ai/hallo",
    local_dir="pretrained_models",
)
joyhallo_dir = snapshot_download(
    repo_id="jdh-algo/JoyHallo-v1",
    local_dir="pretrained_models/joyhallo",
)
# wav_dir is not read again in this section; the call is kept for its download.
wav_dir = snapshot_download(
    repo_id="TencentGameMate/chinese-wav2vec2-base",
    local_dir="pretrained_models/chinese-wav2vec2-base",
)

# Startup diagnostics: where the snapshots ended up and what the Hallo one holds.
print(hallo_dir, joyhallo_dir)
print(os.listdir(hallo_dir))
| |
|
| | from scripts.inference import predict |
| |
|
# NOTE(review): duration=120 is handed to spaces.GPU; presumably the GPU time
# budget (seconds) for one call on ZeroGPU hardware -- confirm in the spaces docs.
@spaces.GPU(duration=120)
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Run the talking-head pipeline for one image/audio pair.

    Thin UI handler: forwards both inputs to ``predict`` together with four
    fixed numeric settings and returns its result unchanged.

    Args:
        source_image: The face image to animate, passed through to ``predict``.
        driving_audio: The audio that drives the animation, passed through
            to ``predict``.
        progress: Not read in the body. Gradio supplies it; with
            ``track_tqdm=True`` it is meant to surface tqdm progress in the UI.

    Returns:
        Whatever ``predict`` returns (presumably a video file path -- confirm
        against ``scripts.inference.predict``).
    """
    # NOTE(review): the meaning of these four positional values is defined by
    # scripts.inference.predict, which is not visible here -- confirm before
    # changing any of them.
    fixed_settings = (1.0, 1.0, 1.0, 1.2)
    return predict(source_image, driving_audio, *fixed_settings)
| |
|
| |
|
# Custom stylesheet passed to gr.Blocks(css=css) when the UI is built.
# It styles two notice boxes selected by id (#warning-ready and
# #warning-duplicate), including a dark-theme override for the latter.
# NOTE(review): no component in the layout visible in this file sets either
# id, so these rules may currently match nothing -- confirm before relying on them.
css = '''
div#warning-ready {
background-color: #ecfdf5;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
color: #057857!important;
}
div#warning-duplicate {
background-color: #ebf5ff;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
color: #0f4592!important;
}
div#warning-duplicate strong {
color: #0f4592;
}
p.actions {
display: flex;
align-items: center;
margin: 20px 0;
}
div#warning-duplicate .actions a {
display: inline-block;
margin-right: 10px;
}
.dark #warning-duplicate {
background-color: #0c0c0c !important;
border: 1px solid white !important;
}
'''
| |
|
# Build the page: title and usage notes on top, then a two-column row with the
# inputs and the "Generate" button on the left and the result video on the right.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# JoyHallo: Digital human model for Mandarin")
    gr.Markdown("Generate talking head avatars driven by Mandarin speech. Data requirements:")
    # The string body is kept flush-left so Markdown can never read the lines
    # as an indented code block.
    gr.Markdown("""
Image:
1. Cropped to square shape.
2. Face should be facing forward and occupy 50%-70% of the image.

Audio:
1. Use wav format.
2. Mandarin, English or mixed, with clear audio and suitable background music.

! Important: Too long audio will cause a very long processing time, please keep the audio length within 5s.
""")
    with gr.Row():
        with gr.Column():
            # type="filepath" makes both inputs reach run_inference as file paths.
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    # Event listeners must be registered while the Blocks context is still open.
    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video
    )

# Start the app as soon as the script runs (left unguarded on purpose to keep
# the original module-level behaviour).
demo.launch()