import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

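# Fetch model weights at startup: the Sonic checkpoints, the Stable Video
# Diffusion backbone, and the Whisper-tiny audio encoder are downloaded into
# ./checkpoints via the huggingface-cli before the pipeline is constructed.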
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)

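# Instantiate the Sonic pipeline once at module load so the loaded weights are
# shared across all requests.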
pipe = Sonic()

def get_md5(content):
    """Return the MD5 hex digest of a bytes-like object (used as a cache key)."""
    md5hash = hashlib.md5(content)
    return md5hash.hexdigest()

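# Core generation step. The @spaces.GPU decorator requests GPU time for up to
# 300 seconds on Hugging Face Spaces; the function detects and crops the face,
# then drives the Sonic pipeline to render the talking-head video.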
@spaces.GPU(duration=300)
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0
    print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")

    if face_info['face_num'] > 0:
        # Crop to the detected face region so the animation stays centered.
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)

        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale
        )
        return res_video_path
    else:
        # Raise a user-facing error instead of returning -1, which the
        # gr.Video output component cannot render.
        raise gr.Error("No face detected in the uploaded image")

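# Cache directories: uploaded inputs are written to tmp_path and rendered
# videos to res_path, keyed by MD5 hashes so repeated requests can be reused.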
tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)

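# Gradio entry point: validates the inputs, persists them under content-hash
# filenames, and either returns a cached video or generates a new one.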
def process_sonic(image, audio, dynamic_scale):
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    # gr.Audio with type="numpy" yields a (sampling_rate, samples) tuple;
    # promote mono audio to a 2-D (samples, channels) array.
    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Hash the raw pixel and sample buffers so identical inputs map to the
    # same cache entries.
    img_md5 = get_md5(np.array(image).tobytes())
    audio_md5 = get_md5(arr.tobytes())
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    # Rebuild a pydub segment from the raw samples so it can be exported as WAV.
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )

    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

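# Placeholder for the example gallery; return a list of
# [image, audio, dynamic_scale] entries to populate gr.Examples below.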
def get_example():
    return []

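# Minimal CSS applied to the Blocks layout.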
| | css = """ |
| | .gradio-container { |
| | font-family: 'Arial', sans-serif; |
| | } |
| | .main-header { |
| | text-align: center; |
| | color: #2a2a2a; |
| | margin-bottom: 2em; |
| | } |
| | .parameter-section { |
| | background-color: #f5f5f5; |
| | padding: 1em; |
| | border-radius: 8px; |
| | margin: 1em 0; |
| | } |
| | .example-section { |
| | margin-top: 2em; |
| | } |
| | """ |
| |
|
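# Build the UI: image and audio inputs in the first column, the intensity
# slider and generate button in the second, and the output video in the third.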
with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input"
            )

            audio_input = gr.Audio(
                label="Voice/Audio Input",
                elem_id="audio_input",
                type="numpy"
            )

        with gr.Column():
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
            )

            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn"
            )

        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output"
            )

    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        api_name="animate"
    )

    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        cache_examples=False
    )

    gr.HTML("""
    <div style="text-align: center; margin-top: 2em;">
        <div style="margin-bottom: 1em;">
            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
            </a>
            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
            </a>
        </div>
        <p>📝 Note: For optimal results, use clear portrait images and high-quality audio</p>
    </div>
    """)

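# share=True requests a public Gradio link; on Hugging Face Spaces the flag is
# ignored and the app is served directly.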
demo.launch(share=True)