import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch
# ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
cmd = (
'python3 -m pip install "huggingface_hub[cli]"; '
'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)
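# Note: the shell-based download above runs at import time on every startup.
# A Python-native alternative (a sketch only, assuming huggingface_hub is
# importable) would be:
#   from huggingface_hub import snapshot_download
#   snapshot_download("LeonJoe13/Sonic", local_dir="checkpoints")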
pipe = Sonic()
def get_md5(content):
    # Hash raw bytes (image/audio arrays) to build stable cache keys
    md5hash = hashlib.md5(content)
    return md5hash.hexdigest()
@spaces.GPU(duration=300)  # set duration to 300 s to allow long video processing
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25  # fixed for a 2-second video (25 frames)

    # Report the audio duration (for reference only)
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # in seconds
    print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")

    if face_info['face_num'] > 0:
        # Crop to the detected face before animating
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)

        # Generate the video with the fixed inference_steps (25)
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale
        )
        return res_video_path
    else:
        return -1  # no face detected
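# Example call (hypothetical paths, a sketch only):
#   get_video_res('./tmp_path/face.png', './tmp_path/speech.wav',
#                 './res_path/out.mp4', dynamic_scale=1.2)
# Returns the output video path on success, or -1 when no face is detected.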
tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)
def process_sonic(image, audio, dynamic_scale):
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    # Hash the inputs so identical requests map to the same cache entry
    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]  # promote mono audio to shape (samples, 1)

    # Build an AudioSegment from the raw numpy samples
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)
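    # For reference: gr.Audio(type="numpy") yields (sample_rate, array), commonly
    # int16 PCM, so sample_width is the per-sample byte size (2 for int16) and
    # channels comes from the array's second dimension. The set_frame_rate call
    # above keeps the rate unchanged and could likely be dropped.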
    # Build content-addressed file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Persist the inputs if they have not been saved yet
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Return the cached result if it exists; otherwise generate a new video
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
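# Design note: inputs and outputs are content-addressed by MD5, so re-running the
# same image/audio/scale combination returns the cached mp4 without re-inference.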
# Dummy function for example data (add real example entries here if needed)
def get_example():
    return []
css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.main-header {
text-align: center;
color: #2a2a2a;
margin-bottom: 2em;
}
.parameter-section {
background-color: #f5f5f5;
padding: 1em;
border-radius: 8px;
margin: 1em 0;
}
.example-section {
margin-top: 2em;
}
"""
with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input"
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input",
                elem_id="audio_input",
                type="numpy"
            )
        with gr.Column():
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
            )
            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn"
            )
        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output"
            )

    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        api_name="animate"
    )
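    # With api_name="animate" the endpoint is also callable programmatically,
    # e.g. (a sketch only, assuming the gradio_client package and a reachable URL;
    # the placeholder arguments are hypothetical):
    #   from gradio_client import Client
    #   client = Client("http://127.0.0.1:7860")
    #   result = client.predict(<image>, <audio>, 1.0, api_name="/animate")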
    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        cache_examples=False
    )

    gr.HTML("""
    <div style="text-align: center; margin-top: 2em;">
        <div style="margin-bottom: 1em;">
            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
            </a>
            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
            </a>
        </div>
        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
    </div>
    """)
# Create a public share link: share=True
demo.launch(share=True)