Spaces:

VIDraft
/

Portrait-Animation

Runtime error

App Files Files Community

openfree committed on May 10, 2025

Commit

cca593e

verified ·

1 Parent(s): 723bc72

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -29

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import torch
 # 초기 실행 시 필요한 모델들을 다운로드
 cmd = (
-    'python3 -m pip install "huggingface_hub[cli]" accelerate; '  # accelerate도 같이 설치 권장
     'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
     'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir  checkpoints/stable-video-diffusion-img2vid-xt; '
     'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
@@ -39,25 +39,21 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     expand_ratio = 0.0
     min_resolution = 512
-    # 오디오 길이
     audio = AudioSegment.from_file(audio_path)
     duration = len(audio) / 1000.0  # 초 단위
-    # 오디오 길이에 따라 inference_steps 계산 (초당 약 12.5 프레임)
-    # 최소 25 프레임, 최대 750 프레임 (60초 => 60*12.5=750)
     inference_steps = min(max(int(duration * 12.5), 25), 750)
     print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
-    # 얼굴 인식 (face_info는 참고용)
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
     print(f"[INFO] Face detection info: {face_info}")
-    # 얼굴이 하나라도 검출되면(>0), 원본 이미지 비율 유지
     if face_info['face_num'] > 0:
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
-        # Sonic pipeline으로 비디오 생성
         pipe.process(
             img_path,
             audio_path,
@@ -68,43 +64,40 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
         )
         return res_video_path
     else:
         return -1
 def process_sonic(image, audio, dynamic_scale):
     """
-    Gradio 인터페이스 상에서 호출되는 함수.
-    1. 이미지/오디오 입력 검증
-    2. MD5 해시 통해 파일명 생성 후 캐싱
-    3. 이미 결과 파일이 있으면 재활용, 없으면 새로 비디오 생성
     """
     if image is None:
         raise gr.Error("Please upload an image")
     if audio is None:
         raise gr.Error("Please upload an audio file")
-    # 이미지 MD5 해시 계산
     buf_img = io.BytesIO()
     image.save(buf_img, format="PNG")
     img_bytes = buf_img.getvalue()
     img_md5 = get_md5(img_bytes)
-    # 오디오 MD5 해시 계산
     sampling_rate, arr = audio[:2]
     if len(arr.shape) == 1:
         arr = arr[:, None]
     audio_segment = AudioSegment(
         arr.tobytes(),
         frame_rate=sampling_rate,
         sample_width=arr.dtype.itemsize,
         channels=arr.shape[1]
     )
-    # (중요) Whisper 호환을 위해 mono/16kHz 변환
-    audio_segment = audio_segment.set_channels(1)
-    audio_segment = audio_segment.set_frame_rate(16000)
-    # 최대 60초 제한
     MAX_DURATION_MS = 60000
     if len(audio_segment) > MAX_DURATION_MS:
         audio_segment = audio_segment[:MAX_DURATION_MS]
@@ -114,12 +107,11 @@ def process_sonic(image, audio, dynamic_scale):
     audio_bytes = buf_audio.getvalue()
     audio_md5 = get_md5(audio_bytes)
-    # 파일 경로 생성
     image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
     audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
     res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
-    # 이미지/오디오 파일 캐싱
     if not os.path.exists(image_path):
         with open(image_path, "wb") as f:
             f.write(img_bytes)
@@ -127,7 +119,7 @@ def process_sonic(image, audio, dynamic_scale):
         with open(audio_path, "wb") as f:
             f.write(audio_bytes)
-    # 이미 결과가 존재하면 캐시된 결과 사용
     if os.path.exists(res_video_path):
         print(f"[INFO] Using cached result: {res_video_path}")
         return res_video_path
@@ -137,7 +129,6 @@ def process_sonic(image, audio, dynamic_scale):
         return video_result
 def get_example():
-    """예시 데이터를 로딩하는 더미 함수 (현재는 빈 리스트)."""
     return []
 css = """
@@ -175,13 +166,11 @@ with gr.Blocks(css=css) as demo:
                 label="Portrait Image",
                 elem_id="image_input"
             )
             audio_input = gr.Audio(
                 label="Voice/Audio Input (up to 1 minute)",
                 elem_id="audio_input",
                 type="numpy"
             )
             with gr.Column():
                 dynamic_scale = gr.Slider(
                     minimum=0.5,
@@ -191,7 +180,6 @@ with gr.Blocks(css=css) as demo:
                     label="Animation Intensity",
                     info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
                 )
             process_btn = gr.Button(
                 "Generate Animation",
                 variant="primary",
@@ -232,5 +220,4 @@ with gr.Blocks(css=css) as demo:
         </div>
     """)
-# 공개 링크 생성
 demo.launch(share=True)

 # 초기 실행 시 필요한 모델들을 다운로드
 cmd = (
+    'python3 -m pip install "huggingface_hub[cli]" accelerate; '
     'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
     'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir  checkpoints/stable-video-diffusion-img2vid-xt; '
     'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
     expand_ratio = 0.0
     min_resolution = 512
+    # 오디오 길이 계산
     audio = AudioSegment.from_file(audio_path)
     duration = len(audio) / 1000.0  # 초 단위
+    # 오디오 길이에 따라 inference_steps 결정 (최소 25프레임 ~ 최대 750프레임)
     inference_steps = min(max(int(duration * 12.5), 25), 750)
     print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
+    # 얼굴 인식
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
     print(f"[INFO] Face detection info: {face_info}")
+    # 얼굴이 하나라도 검출되면 -> pipeline 진행
     if face_info['face_num'] > 0:
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
         pipe.process(
             img_path,
             audio_path,
         )
         return res_video_path
     else:
+        # 얼굴이 전혀 없으면 -1 리턴
         return -1
 def process_sonic(image, audio, dynamic_scale):
     """
+    Gradio 인터페이스에서 호출되는 함수:
+    1. 이미지/오디오 검사
+    2. MD5 해시 -> 파일명
+    3. 캐시 검사 -> 없으면 영상 생성
     """
     if image is None:
         raise gr.Error("Please upload an image")
     if audio is None:
         raise gr.Error("Please upload an audio file")
+    # (1) 이미지 MD5
     buf_img = io.BytesIO()
     image.save(buf_img, format="PNG")
     img_bytes = buf_img.getvalue()
     img_md5 = get_md5(img_bytes)
+    # (2) 오디오 MD5
     sampling_rate, arr = audio[:2]
     if len(arr.shape) == 1:
         arr = arr[:, None]
     audio_segment = AudioSegment(
         arr.tobytes(),
         frame_rate=sampling_rate,
         sample_width=arr.dtype.itemsize,
         channels=arr.shape[1]
     )
+    # Whisper 호환을 위해 mono/16kHz로 변환
+    audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)
     MAX_DURATION_MS = 60000
     if len(audio_segment) > MAX_DURATION_MS:
         audio_segment = audio_segment[:MAX_DURATION_MS]
     audio_bytes = buf_audio.getvalue()
     audio_md5 = get_md5(audio_bytes)
+    # (3) 파일 경로
     image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
     audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
     res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
     if not os.path.exists(image_path):
         with open(image_path, "wb") as f:
             f.write(img_bytes)
         with open(audio_path, "wb") as f:
             f.write(audio_bytes)
+    # (4) 캐싱된 결과가 있으면 재사용
     if os.path.exists(res_video_path):
         print(f"[INFO] Using cached result: {res_video_path}")
         return res_video_path
         return video_result
 def get_example():
     return []
 css = """
                 label="Portrait Image",
                 elem_id="image_input"
             )
             audio_input = gr.Audio(
                 label="Voice/Audio Input (up to 1 minute)",
                 elem_id="audio_input",
                 type="numpy"
             )
             with gr.Column():
                 dynamic_scale = gr.Slider(
                     minimum=0.5,
                     label="Animation Intensity",
                     info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
                 )
             process_btn = gr.Button(
                 "Generate Animation",
                 variant="primary",
         </div>
     """)
 demo.launch(share=True)