Update app.py
app.py
CHANGED
@@ -67,15 +67,25 @@ output_dir = Path('./output/gradio')
 setup_eval_logging()
 net, feature_utils, seq_cfg = get_model()
 
-
-@torch.inference_mode()
+
 def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
                    seed: int = -1, num_steps: int = 15,
-                   cfg_strength: float = 4.0, target_duration: float =
+                   cfg_strength: float = 4.0, target_duration: float = None):  # target_duration is now optional
     try:
         logger.info("Starting audio generation process")
         torch.cuda.empty_cache()
 
+        # Check the video duration
+        cap = cv2.VideoCapture(video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        video_duration = total_frames / fps
+        cap.release()
+
+        # Use the actual video duration as target_duration
+        target_duration = video_duration
+        logger.info(f"Video duration: {target_duration} seconds")
+
         rng = torch.Generator(device=device)
         if seed >= 0:
             rng.manual_seed(seed)
@@ -84,8 +94,8 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
 
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
-        # load_video
-        video_info = load_video(video_path, duration_sec=target_duration)
+        # Call load_video with the actual video duration
+        video_info = load_video(video_path, duration_sec=target_duration)
 
         if video_info is None:
             logger.error("Failed to load video")
@@ -99,16 +109,20 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
             logger.error("Failed to extract frames from video")
             return video_path
 
-        #
+        # Trim to the actual number of video frames
         clip_frames = clip_frames[:int(actual_duration * video_info.fps)]
         sync_frames = sync_frames[:int(actual_duration * video_info.fps)]
 
         clip_frames = clip_frames.unsqueeze(0).to(device, dtype=torch.float16)
         sync_frames = sync_frames.unsqueeze(0).to(device, dtype=torch.float16)
 
+        # Update the sequence config
         seq_cfg.duration = actual_duration
         net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
 
+        logger.info(f"Generating audio for {actual_duration} seconds...")
+
+
         logger.info("Generating audio...")
         with torch.cuda.amp.autocast():
             audios = generate(clip_frames,
@@ -356,6 +370,15 @@ def generate_video(image, prompt):
 
     final_path = add_watermark(output_path)
 
+    # Check the video duration
+    cap = cv2.VideoCapture(final_path)
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    video_duration = total_frames / fps
+    cap.release()
+
+    logger.info(f"Original video duration: {video_duration} seconds")
+
     # Add audio processing
     try:
         logger.info("Starting audio generation process")
@@ -365,8 +388,8 @@ def generate_video(image, prompt):
             negative_prompt="music",
            seed=-1,
            num_steps=20,
-           cfg_strength=4.5
-           target_duration
+           cfg_strength=4.5
+           # target_duration removed - the video duration is used automatically
        )
 
        if final_path_with_audio != final_path:
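
The commit reads the clip length by opening the file with OpenCV and dividing the frame count by the FPS, in both video_to_audio and generate_video, without guarding against a file that fails to open or a zero FPS report. A minimal sketch of how that probe could be factored into one reusable helper; the name get_video_duration and the None-on-failure behaviour are illustrative assumptions, not part of the commit:

from typing import Optional

import cv2


def get_video_duration(video_path: str) -> Optional[float]:
    # Hypothetical helper (not in the commit): return a clip's duration in seconds,
    # or None if the file cannot be opened or reports no FPS.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return None
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    cap.release()
    if fps <= 0:
        return None
    return total_frames / fps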
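
Because target_duration is now optional and is immediately overwritten inside video_to_audio, callers only need to pass the sampling parameters. A hedged illustration of calling the updated function directly; the file path and prompt are placeholders, while negative_prompt, seed, num_steps and cfg_strength mirror the values used in the diff:

# Illustrative call of the updated signature; the clip's own length is used,
# so no target_duration argument is passed.
result_path = video_to_audio(
    "example_clip.mp4",                   # placeholder path
    prompt="gentle rain on a tin roof",   # placeholder prompt
    negative_prompt="music",
    seed=-1,
    num_steps=20,
    cfg_strength=4.5,
)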