Spaces:

ginipick
/

Dokdo-multimodal

Paused

App Files Community

aiqcamp committed on Dec 22, 2024

Commit

7a8cebd

verified ·

1 Parent(s): e76dd8a

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -17

app.py CHANGED Viewed

@@ -65,11 +65,11 @@ output_dir = Path('./output/gradio')
 setup_eval_logging()
 net, feature_utils, seq_cfg = get_model()
-@spaces.GPU(duration=60)
 @torch.inference_mode()
 def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
-                   seed: int = -1, num_steps: int = 20,
-                   cfg_strength: float = 4.5, target_duration: float = 6.0):
     try:
         logger.info("Starting audio generation process")
         torch.cuda.empty_cache()
@@ -83,16 +83,12 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
         # load_video 함수 호출 수정
-        video_info = load_video(video_path)  # duration 파라미터 제거
         if video_info is None:
             logger.error("Failed to load video")
             return video_path
-        # 비디오 길이 조정이 필요한 경우 여기서 처리
-        if hasattr(video_info, 'set_duration'):
-            video_info.set_duration(target_duration)
         clip_frames = video_info.clip_frames
         sync_frames = video_info.sync_frames
         actual_duration = video_info.duration_sec
@@ -101,6 +97,10 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
             logger.error("Failed to extract frames from video")
             return video_path
         clip_frames = clip_frames.unsqueeze(0).to(device, dtype=torch.float16)
         sync_frames = sync_frames.unsqueeze(0).to(device, dtype=torch.float16)
@@ -108,15 +108,16 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
         net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
         logger.info("Generating audio...")
-        audios = generate(clip_frames,
-                         sync_frames,
-                         [prompt],
-                         negative_text=[negative_prompt],
-                         feature_utils=feature_utils,
-                         net=net,
-                         fm=fm,
-                         rng=rng,
-                         cfg_strength=cfg_strength)
         if audios is None:
             logger.error("Failed to generate audio")

 setup_eval_logging()
 net, feature_utils, seq_cfg = get_model()
+@spaces.GPU(duration=30)  # 30초로 제한
 @torch.inference_mode()
 def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
+                   seed: int = -1, num_steps: int = 15,
+                   cfg_strength: float = 4.0, target_duration: float = 4.0):
     try:
         logger.info("Starting audio generation process")
         torch.cuda.empty_cache()
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
         # load_video 함수 호출 수정
+        video_info = load_video(video_path, duration_sec=target_duration)  # duration_sec 파라미터로 변경
         if video_info is None:
             logger.error("Failed to load video")
             return video_path
         clip_frames = video_info.clip_frames
         sync_frames = video_info.sync_frames
         actual_duration = video_info.duration_sec
             logger.error("Failed to extract frames from video")
             return video_path
+        # 메모리 최적화
+        clip_frames = clip_frames[:int(actual_duration * video_info.fps)]
+        sync_frames = sync_frames[:int(actual_duration * video_info.fps)]
         clip_frames = clip_frames.unsqueeze(0).to(device, dtype=torch.float16)
         sync_frames = sync_frames.unsqueeze(0).to(device, dtype=torch.float16)
         net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
         logger.info("Generating audio...")
+        with torch.cuda.amp.autocast():
+            audios = generate(clip_frames,
+                            sync_frames,
+                            [prompt],
+                            negative_text=[negative_prompt],
+                            feature_utils=feature_utils,
+                            net=net,
+                            fm=fm,
+                            rng=rng,
+                            cfg_strength=cfg_strength)
         if audios is None:
             logger.error("Failed to generate audio")