Spaces:

ginigen
/

STUDIO

Runtime error

App Files Files Community

openfree committed on May 29, 2025

Commit

0712469

verified ·

1 Parent(s): fe2ed65

Update app-backup.py

Browse files

Files changed (1) hide show

app-backup.py +847 -134

app-backup.py CHANGED Viewed

@@ -1,10 +1,25 @@
 import gradio as gr
 import numpy as np
 from PIL import Image, ImageDraw
 from gradio_client import Client, handle_file
 import random
 import tempfile
-import os
 import logging
 import torch
 from diffusers import AutoencoderKL, TCDScheduler
@@ -16,22 +31,139 @@ from einops import rearrange
 from scipy.io import wavfile
 from transformers import pipeline
 # 환경 변수 설정으로 torch.load 체크 우회 (임시 해결책)
 os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
-# Spaces GPU
-try:
-    import spaces
-except:
-    # GPU 데코레이터가 없을 때를 위한 더미 데코레이터
-    class spaces:
-        @staticmethod
-        def GPU(duration=None):
-            def decorator(func):
-                return func
-            return decorator
-# MMAudio imports
 try:
     import mmaudio
 except ImportError:
@@ -45,116 +177,36 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
-# ControlNet 모델 로드
-try:
-    from controlnet_union import ControlNetModel_Union
-    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
-    # ControlNet 설정 및 로드
-    config_file = hf_hub_download(
-        "xinsir/controlnet-union-sdxl-1.0",
-        filename="config_promax.json",
-    )
-    config = ControlNetModel_Union.load_config(config_file)
-    controlnet_model = ControlNetModel_Union.from_config(config)
-    model_file = hf_hub_download(
-        "xinsir/controlnet-union-sdxl-1.0",
-        filename="diffusion_pytorch_model_promax.safetensors",
-    )
-    state_dict = load_state_dict(model_file)
-    loaded_keys = list(state_dict.keys())
-    result = ControlNetModel_Union._load_pretrained_model(
-        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
-    )
-    model = result[0]
-    model = model.to(device="cuda", dtype=torch.float16)
-    # VAE 로드
-    vae = AutoencoderKL.from_pretrained(
-        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
-    ).to("cuda")
-    # 파이프라인 로드
-    pipe = StableDiffusionXLFillPipeline.from_pretrained(
-        "SG161222/RealVisXL_V5.0_Lightning",
-        torch_dtype=torch.float16,
-        vae=vae,
-        controlnet=model,
-        variant="fp16",
-    ).to("cuda")
-    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
-    OUTPAINT_MODEL_LOADED = True
-except Exception as e:
-    logging.error(f"Failed to load outpainting models: {str(e)}")
-    OUTPAINT_MODEL_LOADED = False
-# MMAudio 모델 설정
 if torch.cuda.is_available():
     device = torch.device("cuda")
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cudnn.benchmark = True
 else:
     device = torch.device("cpu")
-dtype = torch.bfloat16
-# MMAudio 모델 초기화
-try:
-    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
-    model_mmaudio.download_if_needed()
-    output_dir = Path('./output/gradio')
-    setup_eval_logging()
-    # 번역기 설정
-    try:
-        translator = pipeline("translation",
-                             model="Helsinki-NLP/opus-mt-ko-en",
-                             device="cpu",
-                             use_fast=True,
-                             trust_remote_code=False)
-    except Exception as e:
-        logging.warning(f"Failed to load translation model: {e}")
-        translator = None
-    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-        with torch.cuda.device(device):
-            seq_cfg = model_mmaudio.seq_cfg
-            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
-            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
-            logging.info(f'Loaded weights from {model_mmaudio.model_path}')
-            feature_utils = FeaturesUtils(
-                tod_vae_ckpt=model_mmaudio.vae_path,
-                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
-                enable_conditions=True,
-                mode=model_mmaudio.mode,
-                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
-                need_vae_encoder=False
-            ).to(device, dtype).eval()
-            return net, feature_utils, seq_cfg
-    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
-    MMAUDIO_MODEL_LOADED = True
-except Exception as e:
-    logging.error(f"Failed to load MMAudio models: {str(e)}")
-    MMAUDIO_MODEL_LOADED = False
-    translator = None
 # API URLs
 TEXT2IMG_API_URL = "http://211.233.58.201:7896"
 VIDEO_API_URL = "http://211.233.58.201:7875"
-# 로깅 설정
-logging.basicConfig(level=logging.INFO)
 # Image size presets
 IMAGE_PRESETS = {
     "커스텀": {"width": 1024, "height": 1024},
@@ -172,6 +224,120 @@ IMAGE_PRESETS = {
     "LinkedIn 배너": {"width": 1584, "height": 396},
 }
 def update_dimensions(preset):
     if preset in IMAGE_PRESETS:
         return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
@@ -286,9 +452,7 @@ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
     mask = Image.new('L', target_size, 255)
     mask_draw = ImageDraw.Draw(mask)
-    # 마스크 영역 그리기 (영어 정렬과 매칭)
-    white_gaps_patch = 2
     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
@@ -322,13 +486,17 @@ def preview_outpaint(image, width, height, overlap_percentage, alignment):
     return preview
-@spaces.GPU(duration=24)
 def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
     """이미지 아웃페인팅 실행"""
     if image is None:
         return None
-    if not OUTPAINT_MODEL_LOADED:
         return Image.new('RGB', (width, height), (200, 200, 200))
     try:
@@ -345,16 +513,16 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
         # GPU에서 실행
-        with torch.autocast(device_type="cuda", dtype=torch.float16):
             (
                 prompt_embeds,
                 negative_prompt_embeds,
                 pooled_prompt_embeds,
                 negative_pooled_prompt_embeds,
-            ) = pipe.encode_prompt(final_prompt, "cuda", True)
             # 생성 프로세스
-            for generated_image in pipe(
                 prompt_embeds=prompt_embeds,
                 negative_prompt_embeds=negative_prompt_embeds,
                 pooled_prompt_embeds=pooled_prompt_embeds,
@@ -381,23 +549,27 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
 # MMAudio 관련 함수들
 def translate_prompt(text):
     try:
-        if translator is None:
             return text
         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
             with torch.no_grad():
-                translation = translator(text)[0]['translation_text']
             return translation
         return text
     except Exception as e:
         logging.error(f"Translation error: {e}")
         return text
-@spaces.GPU
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
-    if not MMAUDIO_MODEL_LOADED:
         return None
     prompt = translate_prompt(prompt)
@@ -410,14 +582,14 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     clip_frames, sync_frames, duration = load_video(video, duration)
     clip_frames = clip_frames.unsqueeze(0)
     sync_frames = sync_frames.unsqueeze(0)
-    seq_cfg.duration = duration
-    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
     audios = generate(clip_frames,
                       sync_frames, [prompt],
                       negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net_mmaudio,
                       fm=fm,
                       rng=rng,
                       cfg_strength=cfg_strength)
@@ -427,10 +599,310 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     make_video(video,
                video_save_path,
                audio,
-               sampling_rate=seq_cfg.sampling_rate,
-               duration_sec=seq_cfg.duration)
     return video_save_path
 # CSS
 css = """
 :root {
@@ -456,7 +928,7 @@ css = """
     padding: 20px !important;
     margin-bottom: 20px !important;
 }
-#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
     background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
     font-size: 1.1rem !important;
     padding: 12px 24px !important;
@@ -473,6 +945,10 @@ demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 & 오디오 생성기"
 with demo:
     gr.Markdown("# 🎨 Ginigen 스튜디오")
     with gr.Tabs() as tabs:
         # 첫 번째 탭: 텍스트 to 이미지
@@ -618,7 +1094,7 @@ with demo:
                         gr.Markdown("### 🎵 오디오 생성 설정")
                         audio_prompt = gr.Textbox(
-                            label="프롬프트 (한글 지원)" if MMAUDIO_MODEL_LOADED and translator else "프롬프트",
                             placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                             lines=3
                         )
@@ -649,9 +1125,204 @@ with demo:
                             label="오디오가 추가된 비디오",
                             interactive=False
                         )
-                        if not MMAUDIO_MODEL_LOADED:
-                            gr.Markdown("⚠️ MMAudio 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")
     # 이벤트 연결 - 첫 번째 탭
     size_preset.change(update_dimensions, [size_preset], [width, height])
@@ -689,5 +1360,47 @@ with demo:
         [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
         [output_video_with_audio]
     )
-demo.launch()

+# Spaces GPU - must be imported first!
+import os
+IS_SPACES = os.environ.get("SPACE_ID") is not None
+if IS_SPACES:
+    import spaces
+else:
+    # Dummy replacement used when the GPU decorator is not available
+    class spaces:
+        """No-op stand-in for the `spaces` package outside of a Space."""
+        @staticmethod
+        def GPU(*args, **kwargs):
+            """Return the decorated function unchanged.
+
+            Usable both as a bare decorator (``@spaces.GPU``) and as a
+            decorator factory (``@spaces.GPU(duration=60)``); every option
+            is accepted and ignored.
+            """
+            # Bare usage: the decorated function is the only positional argument.
+            if len(args) == 1 and callable(args[0]) and not kwargs:
+                return args[0]
+            def decorator(func):
+                return func
+            return decorator
+# 이제 다른 라이브러리들을 import
 import gradio as gr
 import numpy as np
 from PIL import Image, ImageDraw
 from gradio_client import Client, handle_file
 import random
 import tempfile
 import logging
 import torch
 from diffusers import AutoencoderKL, TCDScheduler
 from scipy.io import wavfile
 from transformers import pipeline
+# Imports for video background removal
+from transformers import AutoModelForImageSegmentation
+from torchvision import transforms
+# ── moviepy import ──────────────────────────────────────────
+try:
+    from moviepy.editor import (
+        VideoFileClip,
+        concatenate_videoclips,
+        ImageSequenceClip,
+        concatenate_audioclips,
+        AudioFileClip,
+        CompositeAudioClip,
+        CompositeVideoClip,
+        ColorClip
+    )
+except ImportError:
+    # 개별적으로 import 시도
+    try:
+        from moviepy.video.io.VideoFileClip import VideoFileClip
+    except ImportError:
+        from moviepy import VideoFileClip
+    try:
+        from moviepy.video.compositing.concatenate import concatenate_videoclips
+    except ImportError:
+        from moviepy import concatenate_videoclips
+    try:
+        from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
+    except ImportError:
+        from moviepy.editor import ImageSequenceClip
+    try:
+        from moviepy.audio.io.AudioFileClip import AudioFileClip
+    except ImportError:
+        from moviepy.editor import AudioFileClip
+    try:
+        from moviepy.audio.AudioClip import concatenate_audioclips, CompositeAudioClip
+    except ImportError:
+        from moviepy.editor import concatenate_audioclips, CompositeAudioClip
+    try:
+        from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
+    except ImportError:
+        from moviepy.editor import CompositeVideoClip
+    try:
+        from moviepy.video.VideoClip import ColorClip
+    except ImportError:
+        from moviepy.editor import ColorClip
+# Try the known import locations of moviepy's resize effect
+resize = None
+try:
+    from moviepy.video.fx.resize import resize
+except ImportError:
+    try:
+        from moviepy.video.fx.all import resize
+    except ImportError:
+        try:
+            # Try importing through the editor module
+            from moviepy.editor import resize
+        except ImportError:
+            pass  # resize could not be found
+# Define a replacement when resize could not be imported
+if resize is None:
+    def resize(clip, newsize=None, height=None, width=None):
+        """Fallback resize used when moviepy's resize effect cannot be imported.
+
+        Delegates to the clip's own ``resize`` method, or to ``resized`` when
+        only that one exists (the MoviePy 2.x name). The first truthy value
+        among `newsize`, `height` and `width` is applied.
+
+        Returns:
+            The resized clip, or `clip` itself when it offers no resize
+            method or no target size was given.
+        """
+        resizer = getattr(clip, 'resize', None) or getattr(clip, 'resized', None)
+        if resizer is None:
+            # Resizing is not possible: hand back the original clip
+            return clip
+        if newsize:
+            return resizer(newsize)
+        if height:
+            return resizer(height=height)
+        if width:
+            return resizer(width=width)
+        return clip
+# Try the known import locations of moviepy's speedx effect
+speedx = None
+try:
+    from moviepy.video.fx.speedx import speedx
+except ImportError:
+    try:
+        from moviepy.video.fx.all import speedx
+    except ImportError:
+        try:
+            from moviepy.editor import speedx
+        except ImportError:
+            pass  # speedx could not be found
+# Define a replacement when speedx could not be imported
+if speedx is None:
+    def speedx(clip, factor=1.0, final_duration=None):
+        """Fallback speed change used when moviepy's speedx cannot be imported.
+
+        Args:
+            clip: Clip to speed up or slow down.
+            factor: Playback-speed multiplier (2.0 plays twice as fast).
+            final_duration: Desired duration of the result; when given it
+                takes precedence over `factor`.
+
+        Returns:
+            The re-timed clip, or `clip` unchanged when it exposes none of
+            the methods tried below.
+        """
+        if hasattr(clip, 'fx') and hasattr(clip.fx, 'speedx'):
+            return clip.fx.speedx(factor, final_duration)
+        if hasattr(clip, 'with_speed_scaled'):
+            # MoviePy 2.x exposes the effect as a clip method
+            if final_duration:
+                return clip.with_speed_scaled(final_duration=final_duration)
+            return clip.with_speed_scaled(factor=factor)
+        source_duration = getattr(clip, 'duration', None)
+        if final_duration and source_duration:
+            factor = source_duration / final_duration
+        if hasattr(clip, 'fl_time'):
+            retimed = clip.fl_time(lambda t: t * factor)
+            # Re-timing alone keeps the old duration; shrink or stretch it to match
+            if source_duration is not None and factor and hasattr(retimed, 'set_duration'):
+                retimed = retimed.set_duration(source_duration / factor)
+            return retimed
+        elif hasattr(clip, 'with_fps') and factor != 1.0:
+            # Approximate a speed change by adjusting the FPS
+            new_fps = clip.fps * factor if hasattr(clip, 'fps') else 24 * factor
+            return clip.with_fps(new_fps)
+        else:
+            # Last resort: return the clip as is
+            return clip
+import time
+from concurrent.futures import ThreadPoolExecutor
+# ────────────────────────────────────────────────────────────
 # 환경 변수 설정으로 torch.load 체크 우회 (임시 해결책)
 os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
+# Simple function for GPU initialisation (required in the Spaces environment)
+@spaces.GPU(duration=1)
+def gpu_warmup():
+    """GPU warm-up: touch CUDA once if present, then report "GPU ready"."""
+    cuda_present = torch.cuda.is_available()
+    if cuda_present:
+        # Allocate a one-element tensor on the GPU; it is discarded right away.
+        torch.zeros(1).cuda()
+    return "GPU ready"
+# MMAudio imports - spaces import 이후에 와야 함
 try:
     import mmaudio
 except ImportError:
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
+# Logging configuration
+logging.basicConfig(level=logging.INFO)
+# Keep all of the existing code's settings and initialisation
+torch.set_float32_matmul_precision("medium")
+# More explicit device selection: CUDA with float16 when available,
+# otherwise CPU with float32
 if torch.cuda.is_available():
     device = torch.device("cuda")
+    torch_dtype = torch.float16
 else:
     device = torch.device("cpu")
+    torch_dtype = torch.float32
+logging.info(f"Using device: {device}")
+# Model state is kept in module-level globals; load_models() fills them in
+MODELS_LOADED = False
+BIREFNET_MODEL = None
+BIREFNET_LITE_MODEL = None
+OUTPAINT_PIPE = None
+MMAUDIO_NET = None
+MMAUDIO_FEATURE_UTILS = None
+MMAUDIO_SEQ_CFG = None
+TRANSLATOR = None
 # API URLs
 TEXT2IMG_API_URL = "http://211.233.58.201:7896"
 VIDEO_API_URL = "http://211.233.58.201:7875"
 # Image size presets
 IMAGE_PRESETS = {
     "커스텀": {"width": 1024, "height": 1024},
     "LinkedIn 배너": {"width": 1584, "height": 396},
 }
+# Transform for BiRefNet: resize to 768x768, convert to a tensor, then
+# normalise each channel (values look like the usual ImageNet mean/std - confirm).
+transform_image = transforms.Compose([
+    transforms.Resize((768, 768)),
+    transforms.ToTensor(),
+    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+])
+@spaces.GPU(duration=60)
+def load_models():
+    """Load every model used by the app into the module-level globals.
+
+    Loads the two BiRefNet segmentation models, the ControlNet-Union / SDXL
+    fill pipeline, the Korean-to-English translator and the MMAudio network
+    with its feature utilities. Returns immediately when MODELS_LOADED is
+    already set.
+
+    Returns:
+        bool: True when everything loaded (or had been loaded before), False
+        when any step raised. The error is logged and globals assigned before
+        the failure keep their values. A translator failure alone is only
+        logged as a warning and leaves TRANSLATOR as None.
+
+    NOTE(review): on ZeroGPU Spaces, confirm that globals assigned inside a
+    @spaces.GPU call are still visible to later calls.
+    """
+    import contextlib  # local import: only needed for the CPU fallback below
+    global MODELS_LOADED, BIREFNET_MODEL, BIREFNET_LITE_MODEL, OUTPAINT_PIPE
+    global MMAUDIO_NET, MMAUDIO_FEATURE_UTILS, MMAUDIO_SEQ_CFG, TRANSLATOR
+    if MODELS_LOADED:
+        return True
+    try:
+        # Load the BiRefNet models
+        logging.info("Loading BiRefNet models...")
+        BIREFNET_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
+        BIREFNET_MODEL.to(device)
+        BIREFNET_LITE_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
+        BIREFNET_LITE_MODEL.to(device)
+        # Load the ControlNet and outpainting models
+        logging.info("Loading ControlNet models...")
+        from controlnet_union import ControlNetModel_Union
+        from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+        config_file = hf_hub_download(
+            "xinsir/controlnet-union-sdxl-1.0",
+            filename="config_promax.json",
+        )
+        config = ControlNetModel_Union.load_config(config_file)
+        controlnet_model = ControlNetModel_Union.from_config(config)
+        model_file = hf_hub_download(
+            "xinsir/controlnet-union-sdxl-1.0",
+            filename="diffusion_pytorch_model_promax.safetensors",
+        )
+        state_dict = load_state_dict(model_file)
+        loaded_keys = list(state_dict.keys())
+        result = ControlNetModel_Union._load_pretrained_model(
+            controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+        )
+        model = result[0]
+        model = model.to(device=device, dtype=torch_dtype)
+        # Load the VAE
+        vae = AutoencoderKL.from_pretrained(
+            "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
+        ).to(device)
+        # Load the pipeline
+        OUTPAINT_PIPE = StableDiffusionXLFillPipeline.from_pretrained(
+            "SG161222/RealVisXL_V5.0_Lightning",
+            torch_dtype=torch_dtype,
+            vae=vae,
+            controlnet=model,
+            variant="fp16" if device.type == "cuda" else None,
+        ).to(device)
+        OUTPAINT_PIPE.scheduler = TCDScheduler.from_config(OUTPAINT_PIPE.scheduler.config)
+        # Load the MMAudio models
+        logging.info("Loading MMAudio models...")
+        model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
+        model_mmaudio.download_if_needed()
+        setup_eval_logging()
+        # Set up the translator; failure here is tolerated
+        try:
+            TRANSLATOR = pipeline("translation",
+                                 model="Helsinki-NLP/opus-mt-ko-en",
+                                 device="cpu",
+                                 use_fast=True,
+                                 trust_remote_code=False)
+        except Exception as e:
+            logging.warning(f"Failed to load translation model: {e}")
+            TRANSLATOR = None
+        # Initialise the MMAudio model
+        if torch.cuda.is_available():
+            mmaudio_dtype = torch.bfloat16
+        else:
+            mmaudio_dtype = torch.float32
+        # torch.cuda.device() rejects non-CUDA devices, so only enter it on CUDA
+        if device.type == "cuda":
+            device_scope = torch.cuda.device(device)
+        else:
+            device_scope = contextlib.nullcontext()
+        with device_scope:
+            MMAUDIO_SEQ_CFG = model_mmaudio.seq_cfg
+            MMAUDIO_NET = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
+            MMAUDIO_NET.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
+            logging.info(f'Loaded weights from {model_mmaudio.model_path}')
+            MMAUDIO_FEATURE_UTILS = FeaturesUtils(
+                tod_vae_ckpt=model_mmaudio.vae_path,
+                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
+                enable_conditions=True,
+                mode=model_mmaudio.mode,
+                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
+                need_vae_encoder=False
+            ).to(device, mmaudio_dtype).eval()
+        MODELS_LOADED = True
+        logging.info("All models loaded successfully!")
+        return True
+    except Exception as e:
+        # logging.exception keeps the traceback, which the plain message loses
+        logging.exception(f"Failed to load models: {str(e)}")
+        return False
+# 기존 함수들 모두 유지
 def update_dimensions(preset):
     if preset in IMAGE_PRESETS:
         return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
     mask = Image.new('L', target_size, 255)
     mask_draw = ImageDraw.Draw(mask)
+    # 마스크 영역 그리기
     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
     return preview
+@spaces.GPU(duration=120)
 def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
     """이미지 아웃페인팅 실행"""
     if image is None:
         return None
+    # 모델 로드 확인
+    if not MODELS_LOADED:
+        load_models()
+    if OUTPAINT_PIPE is None:
         return Image.new('RGB', (width, height), (200, 200, 200))
     try:
         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
         # GPU에서 실행
+        with torch.autocast(device_type=device.type, dtype=torch_dtype):
             (
                 prompt_embeds,
                 negative_prompt_embeds,
                 pooled_prompt_embeds,
                 negative_pooled_prompt_embeds,
+            ) = OUTPAINT_PIPE.encode_prompt(final_prompt, str(device), True)
             # 생성 프로세스
+            for generated_image in OUTPAINT_PIPE(
                 prompt_embeds=prompt_embeds,
                 negative_prompt_embeds=negative_prompt_embeds,
                 pooled_prompt_embeds=pooled_prompt_embeds,
 # MMAudio 관련 함수들
 def translate_prompt(text):
+    """Translate a prompt to English when it contains Korean characters.
+
+    Returns `text` unchanged when no translator is loaded, when it holds no
+    character in U+3131..U+D7A3, or when translation raises (the error is
+    logged).
+    """
     try:
+        if TRANSLATOR is None:
             return text
+        # Only call the translator if some character lies in U+3131..U+D7A3
+        # (Hangul jamo through Hangul syllables).
         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
             with torch.no_grad():
+                translation = TRANSLATOR(text)[0]['translation_text']
             return translation
         return text
     except Exception as e:
         logging.error(f"Translation error: {e}")
         return text
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
+    # 모델 로드 확인
+    if not MODELS_LOADED:
+        load_models()
+    if MMAUDIO_NET is None:
         return None
     prompt = translate_prompt(prompt)
     clip_frames, sync_frames, duration = load_video(video, duration)
     clip_frames = clip_frames.unsqueeze(0)
     sync_frames = sync_frames.unsqueeze(0)
+    MMAUDIO_SEQ_CFG.duration = duration
+    MMAUDIO_NET.update_seq_lengths(MMAUDIO_SEQ_CFG.latent_seq_len, MMAUDIO_SEQ_CFG.clip_seq_len, MMAUDIO_SEQ_CFG.sync_seq_len)
     audios = generate(clip_frames,
                       sync_frames, [prompt],
                       negative_text=[negative_prompt],
+                      feature_utils=MMAUDIO_FEATURE_UTILS,
+                      net=MMAUDIO_NET,
                       fm=fm,
                       rng=rng,
                       cfg_strength=cfg_strength)
     make_video(video,
                video_save_path,
                audio,
+               sampling_rate=MMAUDIO_SEQ_CFG.sampling_rate,
+               duration_sec=MMAUDIO_SEQ_CFG.duration)
     return video_save_path
+# Video background-removal helpers
+def process_bg_image(image, bg, fast_mode=False):
+    """Replace the background of a single image using a BiRefNet mask.
+
+    Args:
+        image: PIL image to process.
+        bg: New background: a "#RRGGBB" colour string, a PIL image, or
+            anything `Image.open` accepts.
+        fast_mode: Use BIREFNET_LITE_MODEL instead of BIREFNET_MODEL.
+
+    Returns:
+        The composited PIL image, or `image` unchanged when the selected
+        model has not been loaded.
+    """
+    # Only the model that is actually used has to be available
+    model = BIREFNET_LITE_MODEL if fast_mode else BIREFNET_MODEL
+    if model is None:
+        return image
+    image_size = image.size
+    input_images = transform_image(image).unsqueeze(0).to(device)
+    with torch.no_grad():
+        preds = model(input_images)[-1].sigmoid().cpu()
+    pred = preds[0].squeeze()
+    # Scale the predicted mask back to the size of the input image
+    mask = transforms.ToPILImage()(pred).resize(image_size)
+    if isinstance(bg, str) and bg.startswith("#"):
+        color_rgb = tuple(int(bg[i:i+2], 16) for i in (1, 3, 5))
+        background = Image.new("RGBA", image_size, color_rgb + (255,))
+    elif isinstance(bg, Image.Image):
+        background = bg.convert("RGBA").resize(image_size)
+    else:
+        # Close the file handle once the pixels have been converted
+        with Image.open(bg) as bg_file:
+            background = bg_file.convert("RGBA").resize(image_size)
+    return Image.composite(image, background, mask)
+def process_video_frame(frame, bg_type, bg, fast_mode, bg_frame_index, background_frames, color):
+    """Apply the chosen background to one video frame.
+
+    Args:
+        frame: Frame as an array accepted by `Image.fromarray`.
+        bg_type: "색상" (colour), "이미지" (image) or "비디오" (video); any
+            other value leaves the frame as is.
+        bg: Background image, used for the image type.
+        fast_mode: Forwarded to `process_bg_image`.
+        bg_frame_index: Index of the background frame, used for the video type.
+        background_frames: Sequence of background frames for the video type.
+        color: Colour string, used for the colour type.
+
+    Returns:
+        Tuple of (processed frame as an array, next background frame index).
+        On any error the input frame is returned unprocessed.
+    """
+    try:
+        pil_image = Image.fromarray(frame)
+        if bg_type == "색상":
+            processed_image = process_bg_image(pil_image, color, fast_mode)
+        elif bg_type == "이미지":
+            processed_image = process_bg_image(pil_image, bg, fast_mode)
+        elif bg_type == "비디오":
+            # Wrap around so a background with fewer frames than the main
+            # video loops instead of raising IndexError
+            background_frame = background_frames[bg_frame_index % len(background_frames)]
+            bg_frame_index += 1
+            background_image = Image.fromarray(background_frame)
+            processed_image = process_bg_image(pil_image, background_image, fast_mode)
+        else:
+            processed_image = pil_image
+        return np.array(processed_image), bg_frame_index
+    except Exception as e:
+        print(f"Error processing frame: {e}")
+        return frame, bg_frame_index
+@spaces.GPU(duration=300)
+def process_video_bg(vid, bg_type="색상", bg_image=None, bg_video=None, color="#00FF00",
+                     fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
+    """Replace the background of every frame of a video (generator).
+
+    Args:
+        vid: Path of the input video.
+        bg_type: "색상" (colour), "이미지" (image) or "비디오" (video).
+        bg_image: Background image for the image type.
+        bg_video: Path of the background video for the video type.
+        color: Colour string for the colour type.
+        fps: Output frame rate; 0 means use the input video's rate.
+        video_handling: "slow_down" stretches a shorter background video to
+            the input's length; any other value loops it.
+        fast_mode: Forwarded to the per-frame processing.
+        max_workers: Size of the thread pool that processes frames.
+
+    Yields:
+        3-tuples of (frame or component update, output path or component
+        update, status message): progress while frames are processed, then
+        the last frame together with the path of the written .mp4 file.
+    """
+    # Make sure the models are loaded
+    if not MODELS_LOADED:
+        load_models()
+    if BIREFNET_MODEL is None:
+        yield gr.update(visible=False), gr.update(visible=True), "BiRefNet 모델을 로드하지 못했습니다."
+        yield None, None, "BiRefNet 모델을 로드하지 못했습니다."
+        return
+    start_time = time.time()
+    opened_clips = []  # source clips to release once processing has ended
+    try:
+        video = VideoFileClip(vid)
+        opened_clips.append(video)
+        if fps == 0:
+            fps = video.fps
+        audio = video.audio
+        frames = list(video.iter_frames(fps=fps))
+        processed_frames = []
+        yield gr.update(visible=True), gr.update(visible=False), f"처리 시작... 경과 시간: 0초"
+        if bg_type == "비디오":
+            background_video = VideoFileClip(bg_video)
+            opened_clips.append(background_video)
+            if background_video.duration < video.duration:
+                if video_handling == "slow_down":
+                    # speedx multiplies the playback speed by `factor`, so
+                    # stretching the shorter background needs a factor below 1
+                    background_video = speedx(background_video, factor=background_video.duration / video.duration)
+                else:  # video_handling == "loop"
+                    background_video = concatenate_videoclips([background_video] * int(video.duration / background_video.duration + 1))
+            background_frames = list(background_video.iter_frames(fps=fps))
+        else:
+            background_frames = None
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(process_video_frame, frame, bg_type, bg_image, fast_mode,
+                                     i, background_frames, color) for i, frame in enumerate(frames)]
+            for i, future in enumerate(futures):
+                result, _ = future.result()
+                processed_frames.append(result)
+                elapsed_time = time.time() - start_time
+                yield result, None, f"프레임 {i+1}/{len(frames)} 처리 중... 경과 시간: {elapsed_time:.2f}초"
+        processed_video = ImageSequenceClip(processed_frames, fps=fps)
+        # MoviePy 2.x names this method with_audio, 1.x names it set_audio
+        attach_audio = getattr(processed_video, "with_audio", None) or processed_video.set_audio
+        processed_video = attach_audio(audio)
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+            temp_filepath = temp_file.name
+        # Write only after our own handle on the temporary file is closed
+        processed_video.write_videofile(temp_filepath, codec="libx264")
+        elapsed_time = time.time() - start_time
+        yield gr.update(visible=False), gr.update(visible=True), f"처리 완료! 경과 시간: {elapsed_time:.2f}초"
+        yield processed_frames[-1], temp_filepath, f"처리 완료! 경과 시간: {elapsed_time:.2f}초"
+    except Exception as e:
+        print(f"Error: {e}")
+        elapsed_time = time.time() - start_time
+        yield gr.update(visible=False), gr.update(visible=True), f"비디오 처리 오류: {e}. 경과 시간: {elapsed_time:.2f}초"
+        yield None, None, f"비디오 처리 오류: {e}. 경과 시간: {elapsed_time:.2f}초"
+    finally:
+        # Release the readers held by the source clips
+        for clip in opened_clips:
+            try:
+                clip.close()
+            except Exception as close_error:
+                logging.warning(f"Failed to close video clip: {close_error}")
+@spaces.GPU(duration=180)
+def merge_videos_with_audio(video_files, audio_file, audio_volume, output_fps):
+    """여러 비디오를 병합하고 오디오를 추가하는 함수"""
+    if not video_files:
+        return None, "비디오 파일을 업로드해주세요."
+    if isinstance(video_files, list) and len(video_files) > 10:
+        return None, "최대 10개의 비디오만 업로드 가능합니다."
+    try:
+        # 상태 업데이트
+        status = "비디오 파일 정렬 중..."
+        # 파일 경로와 파일명을 튜플로 저장하고 파일명으로 정렬
+        video_paths = []
+        if isinstance(video_files, list):
+            for video_file in video_files:
+                if video_file is not None:
+                    video_paths.append(video_file)
+        else:
+            video_paths.append(video_files)
+        # 파일명으로 정렬 (경로에서 파일명만 추출하여 정렬)
+        video_paths.sort(key=lambda x: os.path.basename(x))
+        status = f"{len(video_paths)}개의 비디오 로드 중..."
+        # 비디오 클립 로드
+        video_clips = []
+        clip_sizes = []
+        for i, video_path in enumerate(video_paths):
+            status = f"비디오 {i+1}/{len(video_paths)} 로드 중: {os.path.basename(video_path)}"
+            clip = VideoFileClip(video_path)
+            video_clips.append(clip)
+            # 각 클립의 크기 저장
+            try:
+                clip_sizes.append((clip.w, clip.h))
+            except:
+                clip_sizes.append(clip.size)
+        # 첫 번째 비디오의 크기를 기준으로 함
+        target_width, target_height = clip_sizes[0]
+        # 모든 비디오의 크기가 같은지 확인
+        all_same_size = all(size == (target_width, target_height) for size in clip_sizes)
+        if not all_same_size:
+            logging.warning(f"비디오 크기가 서로 다릅니다. 첫 번째 비디오 크기({target_width}x{target_height})로 조정합니다.")
+            # 크기가 다른 비디오들을 조정
+            adjusted_clips = []
+            for clip, size in zip(video_clips, clip_sizes):
+                if size != (target_width, target_height):
+                    # resize 함수가 있으면 사용, 없으면 대체 방법 사용
+                    if resize is not None:
+                        adjusted_clip = resize(clip, newsize=(target_width, target_height))
+                    else:
+                        # resize가 없을 때 대체 방법
+                        # clip.resize() 메서드 사용 시도
+                        if hasattr(clip, 'resize'):
+                            adjusted_clip = clip.resize((target_width, target_height))
+                        else:
+                            # 최후의 수단: 그대로 사용
+                            adjusted_clip = clip
+                            logging.warning(f"Cannot resize video. Using original size.")
+                    adjusted_clips.append(adjusted_clip)
+                else:
+                    adjusted_clips.append(clip)
+            video_clips = adjusted_clips
+        # 첫 번째 비디오의 FPS를 기본값으로 사용
+        if output_fps == 0:
+            output_fps = video_clips[0].fps
+        status = "비디오 병합 중..."
+        # 비디오 병합
+        final_video = concatenate_videoclips(video_clips, method="compose")
+        # 오디오 처리
+        if audio_file:
+            status = "오디오 처리 중..."
+            try:
+                # 오디오 파일 경로 확인
+                if isinstance(audio_file, str):
+                    audio_path = audio_file
+                else:
+                    # gr.Audio에서 반환된 튜플인 경우
+                    audio_path = audio_file
+                logging.info(f"Processing audio from: {audio_path}")
+                # 오디오 로드
+                if audio_path.endswith(('.mp4', '.avi', '.mov', '.mkv')):
+                    # 비디오 파일에서 오디오 추출
+                    temp_video = VideoFileClip(audio_path)
+                    audio_clip = temp_video.audio
+                    temp_video.close()
+                else:
+                    # 오디오 파일 직접 로드
+                    audio_clip = AudioFileClip(audio_path)
+                if audio_clip is None:
+                    raise ValueError("오디오를 로드할 수 없습니다.")
+                # 볼륨 조절
+                if audio_volume != 100:
+                    audio_clip = audio_clip.volumex(audio_volume / 100)
+                # 오디오를 비디오 길이에 맞춤
+                video_duration = final_video.duration
+                audio_duration = audio_clip.duration
+                if audio_duration > video_duration:
+                    # 오디오가 더 길면 잘라냄
+                    audio_clip = audio_clip.subclip(0, video_duration)
+                elif audio_duration < video_duration:
+                    # 오디오가 더 짧으면 반복
+                    loops_needed = int(video_duration / audio_duration) + 1
+                    audio_clips_list = [audio_clip] * loops_needed
+                    looped_audio = concatenate_audioclips(audio_clips_list)
+                    audio_clip = looped_audio.subclip(0, video_duration)
+                # 기존 오디오 제거하고 새 오디오로 교체
+                # (기존 오디오와 합성하려면 아래 주석 해제)
+                final_video = final_video.set_audio(audio_clip)
+                # 기존 오디오와 새 오디오 합성을 원하는 경우:
+                # if final_video.audio:
+                #     final_audio = CompositeAudioClip([final_video.audio, audio_clip])
+                #     final_video = final_video.set_audio(final_audio)
+                # else:
+                #     final_video = final_video.set_audio(audio_clip)
+                logging.info("Audio successfully added to video")
+            except Exception as e:
+                logging.error(f"오디오 처리 중 오류 발생: {str(e)}")
+                # 오디오 처리 실패해도 비디오는 계속 처리
+                status = f"오디오 처리 실패: {str(e)}, 비디오만 병합합니다."
+        status = "비디오 저장 중..."
+        # 임시 파일로 저장
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+            temp_filepath = temp_file.name
+            # 코덱 설정 - 원본 품질 유지
+            final_video.write_videofile(
+                temp_filepath,
+                fps=output_fps,
+                codec="libx264",
+                audio_codec="aac",
+                preset="medium",  # 품질 설정
+                bitrate="5000k",  # 비트레이트 설정으로 품질 유지
+                audio_bitrate="192k"
+            )
+        # 리소스 정리
+        for clip in video_clips:
+            clip.close()
+        if 'adjusted_clips' in locals():
+            for clip in adjusted_clips:
+                if clip not in video_clips:
+                    clip.close()
+        if audio_file and 'audio_clip' in locals():
+            audio_clip.close()
+        final_video.close()
+        return temp_filepath, f"✅ 성공적으로 {len(video_paths)}개의 비디오를 병합했습니다! (크기: {target_width}x{target_height})"
+    except Exception as e:
+        logging.error(f"Video merge error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None, f"❌ 오류 발생: {str(e)}"
 # CSS
 css = """
 :root {
     padding: 20px !important;
     margin-bottom: 20px !important;
 }
+#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn, #bg-remove-btn, #merge-btn {
     background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
     font-size: 1.1rem !important;
     padding: 12px 24px !important;
 with demo:
     gr.Markdown("# 🎨 Ginigen 스튜디오")
+    gr.Markdown("처음 사용 시 모델 로딩에 시간이 걸릴 수 있습니다. 잠시만 기다려주세요.")
+    # 모델 로드 상태 표시
+    model_status = gr.Textbox(label="모델 상태", value="모델 로딩 대기 중...", interactive=False)
     with gr.Tabs() as tabs:
         # 첫 번째 탭: 텍스트 to 이미지
                         gr.Markdown("### 🎵 오디오 생성 설정")
                         audio_prompt = gr.Textbox(
+                            label="프롬프트 (한글 지원)",
                             placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                             lines=3
                         )
                             label="오디오가 추가된 비디오",
                             interactive=False
                         )
+        # Fourth tab: video editing.
+        # Collects several video files, an optional audio track with a volume
+        # slider, and an output FPS; shows the merged video and a status box.
+        with gr.Tab("비디오 편집", elem_classes="tabitem"):
+            with gr.Row(equal_height=True):
+                # Input column
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎥 비디오 업로드 (최대 10개)")
+                        gr.Markdown("**파일명이 작을수록 우선순위가 높습니다** (예: 1.mp4, 2.mp4, 3.mp4)")
+                        # Multi-file upload restricted to video types; values are
+                        # delivered to the handler as file paths.
+                        video_files = gr.File(
+                            label="비디오 파일들",
+                            file_count="multiple",
+                            file_types=["video"],
+                            type="filepath"
+                        )
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎵 오디오 설정 (선택)")
+                        gr.Markdown("**주의**: 업로드한 오디오가 비디오의 기존 오디오를 완전히 대체합니다.")
+                        # Optional audio track; upload is the only enabled source and
+                        # the value is delivered as a file path.
+                        audio_file = gr.Audio(
+                            label="오디오 파일 (MP3, WAV, M4A 등)",
+                            type="filepath",
+                            sources=["upload"]
+                        )
+                        # Volume as a percentage (0-200); 100 is labelled as the original volume.
+                        audio_volume = gr.Slider(
+                            minimum=0,
+                            maximum=200,
+                            value=100,
+                            step=1,
+                            label="오디오 볼륨 (%)",
+                            info="100% = 원본 볼륨"
+                        )
+                        gr.Markdown("""
+                        **오디오 옵션**:
+                        - 오디오가 비디오보다 짧으면 자동으로 반복됩니다
+                        - 오디오가 비디오보다 길면 비디오 길이에 맞춰 잘립니다
+                        """)
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### ⚙️ 편집 설정")
+                        # Output FPS; per the label, 0 means "use the first video's FPS".
+                        output_fps = gr.Slider(
+                            minimum=0,
+                            maximum=60,
+                            value=0,
+                            step=1,
+                            label="출력 FPS (0 = 첫 번째 비디오의 FPS 사용)"
+                        )
+                        gr.Markdown("""
+                        **크기 처리**:
+                        - 첫 번째 비디오의 크기가 기준이 됩니다
+                        - 다른 크기의 비디오는 첫 번째 비디오 크기로 조정됩니다
+                        - 최상의 결과를 위해 같은 크기의 비디오를 사용하세요
+                        """)
+                        # elem_id "merge-btn" is the hook used by the CSS button selector.
+                        merge_videos_btn = gr.Button("🎬 비디오 병합", variant="primary", elem_id="merge-btn")
+                # Output column
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎬 병합 결과")
+                        # Read-only status text and the merged video player.
+                        merge_status = gr.Textbox(label="처리 상태", interactive=False)
+                        merged_video = gr.Video(label="병합된 비디오")
+                        gr.Markdown("""
+                        ### ℹ️ 사용 방법
+                        1. 여러 비디오 파일을 업로드하세요 (최대 10개)
+                        2. 파일명이 작은 순서대로 자동 정렬됩니다
+                        3. (선택) 오디오 파일을 추가하고 볼륨을 조절하세요
+                        4. '비디오 병합' 버튼을 클릭하세요
+                        **특징**:
+                        - ✅ 첫 번째 비디오의 크기를 기준으로 통합
+                        - ✅ 업로드한 오디오가 전체 비디오에 적용됩니다
+                        - ✅ 높은 비트레이트로 품질 유지
+                        **팁**:
+                        - 파일명을 01.mp4, 02.mp4, 03.mp4 형식으로 지정하면 순서 관리가 쉽습니다
+                        - 오디오를 추가하면 기존 비디오의 오디오는 대체됩니다
+                        """)
+        # Fifth tab: video background removal / compositing.
+        # Collects an input video, a background choice (color, image or video) and
+        # processing settings; shows the final video and an elapsed-time box.
+        with gr.Tab("비디오 배경제거/합성", elem_classes="tabitem"):
+            with gr.Row(equal_height=True):
+                # Input column
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎥 비디오 업로드")
+                        bg_video_input = gr.Video(
+                            label="입력 비디오",
+                            interactive=True
+                        )
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎨 배경 설정")
+                        # Background type selector; defaults to the color option.
+                        bg_type = gr.Radio(
+                            ["색상", "이미지", "비디오"],
+                            label="배경 유형",
+                            value="색상",
+                            interactive=True
+                        )
+                        # Only the color picker starts visible, matching the default
+                        # background type; the image/video inputs start hidden.
+                        color_picker = gr.ColorPicker(
+                            label="배경 색상",
+                            value="#00FF00",
+                            visible=True,
+                            interactive=True
+                        )
+                        bg_image_input = gr.Image(
+                            label="배경 이미지",
+                            type="filepath",
+                            visible=False,
+                            interactive=True
+                        )
+                        bg_video_bg = gr.Video(
+                            label="배경 비디오",
+                            visible=False,
+                            interactive=True
+                        )
+                        # Hidden container for options that apply to a video background.
+                        with gr.Column(visible=False) as video_handling_options:
+                            video_handling_radio = gr.Radio(
+                                ["slow_down", "loop"],
+                                label="비디오 처리 방식",
+                                value="slow_down",
+                                interactive=True,
+                                info="slow_down: 배경 비디오를 느리게 재생, loop: 배경 비디오를 반복"
+                            )
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### ⚙️ 처리 설정")
+                        # Output FPS; per the label, 0 means "keep the original FPS".
+                        fps_slider = gr.Slider(
+                            minimum=0,
+                            maximum=60,
+                            step=1,
+                            value=0,
+                            label="출력 FPS (0 = 원본 FPS 유지)",
+                            interactive=True
+                        )
+                        # Fast mode is on by default; the label names BiRefNet_lite.
+                        fast_mode_checkbox = gr.Checkbox(
+                            label="빠른 모드 (BiRefNet_lite 사용)",
+                            value=True,
+                            interactive=True
+                        )
+                        # Worker count (1-32, default 10); described in the UI as the
+                        # number of frames processed in parallel.
+                        max_workers_slider = gr.Slider(
+                            minimum=1,
+                            maximum=32,
+                            step=1,
+                            value=10,
+                            label="최대 워커 수",
+                            info="병렬로 처리할 프레임 수",
+                            interactive=True
+                        )
+                        # elem_id "bg-remove-btn" is the hook used by the CSS button selector.
+                        bg_remove_btn = gr.Button("🎬 배경 변경", variant="primary", elem_id="bg-remove-btn")
+                # Output column
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="panel-box"):
+                        gr.Markdown("### 🎬 처리 결과")
+                        # The streaming preview image is created hidden.
+                        stream_image = gr.Image(label="실시간 스트리밍", visible=False)
+                        output_bg_video = gr.Video(label="최종 비디오")
+                        time_textbox = gr.Textbox(label="경과 시간", interactive=False)
+                        gr.Markdown("""
+                        ### ℹ️ 사용 방법
+                        1. 비디오를 업로드하세요
+                        2. 원하는 배경 유형을 선택하세요
+                        3. 설정을 조정하고 '배경 변경' 버튼을 클릭하세요
+                        **참고**: GPU 제한으로 한 번에 약 200프레임까지 처리 가능합니다.
+                        긴 비디오는 작은 조각으로 나누어 처리하세요.
+                        """)
+    # 모델 로드 함수 실행
+    def on_demo_load():
+        try:
+            if IS_SPACES:
+                # Spaces 환경에서 GPU 워밍업
+                gpu_warmup()
+            # 모델 로드는 첫 번째 GPU 함수 호출 시 자동으로 수행됨
+            return "모델 로딩 준비 완료"
+        except Exception as e:
+            return f"초기화 오류: {str(e)}"
     # 이벤트 연결 - 첫 번째 탭
     size_preset.change(update_dimensions, [size_preset], [width, height])
         [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
         [output_video_with_audio]
     )
+    # Event wiring - fourth tab (video editing).
+    # Clicking the merge button calls merge_videos_with_audio with the uploaded
+    # videos, optional audio, volume and output FPS (passed in this order) and
+    # routes its two return values to the video player and the status textbox.
+    merge_videos_btn.click(
+        merge_videos_with_audio,
+        inputs=[video_files, audio_file, audio_volume, output_fps],
+        outputs=[merged_video, merge_status]
+    )
+    # 이벤트 연결 - 다섯 번째 탭 (비디오 배경제거/합성)
+    def update_bg_visibility(bg_type):
+        if bg_type == "색상":
+            return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+        elif bg_type == "이미지":
+            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+        elif bg_type == "비디오":
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
+        else:
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+    # Re-evaluate which background input widgets are visible whenever the
+    # background type changes; outputs map one-to-one onto the four updates
+    # returned by update_bg_visibility.
+    bg_type.change(
+        update_bg_visibility,
+        inputs=bg_type,
+        outputs=[color_picker, bg_image_input, bg_video_bg, video_handling_options]
+    )
+    # Clicking the button runs process_video_bg with the tab's inputs (passed in
+    # this order); its results go to the stream image, the final video and the
+    # elapsed-time textbox.
+    bg_remove_btn.click(
+        process_video_bg,
+        inputs=[bg_video_input, bg_type, bg_image_input, bg_video_bg, color_picker,
+                fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
+        outputs=[stream_image, output_bg_video, time_textbox]
+    )
+    # Run on demo load: write on_demo_load's status string into the model-status box
+    demo.load(on_demo_load, outputs=model_status)
+if __name__ == "__main__":
+    # Spaces 환경에서 추가 체크
+    if IS_SPACES:
+        try:
+            gpu_warmup()
+        except:
+            pass
+    demo.launch()