Spaces:

CocoBro
/

MMEdit

Configuration error

App Files Files Community

CocoBro committed on Dec 23, 2025

Commit

a2ca450

1 Parent(s): 43bf2c1

fix tf32

Browse files

Files changed (1) hide show

app.py +30 -19

app.py CHANGED Viewed

@@ -23,16 +23,14 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
 logger = logging.getLogger("mmedit_space")
-# ---------------------------------------------------------
-# HF Repo IDs（按你的默认需求）
-# ---------------------------------------------------------
 MMEDIT_REPO_ID = os.environ.get("MMEDIT_REPO_ID", "CocoBro/MMEdit")
 MMEDIT_REVISION = os.environ.get("MMEDIT_REVISION", None)
 QWEN_REPO_ID = os.environ.get("QWEN_REPO_ID", "Qwen/Qwen2-Audio-7B-Instruct")
 QWEN_REVISION = os.environ.get("QWEN_REVISION", None)
-# 如果 Qwen gated：Space 里把 HF_TOKEN 设为 Secret
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 OUTPUT_DIR = Path(os.environ.get("OUTPUT_DIR", "./outputs"))
@@ -41,8 +39,6 @@ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 USE_AMP = os.environ.get("USE_AMP", "0") == "1"
 AMP_DTYPE = os.environ.get("AMP_DTYPE", "bf16")  # "bf16" or "fp16"
-# ZeroGPU：缓存 CPU pipeline（不要缓存 CUDA Tensor）
-# cache: key -> (model_cpu, scheduler, target_sr)
 _PIPELINE_CACHE: Dict[str, Tuple[object, object, int]] = {}
 # cache: key -> (repo_root, qwen_root)
 _MODEL_DIR_CACHE: Dict[str, Tuple[Path, Path]] = {}
@@ -89,6 +85,8 @@ def load_and_process_audio(audio_path: str, target_sr: int):
     import torchaudio
     import librosa
     path = Path(audio_path)
     if not path.exists():
         raise FileNotFoundError(f"Audio file not found: {audio_path}")
@@ -184,7 +182,11 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
     from safetensors.torch import load_file
     import diffusers.schedulers as noise_schedulers
-    # 尝试导入项目配置
     try:
         from utils.config import register_omegaconf_resolvers
         register_omegaconf_resolvers()
@@ -192,12 +194,10 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
     if not audio_file: return None, "Please upload audio."
-    # 局部变量，用于 finally 清理
     model = None
     try:
-        # ==========================================
-        # 1. 就在这里加载模型！利用 ZeroGPU 的大内存
         # ==========================================
         logger.info("🚀 Starting ZeroGPU Task...")
@@ -205,7 +205,7 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
         repo_root, qwen_root = resolve_model_dirs()
         exp_cfg = OmegaConf.to_container(OmegaConf.load(repo_root / "config.yaml"), resolve=True)
-        # 路径修复逻辑
         vae_ckpt = exp_cfg["model"]["autoencoder"].get("pretrained_ckpt", "")
         if vae_ckpt:
             p1 = repo_root / "vae" / Path(vae_ckpt).name
@@ -214,7 +214,7 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
             elif p2.exists(): exp_cfg["model"]["autoencoder"]["pretrained_ckpt"] = str(p2)
         exp_cfg["model"]["content_encoder"]["text_encoder"]["model_path"] = str(qwen_root)
-        # 实例化模型 (此时消耗大量 CPU 内存，但 ZeroGPU 环境扛得住)
         logger.info("Instantiating model (Hydra)...")
         model = hydra.utils.instantiate(exp_cfg["model"], _convert_="all")
@@ -227,7 +227,6 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
         gc.collect()
         # ==========================================
-        # 2. 立即转到 GPU (FP16)
         # ==========================================
         device = torch.device("cuda")
         logger.info("Moving model to CUDA (FP16)...")
@@ -279,9 +278,7 @@ def run_edit(audio_file, caption, num_steps, guidance_scale, guidance_rescale, s
         with torch.no_grad(), torch.autocast("cuda", dtype=torch.float16):
             out = model.inference(scheduler=scheduler, **batch)
-        # ==========================================
-        # 4. 保存结果
-        # ==========================================
         out_audio = out[0, 0].detach().float().cpu().numpy()
         out_path = OUTPUT_DIR / f"{Path(audio_file).stem}_edited.wav"
         sf.write(str(out_path), out_audio, samplerate=target_sr)
@@ -311,9 +308,23 @@ def build_demo():
                 audio_in = gr.Audio(label="Input", type="filepath")
                 caption = gr.Textbox(label="Instruction", lines=3)
                 gr.Examples(
-                    label="Examples",
-                    examples=[["./Ym8O802VvJes.wav", "Mix in dog barking around the middle."]],
-                    inputs=[audio_in, caption],
                 )
                 with gr.Row():
                     num_steps = gr.Slider(10, 100, 50, step=1, label="Steps")

 logger = logging.getLogger("mmedit_space")
 MMEDIT_REPO_ID = os.environ.get("MMEDIT_REPO_ID", "CocoBro/MMEdit")
 MMEDIT_REVISION = os.environ.get("MMEDIT_REVISION", None)
 QWEN_REPO_ID = os.environ.get("QWEN_REPO_ID", "Qwen/Qwen2-Audio-7B-Instruct")
 QWEN_REVISION = os.environ.get("QWEN_REVISION", None)
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 OUTPUT_DIR = Path(os.environ.get("OUTPUT_DIR", "./outputs"))
 USE_AMP = os.environ.get("USE_AMP", "0") == "1"
 AMP_DTYPE = os.environ.get("AMP_DTYPE", "bf16")  # "bf16" or "fp16"
 _PIPELINE_CACHE: Dict[str, Tuple[object, object, int]] = {}
 # cache: key -> (repo_root, qwen_root)
 _MODEL_DIR_CACHE: Dict[str, Tuple[Path, Path]] = {}
     import torchaudio
     import librosa
     path = Path(audio_path)
     if not path.exists():
         raise FileNotFoundError(f"Audio file not found: {audio_path}")
     from safetensors.torch import load_file
     import diffusers.schedulers as noise_schedulers
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     try:
         from utils.config import register_omegaconf_resolvers
         register_omegaconf_resolvers()
     if not audio_file: return None, "Please upload audio."
     model = None
     try:
         # ==========================================
         logger.info("🚀 Starting ZeroGPU Task...")
         repo_root, qwen_root = resolve_model_dirs()
         exp_cfg = OmegaConf.to_container(OmegaConf.load(repo_root / "config.yaml"), resolve=True)
+        # Path fix-up: point the autoencoder's pretrained_ckpt at an existing local copy, if any.
         vae_ckpt = exp_cfg["model"]["autoencoder"].get("pretrained_ckpt", "")
         if vae_ckpt:
             p1 = repo_root / "vae" / Path(vae_ckpt).name
             elif p2.exists(): exp_cfg["model"]["autoencoder"]["pretrained_ckpt"] = str(p2)
         exp_cfg["model"]["content_encoder"]["text_encoder"]["model_path"] = str(qwen_root)
+        # Instantiate the model from the resolved config via Hydra.
         logger.info("Instantiating model (Hydra)...")
         model = hydra.utils.instantiate(exp_cfg["model"], _convert_="all")
         gc.collect()
         # ==========================================
         # ==========================================
         device = torch.device("cuda")
         logger.info("Moving model to CUDA (FP16)...")
         with torch.no_grad(), torch.autocast("cuda", dtype=torch.float16):
             out = model.inference(scheduler=scheduler, **batch)
         out_audio = out[0, 0].detach().float().cpu().numpy()
         out_path = OUTPUT_DIR / f"{Path(audio_file).stem}_edited.wav"
         sf.write(str(out_path), out_audio, samplerate=target_sr)
                 audio_in = gr.Audio(label="Input", type="filepath")
                 caption = gr.Textbox(label="Instruction", lines=3)
                 gr.Examples(
+                    label="Examples (Click to load)",
+                    # Format: [[audio path 1, prompt 1], [audio path 2, prompt 2], ...]
+                    examples=[
+                        # Example 1 (the original one)
+                        ["./Ym8O802VvJes.wav", "Mix in dog barking around the middle."],
+                        # Example 2 (newly added)
+                        ["./YDKM2KjNkX18.wav", "Incorporate Telephone bell ringing into the background."],
+                        # Example 3 (newly added)
+                        ["./drop_audiocaps_1.wav", "Remove the sound of several beeps."],
+                        # Example 4 (newly added)
+                        ["./reorder_audiocaps_1.wav", "Switch the positions of the woman's voice and whistling."]
+                    ],
+                    inputs=[audio_in, caption],  # 对应上面列表的顺序：第一个是 Audio，第二个是 Textbox
+                    cache_examples=False,        # ZeroGPU 环境建议设为 False，避免启动时耗时计算
                 )
                 with gr.Row():
                     num_steps = gr.Slider(10, 100, 50, step=1, label="Steps")