Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on A100

App Files Files Community

ChuxiJ committed on Dec 27, 2025

Commit

b3f1425

1 Parent(s): 22b5dfd

fix: bot return metas

Browse files

Files changed (3) hide show

acestep/acestep_v15_pipeline.py +15 -4
acestep/api_server.py +45 -70
acestep/handler.py +59 -13

acestep/acestep_v15_pipeline.py CHANGED Viewed

@@ -9,10 +9,21 @@ import sys
 for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
     os.environ.pop(proxy_var, None)
-from .handler import AceStepHandler
-from .llm_inference import LLMHandler
-from .dataset_handler import DatasetHandler
-from .gradio_ui import create_gradio_interface
 def create_demo(init_params=None):

 for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
     os.environ.pop(proxy_var, None)
+try:
+    # When executed as a module: `python -m acestep.acestep_v15_pipeline`
+    from .handler import AceStepHandler
+    from .llm_inference import LLMHandler
+    from .dataset_handler import DatasetHandler
+    from .gradio_ui import create_gradio_interface
+except ImportError:
+    # When executed as a script: `python acestep/acestep_v15_pipeline.py`
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+    from acestep.handler import AceStepHandler
+    from acestep.llm_inference import LLMHandler
+    from acestep.dataset_handler import DatasetHandler
+    from acestep.gradio_ui import create_gradio_interface
 def create_demo(init_params=None):

acestep/api_server.py CHANGED Viewed

@@ -51,8 +51,9 @@ class GenerateMusicRequest(BaseModel):
     thinking: bool = False
     bpm: Optional[int] = None
-    key_scale: str = ""
-    time_signature: str = ""
     vocal_language: str = "en"
     inference_steps: int = 8
     guidance_scale: float = 7.0
@@ -61,7 +62,7 @@ class GenerateMusicRequest(BaseModel):
     reference_audio_path: Optional[str] = None
     src_audio_path: Optional[str] = None
-    audio_duration: Optional[float] = None
     batch_size: Optional[int] = None
     audio_code_string: str = ""
@@ -93,6 +94,10 @@ class GenerateMusicRequest(BaseModel):
     lm_repetition_penalty: float = 1.0
     lm_negative_prompt: str = "NO USER INPUT"
 _LM_DEFAULT_TEMPERATURE = 0.85
 _LM_DEFAULT_CFG_SCALE = 2.0
@@ -501,62 +506,6 @@ def create_app() -> FastAPI:
                     max_dur = float(os.getenv("ACESTEP_LYRICS_MAX_DURATION_SECONDS", "180"))
                     return float(min(max(est, min_dur), max_dur))
-                def _extract_lm_fields(meta: Dict[str, Any]) -> Dict[str, Any]:
-                    def _parse_first_float(v: Any) -> Optional[float]:
-                        if v is None:
-                            return None
-                        if isinstance(v, (int, float)):
-                            return float(v)
-                        s = str(v).strip()
-                        if not s or s.upper() == "N/A":
-                            return None
-                        try:
-                            return float(s)
-                        except Exception:
-                            pass
-                        m = re.search(r"[-+]?\d*\.?\d+", s)
-                        if not m:
-                            return None
-                        try:
-                            return float(m.group(0))
-                        except Exception:
-                            return None
-                    def _parse_first_int(v: Any) -> Optional[int]:
-                        fv = _parse_first_float(v)
-                        if fv is None:
-                            return None
-                        try:
-                            return int(round(fv))
-                        except Exception:
-                            return None
-                    def _none_if_na(v: Any) -> Any:
-                        if v is None:
-                            return None
-                        if isinstance(v, str) and v.strip() in {"", "N/A"}:
-                            return None
-                        return v
-                    out: Dict[str, Any] = {}
-                    bpm_raw = _none_if_na(meta.get("bpm"))
-                    out["bpm"] = _parse_first_int(bpm_raw)
-                    dur_raw = _none_if_na(meta.get("duration"))
-                    out["duration"] = _parse_first_float(dur_raw)
-                    genres_raw = _none_if_na(meta.get("genres"))
-                    out["genres"] = str(genres_raw) if genres_raw is not None else None
-                    keyscale_raw = _none_if_na(meta.get("keyscale", meta.get("key_scale")))
-                    out["keyscale"] = str(keyscale_raw) if keyscale_raw is not None else None
-                    ts_raw = _none_if_na(meta.get("timesignature", meta.get("time_signature")))
-                    out["timesignature"] = str(ts_raw) if ts_raw is not None else None
-                    return out
                 def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
                     """Ensure a stable `metas` dict (keys always present)."""
                     meta = meta or {}
@@ -587,7 +536,7 @@ def create_app() -> FastAPI:
                 # We keep backward compatibility: only auto-adjust when user didn't override (still at default 1.0).
                 audio_cover_strength_val = float(req.audio_cover_strength)
-                lm_fields: Dict[str, Any] = {}
                 # Determine effective batch size (used for per-sample LM code diversity)
                 effective_batch_size = req.batch_size
@@ -656,6 +605,7 @@ def create_app() -> FastAPI:
                             )
                         meta, codes, status = _lm_call()
                         if need_lm_codes:
                             if not codes:
@@ -668,12 +618,6 @@ def create_app() -> FastAPI:
                             else:
                                 audio_code_string = codes
-                        # Always expose LM metas when we invoked LM (even if user already set some fields).
-                        lm_fields = {
-                            "metas": _normalize_metas(meta),
-                            **_extract_lm_fields(meta),
-                        }
                         # Fill only missing fields (user-provided values win)
                         bpm_val, key_scale_val, time_sig_val, audio_duration_val = _maybe_fill_from_metadata(req, meta)
@@ -711,6 +655,25 @@ def create_app() -> FastAPI:
                         # thinking=True requires codes generation.
                         raise RuntimeError("thinking=true requires non-empty audio codes (LM generation failed).")
                 first, second, paths, gen_info, status_msg, seed_value, *_ = h.generate_music(
                     captions=req.caption,
                     lyrics=req.lyrics,
@@ -746,7 +709,12 @@ def create_app() -> FastAPI:
                     "generation_info": gen_info,
                     "status_message": status_msg,
                     "seed_value": seed_value,
-                    **lm_fields,
                 }
             t0 = time.time()
@@ -815,13 +783,20 @@ def create_app() -> FastAPI:
             if not callable(get):
                 raise HTTPException(status_code=400, detail="Invalid request payload")
             return GenerateMusicRequest(
                 caption=str(get("caption", "") or ""),
                 lyrics=str(get("lyrics", "") or ""),
                 thinking=_to_bool(get("thinking"), False),
                 bpm=_to_int(get("bpm"), None),
-                key_scale=str(get("key_scale", "") or ""),
-                time_signature=str(get("time_signature", "") or ""),
                 vocal_language=str(get("vocal_language", "en") or "en"),
                 inference_steps=_to_int(get("inference_steps"), 8) or 8,
                 guidance_scale=_to_float(get("guidance_scale"), 7.0) or 7.0,
@@ -829,7 +804,7 @@ def create_app() -> FastAPI:
                 seed=_to_int(get("seed"), -1) or -1,
                 reference_audio_path=reference_audio_path,
                 src_audio_path=src_audio_path,
-                audio_duration=_to_float(get("audio_duration"), None),
                 batch_size=_to_int(get("batch_size"), None),
                 audio_code_string=str(get("audio_code_string", "") or ""),
                 repainting_start=_to_float(get("repainting_start"), 0.0) or 0.0,

     thinking: bool = False
     bpm: Optional[int] = None
+    # Accept common client keys while keeping internal field names stable.
+    key_scale: str = Field(default="", alias="keyscale")
+    time_signature: str = Field(default="", alias="timesignature")
     vocal_language: str = "en"
     inference_steps: int = 8
     guidance_scale: float = 7.0
     reference_audio_path: Optional[str] = None
     src_audio_path: Optional[str] = None
+    audio_duration: Optional[float] = Field(default=None, alias="duration")
     batch_size: Optional[int] = None
     audio_code_string: str = ""
     lm_repetition_penalty: float = 1.0
     lm_negative_prompt: str = "NO USER INPUT"
+    class Config:
+        allow_population_by_field_name = True
+        allow_population_by_alias = True
 _LM_DEFAULT_TEMPERATURE = 0.85
 _LM_DEFAULT_CFG_SCALE = 2.0
                     max_dur = float(os.getenv("ACESTEP_LYRICS_MAX_DURATION_SECONDS", "180"))
                     return float(min(max(est, min_dur), max_dur))
                 def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
                     """Ensure a stable `metas` dict (keys always present)."""
                     meta = meta or {}
                 # We keep backward compatibility: only auto-adjust when user didn't override (still at default 1.0).
                 audio_cover_strength_val = float(req.audio_cover_strength)
+                lm_meta: Optional[Dict[str, Any]] = None
                 # Determine effective batch size (used for per-sample LM code diversity)
                 effective_batch_size = req.batch_size
                             )
                         meta, codes, status = _lm_call()
+                        lm_meta = meta
                         if need_lm_codes:
                             if not codes:
                             else:
                                 audio_code_string = codes
                         # Fill only missing fields (user-provided values win)
                         bpm_val, key_scale_val, time_sig_val, audio_duration_val = _maybe_fill_from_metadata(req, meta)
                         # thinking=True requires codes generation.
                         raise RuntimeError("thinking=true requires non-empty audio codes (LM generation failed).")
+                # Response metas MUST reflect the actual values used by DiT.
+                metas_out = _normalize_metas(lm_meta or {})
+                if bpm_val is not None and int(bpm_val) > 0:
+                    metas_out["bpm"] = int(bpm_val)
+                if audio_duration_val is not None and float(audio_duration_val) > 0:
+                    metas_out["duration"] = float(audio_duration_val)
+                if (key_scale_val or "").strip():
+                    metas_out["keyscale"] = str(key_scale_val)
+                if (time_sig_val or "").strip():
+                    metas_out["timesignature"] = str(time_sig_val)
+                def _none_if_na_str(v: Any) -> Optional[str]:  # Stringify v; None, "" and "N/A" map to None.
+                    if v is None:
+                        return None
+                    s = str(v).strip()  # non-string values are stringified before the placeholder check
+                    if s in {"", "N/A"}:  # exact, case-sensitive comparison after stripping
+                        return None
+                    return s  # the stripped string, not the original value
                 first, second, paths, gen_info, status_msg, seed_value, *_ = h.generate_music(
                     captions=req.caption,
                     lyrics=req.lyrics,
                     "generation_info": gen_info,
                     "status_message": status_msg,
                     "seed_value": seed_value,
+                    "metas": metas_out,
+                    "bpm": int(bpm_val) if bpm_val is not None else None,
+                    "duration": float(audio_duration_val) if audio_duration_val is not None else None,
+                    "genres": _none_if_na_str(metas_out.get("genres")),
+                    "keyscale": _none_if_na_str(metas_out.get("keyscale")),
+                    "timesignature": _none_if_na_str(metas_out.get("timesignature")),
                 }
             t0 = time.time()
             if not callable(get):
                 raise HTTPException(status_code=400, detail="Invalid request payload")
+            def _get_any(*keys: str, default: Any = None) -> Any:  # Return the first non-None value found under keys, tried in order.
+                for k in keys:
+                    v = get(k, None)  # `get` comes from the enclosing scope
+                    if v is not None:  # falsy values such as "" or 0 still count as present
+                        return v
+                return default  # no key yielded a non-None value
             return GenerateMusicRequest(
                 caption=str(get("caption", "") or ""),
                 lyrics=str(get("lyrics", "") or ""),
                 thinking=_to_bool(get("thinking"), False),
                 bpm=_to_int(get("bpm"), None),
+                key_scale=str(_get_any("key_scale", "keyscale", default="") or ""),
+                time_signature=str(_get_any("time_signature", "timesignature", default="") or ""),
                 vocal_language=str(get("vocal_language", "en") or "en"),
                 inference_steps=_to_int(get("inference_steps"), 8) or 8,
                 guidance_scale=_to_float(get("guidance_scale"), 7.0) or 7.0,
                 seed=_to_int(get("seed"), -1) or -1,
                 reference_audio_path=reference_audio_path,
                 src_audio_path=src_audio_path,
+                audio_duration=_to_float(_get_any("audio_duration", "duration"), None),
                 batch_size=_to_int(get("batch_size"), None),
                 audio_code_string=str(get("audio_code_string", "") or ""),
                 repainting_start=_to_float(get("repainting_start"), 0.0) or 0.0,

acestep/handler.py CHANGED Viewed

@@ -932,7 +932,14 @@ class AceStepHandler:
         is_repaint_task = (task_type == "repaint")
         is_lego_task = (task_type == "lego")
         is_cover_task = (task_type == "cover")
-        if audio_code_string and str(audio_code_string).strip():
             is_cover_task = True
         # Both repaint and lego tasks can use repainting parameters for chunk mask
         can_use_repainting = is_repaint_task or is_lego_task
@@ -1371,10 +1378,16 @@ class AceStepHandler:
                     # Pad or crop to match max_latent_length
                     if hints.shape[1] < max_latent_length:
                         pad_length = max_latent_length - hints.shape[1]
-                        hints = torch.cat([
-                            hints,
-                            self.silence_latent[0, :pad_length, :]
-                        ], dim=1)
                     elif hints.shape[1] > max_latent_length:
                         hints = hints[:, :max_latent_length, :]
                     precomputed_lm_hints_25Hz_list.append(hints[0])  # Remove batch dimension
@@ -1553,19 +1566,45 @@ class AceStepHandler:
     def infer_refer_latent(self, refer_audioss):
         refer_audio_order_mask = []
         refer_audio_latents = []
         for batch_idx, refer_audios in enumerate(refer_audioss):
             if len(refer_audios) == 1 and torch.all(refer_audios[0] == 0.0):
-                refer_audio_latent = self.silence_latent[:, :750, :]
                 refer_audio_latents.append(refer_audio_latent)
                 refer_audio_order_mask.append(batch_idx)
             else:
                 for refer_audio in refer_audios:
                     # Ensure input is in VAE's dtype
                     vae_input = refer_audio.unsqueeze(0).to(self.vae.dtype)
                     refer_audio_latent = self.vae.encode(vae_input).latent_dist.sample()
                     # Cast back to model dtype
                     refer_audio_latent = refer_audio_latent.to(self.dtype)
-                    refer_audio_latents.append(refer_audio_latent.transpose(1, 2))
                     refer_audio_order_mask.append(batch_idx)
         refer_audio_latents = torch.cat(refer_audio_latents, dim=0)
@@ -1949,7 +1988,7 @@ class AceStepHandler:
         audio_duration: Optional[float] = None,
         batch_size: Optional[int] = None,
         src_audio=None,
-        audio_code_string: str = "",
         repainting_start: float = 0.0,
         repainting_end: Optional[float] = None,
         instruction: str = "Fill the audio semantic mask based on the given conditions:",
@@ -1978,11 +2017,16 @@ class AceStepHandler:
         if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
             return None, None, [], "", "❌ Model not fully initialized. Please initialize all components first.", "-1", "", "", None, "", "", None
         # Auto-detect task type based on audio_code_string
         # If audio_code_string is provided and not empty, use cover task
         # Otherwise, use text2music task (or keep current task_type if not text2music)
         if task_type == "text2music":
-            if audio_code_string and str(audio_code_string).strip():
                 # User has provided audio codes, switch to cover task
                 task_type = "cover"
                 # Update instruction for cover task
@@ -2031,7 +2075,7 @@ class AceStepHandler:
             processed_src_audio = None
             if src_audio is not None:
                 # Check if audio codes are provided - if so, ignore src_audio
-                if audio_code_string and str(audio_code_string).strip():
                     logger.info("[generate_music] Audio codes provided, ignoring src_audio and using codes instead")
                 else:
                     logger.info("[generate_music] Processing source audio...")
@@ -2070,9 +2114,11 @@ class AceStepHandler:
             # Prepare audio_code_hints - use if audio_code_string is provided
             # This works for both text2music (auto-switched to cover) and cover tasks
             audio_code_hints_batch = None
-            if audio_code_string and str(audio_code_string).strip():
-                # Audio codes provided, use as hints (will trigger cover mode in inference service)
-                audio_code_hints_batch = [audio_code_string] * actual_batch_size
             should_return_intermediate = (task_type == "text2music")
             outputs = self.service_generate(

         is_repaint_task = (task_type == "repaint")
         is_lego_task = (task_type == "lego")
         is_cover_task = (task_type == "cover")
+        has_codes = False
+        if isinstance(audio_code_string, list):
+            has_codes = any((c or "").strip() for c in audio_code_string)
+        else:
+            has_codes = bool(audio_code_string and str(audio_code_string).strip())
+        if has_codes:
             is_cover_task = True
         # Both repaint and lego tasks can use repainting parameters for chunk mask
         can_use_repainting = is_repaint_task or is_lego_task
                     # Pad or crop to match max_latent_length
                     if hints.shape[1] < max_latent_length:
                         pad_length = max_latent_length - hints.shape[1]
+                        pad = self.silence_latent
+                        # Match dims: hints is usually [1, T, D], silence_latent is [1, T, D]
+                        if pad.dim() == 2:
+                            pad = pad.unsqueeze(0)
+                        if hints.dim() == 2:
+                            hints = hints.unsqueeze(0)
+                        pad_chunk = pad[:, :pad_length, :]
+                        if pad_chunk.device != hints.device or pad_chunk.dtype != hints.dtype:
+                            pad_chunk = pad_chunk.to(device=hints.device, dtype=hints.dtype)
+                        hints = torch.cat([hints, pad_chunk], dim=1)
                     elif hints.shape[1] > max_latent_length:
                         hints = hints[:, :max_latent_length, :]
                     precomputed_lm_hints_25Hz_list.append(hints[0])  # Remove batch dimension
     def infer_refer_latent(self, refer_audioss):
         refer_audio_order_mask = []
         refer_audio_latents = []
+        def _normalize_audio_2d(a: torch.Tensor) -> torch.Tensor:
+            """Normalize audio tensor to [2, T] on current device."""  # NOTE(review): no device transfer happens in this helper; the tensor stays on its own device
+            if not isinstance(a, torch.Tensor):
+                raise TypeError(f"refer_audio must be a torch.Tensor, got {type(a)!r}")
+            # Accept [T], [1, T], [2, T], [1, 2, T]
+            if a.dim() == 3 and a.shape[0] == 1:  # drop a leading dim of size 1
+                a = a.squeeze(0)
+            if a.dim() == 1:  # promote a 1D tensor to a single row
+                a = a.unsqueeze(0)
+            if a.dim() != 2:  # anything still not 2D is rejected
+                raise ValueError(f"refer_audio must be 1D/2D/3D(1,2,T); got shape={tuple(a.shape)}")
+            if a.shape[0] == 1:  # duplicate a single row so there are two
+                a = torch.cat([a, a], dim=0)
+            a = a[:2]  # keep only the first two rows when more are present
+            return a
+        def _ensure_latent_3d(z: torch.Tensor) -> torch.Tensor:
+            """Ensure latent is [N, T, D] (3D) for packing."""
+            if z.dim() == 4 and z.shape[0] == 1:  # drop a leading size-1 dim from a 4D tensor
+                z = z.squeeze(0)
+            if z.dim() == 2:  # add a leading dim to a 2D tensor
+                z = z.unsqueeze(0)
+            return z  # any other shape is returned unchanged (no error is raised)
         for batch_idx, refer_audios in enumerate(refer_audioss):
             if len(refer_audios) == 1 and torch.all(refer_audios[0] == 0.0):
+                refer_audio_latent = _ensure_latent_3d(self.silence_latent[:, :750, :])
                 refer_audio_latents.append(refer_audio_latent)
                 refer_audio_order_mask.append(batch_idx)
             else:
                 for refer_audio in refer_audios:
+                    refer_audio = _normalize_audio_2d(refer_audio)
                     # Ensure input is in VAE's dtype
                     vae_input = refer_audio.unsqueeze(0).to(self.vae.dtype)
                     refer_audio_latent = self.vae.encode(vae_input).latent_dist.sample()
                     # Cast back to model dtype
                     refer_audio_latent = refer_audio_latent.to(self.dtype)
+                    refer_audio_latents.append(_ensure_latent_3d(refer_audio_latent.transpose(1, 2)))
                     refer_audio_order_mask.append(batch_idx)
         refer_audio_latents = torch.cat(refer_audio_latents, dim=0)
         audio_duration: Optional[float] = None,
         batch_size: Optional[int] = None,
         src_audio=None,
+        audio_code_string: Union[str, List[str]] = "",
         repainting_start: float = 0.0,
         repainting_end: Optional[float] = None,
         instruction: str = "Fill the audio semantic mask based on the given conditions:",
         if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
             return None, None, [], "", "❌ Model not fully initialized. Please initialize all components first.", "-1", "", "", None, "", "", None
+        def _has_audio_codes(v: Union[str, List[str]]) -> bool:  # True when v holds at least one non-blank code string.
+            if isinstance(v, list):
+                return any((x or "").strip() for x in v)  # None/empty entries count as blank
+            return bool(v and str(v).strip())  # non-list values are stringified before the blank check
         # Auto-detect task type based on audio_code_string
         # If audio_code_string is provided and not empty, use cover task
         # Otherwise, use text2music task (or keep current task_type if not text2music)
         if task_type == "text2music":
+            if _has_audio_codes(audio_code_string):
                 # User has provided audio codes, switch to cover task
                 task_type = "cover"
                 # Update instruction for cover task
             processed_src_audio = None
             if src_audio is not None:
                 # Check if audio codes are provided - if so, ignore src_audio
+                if _has_audio_codes(audio_code_string):
                     logger.info("[generate_music] Audio codes provided, ignoring src_audio and using codes instead")
                 else:
                     logger.info("[generate_music] Processing source audio...")
             # Prepare audio_code_hints - use if audio_code_string is provided
             # This works for both text2music (auto-switched to cover) and cover tasks
             audio_code_hints_batch = None
+            if _has_audio_codes(audio_code_string):
+                if isinstance(audio_code_string, list):
+                    audio_code_hints_batch = audio_code_string
+                else:
+                    audio_code_hints_batch = [audio_code_string] * actual_batch_size
             should_return_intermediate = (task_type == "text2music")
             outputs = self.service_generate(