LongCat-AudioDiT-3.5B

Running on Zero

App Files Files Community

hysts HF Staff committed on 27 days ago

Commit

f0a5bff

1 Parent(s): cf81ad3

Add files

Browse files

Files changed (8) hide show

.gitmodules +3 -0
.python-version +1 -0
README.md +2 -1
app.py +282 -0
pyproject.toml +63 -0
requirements.txt +366 -0
uv.lock +0 -0
vendor/LongCat-AudioDiT +1 -0

.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "vendor/LongCat-AudioDiT"]
+	path = vendor/LongCat-AudioDiT
+	url = https://github.com/meituan-longcat/LongCat-AudioDiT.git

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
 title: LongCat AudioDiT 3.5B
-emoji: 👀
 colorFrom: purple
 colorTo: pink
 sdk: gradio
 sdk_version: 6.10.0
 app_file: app.py
 pinned: false
 ---

 ---
 title: LongCat AudioDiT 3.5B
+emoji: 🐱
 colorFrom: purple
 colorTo: pink
 sdk: gradio
 sdk_version: 6.10.0
+python_version: "3.12"
 app_file: app.py
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import re
+import sys
+from pathlib import Path
+import gradio as gr
+import librosa
+import numpy as np
+import spaces
+import torch
+# Register audiodit model type with transformers
+sys.path.insert(0, str(Path(__file__).resolve().parent / "vendor" / "LongCat-AudioDiT"))
+import audiodit  # noqa: F401
+from audiodit import AudioDiTModel
+from transformers import AutoTokenizer
+# ---------------------------------------------------------------------------
+# Text utilities (from upstream utils.py)
+# ---------------------------------------------------------------------------
+# Largest seed value: used as the maximum of the "Seed" slider and as the
+# upper bound handed to the random generator in get_seed().
+MAX_SEED = 2**32 - 1
+# Per-character weights used by approx_duration_from_text(); callers treat the
+# resulting sum as seconds of speech. EN applies to alphabetic characters,
+# ZH to characters in the U+4E00-U+9FFF range.
+EN_DUR_PER_CHAR = 0.082
+ZH_DUR_PER_CHAR = 0.21
+def normalize_text(text: str) -> str:
+    """Lower-case ``text``, blank out curly quotes and collapse whitespace.
+
+    Args:
+        text: Raw user input.
+
+    Returns:
+        The cleaned text with single spaces between words and no leading or
+        trailing whitespace; may be empty.
+    """
+    cleaned = text.lower()
+    # Applied in order: curly quotes become spaces first so that the second
+    # pass can merge them with any neighbouring whitespace.
+    for pattern, replacement in (
+        (r"[\u201c\u201d\u201e\u2018\u2019]", " "),
+        (r"\s+", " "),
+    ):
+        cleaned = re.sub(pattern, replacement, cleaned)
+    return cleaned.strip()
+def approx_duration_from_text(text: str, max_duration: float = 30.0) -> float:
+    """Estimate the spoken duration of ``text`` from its character counts.
+
+    Whitespace is ignored. Characters in U+4E00-U+9FFF are weighted with
+    ``ZH_DUR_PER_CHAR`` and other alphabetic characters with
+    ``EN_DUR_PER_CHAR``. Everything else (digits, punctuation, ...) is added
+    to whichever of the two groups is larger, with ties going to the
+    alphabetic group.
+
+    Args:
+        text: Text whose duration is estimated.
+        max_duration: Upper bound on the returned value.
+
+    Returns:
+        The weighted character count, capped at ``max_duration``.
+    """
+    compact = re.sub(r"\s+", "", text)
+    zh_count = sum(1 for ch in compact if "\u4e00" <= ch <= "\u9fff")
+    # CJK ideographs are alphabetic too, so exclude them explicitly here.
+    en_count = sum(1 for ch in compact if ch.isalpha() and not ("\u4e00" <= ch <= "\u9fff"))
+    other_count = len(compact) - zh_count - en_count
+    if zh_count > en_count:
+        zh_count += other_count
+    else:
+        en_count += other_count
+    estimate = zh_count * ZH_DUR_PER_CHAR + en_count * EN_DUR_PER_CHAR
+    return min(max_duration, estimate)
+# ---------------------------------------------------------------------------
+# Model loading
+# ---------------------------------------------------------------------------
+# Checkpoint identifier passed to from_pretrained().
+MODEL_ID = "meituan-longcat/LongCat-AudioDiT-3.5B"
+# The model is loaded once, at import time, and moved to the CUDA device.
+model = AudioDiTModel.from_pretrained(MODEL_ID).to("cuda")
+# to_half() comes from the vendored model code; presumably it casts the VAE
+# to float16 - TODO confirm against vendor/LongCat-AudioDiT.
+model.vae.to_half()
+model.eval()
+# The tokenizer is whichever one the model config names for its text encoder.
+tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder_model)
+# ---------------------------------------------------------------------------
+# Inference
+# ---------------------------------------------------------------------------
+def get_seed(randomize_seed: bool, seed: int) -> int:
+    """Return the seed to use for the next generation.
+
+    Args:
+        randomize_seed: When true, ``seed`` is ignored and a fresh value is drawn.
+        seed: Seed currently selected in the UI.
+
+    Returns:
+        A random integer in ``[0, MAX_SEED]`` when ``randomize_seed`` is set,
+        otherwise ``seed`` as a plain ``int``.
+    """
+    if not randomize_seed:
+        # Normalise to a built-in int so the declared return type always holds.
+        return int(seed)
+    # endpoint=True makes the upper bound inclusive; without it MAX_SEED
+    # (the slider's maximum) could never be drawn.
+    return int(np.random.default_rng().integers(0, MAX_SEED, endpoint=True))
+@spaces.GPU
+def generate_tts(
+    text: str,
+    guidance_method: str,
+    nfe: int,
+    guidance_strength: float,
+    seed: int,
+) -> tuple[int, np.ndarray]:
+    """Synthesize speech for ``text``.
+
+    Args:
+        text: Text to synthesize; it is normalized with ``normalize_text`` first.
+        guidance_method: Forwarded to the model as ``guidance_method``.
+        nfe: Forwarded to the model as ``steps``.
+        guidance_strength: Forwarded to the model as ``cfg_strength``.
+        seed: Seed applied to the torch CPU and CUDA generators.
+
+    Returns:
+        ``(sampling_rate, waveform)`` in the form accepted by ``gr.Audio``.
+
+    Raises:
+        gr.Error: If ``text`` is empty after normalization.
+    """
+    text = normalize_text(text)
+    if not text:
+        raise gr.Error("Text is empty (or contains only whitespace/quotes).")
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    sr = model.config.sampling_rate
+    full_hop = model.config.latent_hop
+    max_duration = model.config.max_wav_duration
+    inputs = tokenizer([text], padding="longest", return_tensors="pt")
+    dur_sec = approx_duration_from_text(text, max_duration=max_duration)
+    # Seconds -> latent frames. A very short text can floor to zero frames,
+    # so always request at least one.
+    duration = max(1, int(dur_sec * sr // full_hop))
+    # Inference only: make sure no autograd state is kept across sampling steps.
+    with torch.no_grad():
+        output = model(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            duration=duration,
+            steps=nfe,
+            cfg_strength=guidance_strength,
+            guidance_method=guidance_method,
+        )
+    wav = output.waveform.squeeze().detach().cpu().numpy()
+    return (sr, wav)
+@spaces.GPU
+def generate_voice_clone(
+    text: str,
+    prompt_text: str,
+    prompt_audio: tuple[int, np.ndarray] | str | None,
+    guidance_method: str,
+    nfe: int,
+    guidance_strength: float,
+    seed: int,
+) -> tuple[int, np.ndarray]:
+    """Synthesize ``text`` conditioned on a reference recording.
+
+    Args:
+        text: Text to synthesize; normalized with ``normalize_text``.
+        prompt_text: Transcription of ``prompt_audio``; normalized the same way.
+        prompt_audio: ``(sample_rate, samples)`` pair as delivered by
+            ``gr.Audio(type="numpy")``, or ``None`` when nothing was provided.
+        guidance_method: Forwarded to the model as ``guidance_method``.
+        nfe: Forwarded to the model as ``steps``.
+        guidance_strength: Forwarded to the model as ``cfg_strength``.
+        seed: Seed applied to the torch CPU and CUDA generators.
+
+    Returns:
+        ``(sampling_rate, waveform)`` in the form accepted by ``gr.Audio``.
+
+    Raises:
+        gr.Error: If the prompt audio is missing, empty or too long, or if
+            either text is empty after normalization.
+    """
+    if prompt_audio is None:
+        raise gr.Error("Prompt audio is required.")
+    # Validate the cheap text inputs before any audio processing or GPU work.
+    text = normalize_text(text)
+    if not text:
+        raise gr.Error("Text is empty (or contains only whitespace/quotes).")
+    prompt_text = normalize_text(prompt_text)
+    if not prompt_text:
+        raise gr.Error("Prompt text is empty (or contains only whitespace/quotes).")
+    sr = model.config.sampling_rate
+    full_hop = model.config.latent_hop
+    max_duration = model.config.max_wav_duration
+    # Load prompt audio - gr.Audio returns (sample_rate, ndarray)
+    input_sr, audio_np = prompt_audio
+    if audio_np.size == 0:
+        raise gr.Error("Prompt audio is empty.")
+    # A prompt that fills the whole budget leaves no room for new speech and
+    # would otherwise produce a negative duration further down.
+    if audio_np.shape[0] / input_sr >= max_duration:
+        raise gr.Error(f"Prompt audio is too long; it must be shorter than {max_duration} seconds.")
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    if audio_np.ndim > 1:
+        # Down-mix multi-channel input by averaging over the last axis.
+        audio_np = audio_np.mean(axis=-1)
+    audio_np = audio_np.astype(np.float32)
+    # Samples outside [-1, 1] (e.g. integer PCM) are rescaled by their peak.
+    peak = np.abs(audio_np).max()
+    if peak > 1.0:
+        audio_np = audio_np / peak
+    if input_sr != sr:
+        audio_np = librosa.resample(audio_np, orig_sr=input_sr, target_sr=sr)
+    prompt_wav = torch.from_numpy(audio_np).unsqueeze(0).unsqueeze(0)  # (1, 1, T)
+    full_text = f"{prompt_text} {text}"
+    inputs = tokenizer([full_text], padding="longest", return_tensors="pt")
+    # Inference only: make sure no autograd state is kept across sampling steps.
+    with torch.no_grad():
+        # encode_prompt_audio handles VAE padding/encoding/trimming internally
+        _, prompt_dur = model.encode_prompt_audio(prompt_wav)
+        # Duration estimation
+        prompt_time = prompt_dur * full_hop / sr
+        remaining = max(0.0, float(max_duration - prompt_time))
+        dur_sec = approx_duration_from_text(text, max_duration=remaining)
+        approx_pd = approx_duration_from_text(prompt_text, max_duration=max_duration)
+        # Stretch the estimate when the prompt was spoken slower than the
+        # per-character estimate predicts; never shrink it, and cap at 1.5x.
+        ratio = np.clip(prompt_time / approx_pd, 1.0, 1.5)
+        # Always request at least one latent frame of new speech.
+        new_frames = max(1, int(dur_sec * ratio * sr // full_hop))
+        duration = min(new_frames + prompt_dur, int(max_duration * sr // full_hop))
+        output = model(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            prompt_audio=prompt_wav,
+            duration=duration,
+            steps=nfe,
+            cfg_strength=guidance_strength,
+            guidance_method=guidance_method,
+        )
+    wav = output.waveform.squeeze().detach().cpu().numpy()
+    return (sr, wav)
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
+# Two-tab demo (plain TTS and voice cloning); the "Advanced Settings"
+# controls below the tabs are shared by both generate buttons.
+with gr.Blocks() as demo:
+    gr.Markdown("# LongCat-AudioDiT")
+    gr.Markdown(
+        "Diffusion-based text-to-speech with zero-shot voice cloning. "
+        "Based on [meituan-longcat/LongCat-AudioDiT](https://github.com/meituan-longcat/LongCat-AudioDiT)."
+    )
+    with gr.Tabs():
+        with gr.Tab("TTS"):
+            with gr.Row():
+                with gr.Column():
+                    tts_text = gr.Textbox(
+                        label="Text",
+                        lines=5,
+                        placeholder="Enter text to synthesize...",
+                    )
+                    tts_btn = gr.Button("Generate")
+                with gr.Column():
+                    tts_output = gr.Audio(label="Output")
+            # Clickable sample inputs that fill the TTS text box.
+            gr.Examples(
+                examples=[
+                    [
+                        "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells."
+                    ],
+                    ["今天晴暖转阴雨，空气质量优至良，空气相对湿度较低。"],  # noqa: RUF001 — Chinese punctuation
+                ],
+                inputs=tts_text,
+            )
+        with gr.Tab("Voice Cloning"):
+            with gr.Row():
+                with gr.Column():
+                    # type="numpy" hands the callback a (sample_rate, ndarray) pair.
+                    vc_prompt_audio = gr.Audio(label="Prompt Audio", type="numpy")
+                    vc_prompt_text = gr.Textbox(
+                        label="Prompt Text",
+                        lines=2,
+                        placeholder="Transcription of the prompt audio...",
+                    )
+                    vc_text = gr.Textbox(
+                        label="Text to Synthesize",
+                        lines=3,
+                        placeholder="Enter text to synthesize in the cloned voice...",
+                    )
+                    vc_btn = gr.Button("Generate")
+                with gr.Column():
+                    vc_output = gr.Audio(label="Output")
+    with gr.Accordion("Advanced Settings", open=False):
+        guidance_method = gr.Radio(
+            label="Guidance",
+            choices=["cfg", "apg"],
+            value="cfg",
+        )
+        nfe = gr.Slider(label="NFE Steps", minimum=1, maximum=64, step=1, value=16)
+        guidance_strength = gr.Slider(
+            label="Guidance Strength",
+            minimum=0.0,
+            maximum=10.0,
+            step=0.1,
+            value=4.0,
+        )
+        seed = gr.Slider(
+            label="Seed",
+            minimum=0,
+            maximum=MAX_SEED,
+            step=1,
+            value=1024,
+        )
+        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+    # Each button first resolves the seed (writing it back to the slider so the
+    # value actually used stays visible), then runs generation with it.
+    tts_btn.click(
+        fn=get_seed,
+        inputs=[randomize_seed, seed],
+        outputs=seed,
+        queue=False,
+    ).then(
+        fn=generate_tts,
+        inputs=[tts_text, guidance_method, nfe, guidance_strength, seed],
+        outputs=tts_output,
+    )
+    vc_btn.click(
+        fn=get_seed,
+        inputs=[randomize_seed, seed],
+        outputs=seed,
+        queue=False,
+    ).then(
+        fn=generate_voice_clone,
+        # Order must match the positional parameters of generate_voice_clone.
+        inputs=[
+            vc_text,
+            vc_prompt_text,
+            vc_prompt_audio,
+            guidance_method,
+            nfe,
+            guidance_strength,
+            seed,
+        ],
+        outputs=vc_output,
+    )
+# Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,63 @@

+[project]
+name = "longcat-audiodit-3-5b"
+version = "0.1.0"
+description = "Gradio demo for LongCat-AudioDiT TTS"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "einops>=0.8.2",
+    "gradio>=6.10.0",
+    "librosa>=0.11.0",
+    "numpy>=2.4.4",
+    "safetensors>=0.7.0",
+    "soundfile>=0.13.1",
+    "spaces>=0.48.1",
+    "torch==2.9.1",
+    "torchaudio>=2.11.0",
+    "transformers>=5.4.0",
+]
+[tool.ruff]
+line-length = 119
+extend-exclude = ["vendor"]
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    "COM812", # missing-trailing-comma
+    "D203",   # one-blank-line-before-class
+    "D213",   # multi-line-summary-second-line
+    "E501",   # line-too-long
+    "SIM117", # multiple-with-statements
+    #
+    "D100",    # undocumented-public-module
+    "D101",    # undocumented-public-class
+    "D102",    # undocumented-public-method
+    "D103",    # undocumented-public-function
+    "D104",    # undocumented-public-package
+    "D105",    # undocumented-magic-method
+    "D107",    # undocumented-public-init
+    "EM101",   # raw-string-in-exception
+    "FBT001",  # boolean-type-hint-positional-argument
+    "FBT002",  # boolean-default-value-positional-argument
+    "ISC001",  # single-line-implicit-string-concatenation
+    "PGH003",  # blanket-type-ignore
+    "PLR0913", # too-many-arguments
+    "PLR0915", # too-many-statements
+    "TRY003",  # raise-vanilla-args
+]
+unfixable = [
+    "F401", # unused-import
+]
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+[tool.ruff.format]
+docstring-code-format = true
+[dependency-groups]
+dev = [
+    "ruff>=0.15.8",
+]
+hf-spaces = ["datasets"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,366 @@

+# This file was autogenerated by uv via the following command:
+#    uv export --no-hashes --no-dev --group hf-spaces --no-emit-package typer-slim --no-emit-package spaces -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.13.4
+    # via fsspec
+aiosignal==1.4.0
+    # via aiohttp
+annotated-doc==0.0.4
+    # via
+    #   fastapi
+    #   typer
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.13.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+attrs==26.1.0
+    # via aiohttp
+audioop-lts==0.2.2 ; python_full_version >= '3.13'
+    # via
+    #   gradio
+    #   standard-aifc
+    #   standard-sunau
+audioread==3.1.0
+    # via librosa
+brotli==1.2.0
+    # via gradio
+certifi==2026.2.25
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==2.0.0
+    # via soundfile
+charset-normalizer==3.4.6
+    # via requests
+click==8.3.1
+    # via
+    #   typer
+    #   uvicorn
+colorama==0.4.6 ; sys_platform == 'win32'
+    # via
+    #   click
+    #   tqdm
+datasets==4.8.4
+decorator==5.2.1
+    # via librosa
+dill==0.4.1
+    # via
+    #   datasets
+    #   multiprocess
+einops==0.8.2
+    # via longcat-audiodit-3-5b
+fastapi==0.135.2
+    # via gradio
+ffmpy==1.0.0
+    # via gradio
+filelock==3.25.2
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+frozenlist==1.8.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2026.2.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==6.10.0
+    # via
+    #   longcat-audiodit-3-5b
+    #   spaces
+gradio-client==2.4.0
+    # via
+    #   gradio
+    #   hf-gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-gradio==0.3.0
+    # via gradio
+hf-xet==1.4.2 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   safehttpx
+    #   spaces
+huggingface-hub==1.8.0
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+joblib==1.5.3
+    # via
+    #   librosa
+    #   scikit-learn
+lazy-loader==0.5
+    # via librosa
+librosa==0.11.0
+    # via longcat-audiodit-3-5b
+llvmlite==0.46.0
+    # via numba
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.2
+    # via librosa
+multidict==6.7.1
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.19
+    # via datasets
+networkx==3.6.1
+    # via torch
+numba==0.64.0
+    # via librosa
+numpy==2.4.4
+    # via
+    #   datasets
+    #   gradio
+    #   librosa
+    #   longcat-audiodit-3-5b
+    #   numba
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+    #   transformers
+nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+orjson==3.11.7
+    # via gradio
+packaging==26.0
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   lazy-loader
+    #   pooch
+    #   spaces
+    #   transformers
+pandas==3.0.2
+    # via
+    #   datasets
+    #   gradio
+pillow==12.1.1
+    # via gradio
+platformdirs==4.9.4
+    # via pooch
+pooch==1.9.0
+    # via librosa
+propcache==0.4.1
+    # via
+    #   aiohttp
+    #   yarl
+psutil==5.9.8
+    # via spaces
+pyarrow==23.0.1
+    # via datasets
+pycparser==3.0 ; implementation_name != 'PyPy'
+    # via cffi
+pydantic==2.12.5
+    # via
+    #   fastapi
+    #   gradio
+    #   spaces
+pydantic-core==2.41.5
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.20.0
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-multipart==0.0.22
+    # via gradio
+pytz==2026.1.post1
+    # via gradio
+pyyaml==6.0.3
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+regex==2026.3.32
+    # via transformers
+requests==2.33.1
+    # via
+    #   datasets
+    #   pooch
+    #   spaces
+rich==14.3.3
+    # via typer
+safehttpx==0.1.7
+    # via gradio
+safetensors==0.7.0
+    # via
+    #   longcat-audiodit-3-5b
+    #   transformers
+scikit-learn==1.8.0
+    # via librosa
+scipy==1.17.1
+    # via
+    #   librosa
+    #   scikit-learn
+semantic-version==2.10.0
+    # via gradio
+setuptools==82.0.1
+    # via torch
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+soundfile==0.13.1
+    # via
+    #   librosa
+    #   longcat-audiodit-3-5b
+soxr==1.0.0
+    # via librosa
+standard-aifc==3.13.0 ; python_full_version >= '3.13'
+    # via
+    #   audioread
+    #   librosa
+standard-chunk==3.13.0 ; python_full_version >= '3.13'
+    # via standard-aifc
+standard-sunau==3.13.0 ; python_full_version >= '3.13'
+    # via
+    #   audioread
+    #   librosa
+starlette==0.52.1
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.14.0
+    # via torch
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.22.2
+    # via transformers
+tomlkit==0.13.3
+    # via gradio
+torch==2.9.1
+    # via longcat-audiodit-3-5b
+torchaudio==2.11.0
+    # via longcat-audiodit-3-5b
+tqdm==4.67.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   transformers
+transformers==5.4.0
+    # via longcat-audiodit-3-5b
+triton==3.5.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+typer==0.24.1
+    # via
+    #   gradio
+    #   hf-gradio
+    #   huggingface-hub
+    #   transformers
+typing-extensions==4.15.0
+    # via
+    #   aiosignal
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   librosa
+    #   pydantic
+    #   pydantic-core
+    #   spaces
+    #   starlette
+    #   torch
+    #   typing-inspection
+typing-inspection==0.4.2
+    # via
+    #   fastapi
+    #   pydantic
+tzdata==2025.3 ; sys_platform == 'emscripten' or sys_platform == 'win32'
+    # via pandas
+urllib3==2.6.3
+    # via requests
+uvicorn==0.42.0
+    # via gradio
+xxhash==3.6.0
+    # via datasets
+yarl==1.23.0
+    # via aiohttp

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

vendor/LongCat-AudioDiT ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit eec76e3b0fe5fd9ed6a1f0b990f97bc33cda21ae