hysts HF Staff committed on
Commit
f0a5bff
·
1 Parent(s): cf81ad3

Add files

Browse files
Files changed (8) hide show
  1. .gitmodules +3 -0
  2. .python-version +1 -0
  3. README.md +2 -1
  4. app.py +282 -0
  5. pyproject.toml +63 -0
  6. requirements.txt +366 -0
  7. uv.lock +0 -0
  8. vendor/LongCat-AudioDiT +1 -0
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "vendor/LongCat-AudioDiT"]
2
+ path = vendor/LongCat-AudioDiT
3
+ url = https://github.com/meituan-longcat/LongCat-AudioDiT.git
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
  title: LongCat AudioDiT 3.5B
3
- emoji: 👀
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.10.0
 
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
  title: LongCat AudioDiT 3.5B
3
+ emoji: 🐱
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.10.0
8
+ python_version: "3.12"
9
  app_file: app.py
10
  pinned: false
11
  ---
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+ import librosa
7
+ import numpy as np
8
+ import spaces
9
+ import torch
10
+
11
+ # Register audiodit model type with transformers
12
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "vendor" / "LongCat-AudioDiT"))
13
+ import audiodit # noqa: F401
14
+ from audiodit import AudioDiTModel
15
+ from transformers import AutoTokenizer
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Text utilities (from upstream utils.py)
19
+ # ---------------------------------------------------------------------------
20
+
21
MAX_SEED = 2**32 - 1  # largest 32-bit unsigned value; upper bound for RNG seeds

# Heuristic per-character speech durations (seconds) used to estimate how long
# the synthesized audio should be from the input text alone.
EN_DUR_PER_CHAR = 0.082  # per English (alphabetic) character
ZH_DUR_PER_CHAR = 0.21  # per Chinese (CJK) character
25
+
26
+
27
def normalize_text(text: str) -> str:
    """Lowercase *text*, drop typographic quotes, and collapse whitespace runs."""
    lowered = text.lower()
    # Curly double/single quotes become spaces so they disappear when the
    # whitespace runs are collapsed below.
    without_quotes = re.sub(r"[\u201c\u201d\u201e\u2018\u2019]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_quotes)
    return collapsed.strip()
32
+
33
+
34
def approx_duration_from_text(text: str, max_duration: float = 30.0) -> float:
    """Estimate speech duration in seconds for *text*, capped at *max_duration*.

    CJK characters and (non-CJK) alphabetic characters each get a per-character
    duration; all remaining characters are lumped with whichever script
    dominates the text.
    """
    compact = re.sub(r"\s+", "", text)
    num_zh = sum(1 for ch in compact if "\u4e00" <= ch <= "\u9fff")
    num_en = sum(
        1 for ch in compact if not ("\u4e00" <= ch <= "\u9fff") and ch.isalpha()
    )
    num_other = len(compact) - num_zh - num_en
    # Attribute punctuation/digits to the dominant script's rate.
    if num_zh > num_en:
        num_zh += num_other
    else:
        num_en += num_other
    estimate = num_zh * ZH_DUR_PER_CHAR + num_en * EN_DUR_PER_CHAR
    return min(max_duration, estimate)
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Model loading
53
+ # ---------------------------------------------------------------------------
54
+
55
+ MODEL_ID = "meituan-longcat/LongCat-AudioDiT-3.5B"
56
+
57
+ model = AudioDiTModel.from_pretrained(MODEL_ID).to("cuda")
58
+ model.vae.to_half()
59
+ model.eval()
60
+ tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder_model)
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Inference
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
def get_seed(randomize_seed: bool, seed: int) -> int:
    """Return a fresh seed in [0, MAX_SEED) when *randomize_seed* is set, else *seed*."""
    if not randomize_seed:
        return seed
    return int(np.random.default_rng().integers(0, MAX_SEED))
71
+
72
+
73
@spaces.GPU
def generate_tts(
    text: str,
    guidance_method: str,
    nfe: int,
    guidance_strength: float,
    seed: int,
) -> tuple[int, np.ndarray]:
    """Synthesize speech for *text* and return ``(sample_rate, waveform)``.

    Raises:
        gr.Error: If the text normalizes to an empty string.
    """
    text = normalize_text(text)
    if not text:
        raise gr.Error("Text is empty (or contains only whitespace/quotes).")

    # Seed both the CPU and CUDA RNGs so sampling is reproducible.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    cfg = model.config
    sr = cfg.sampling_rate
    full_hop = cfg.latent_hop

    inputs = tokenizer([text], padding="longest", return_tensors="pt")

    # Turn the rough text-length estimate (seconds) into latent frames.
    est_seconds = approx_duration_from_text(text, max_duration=cfg.max_wav_duration)
    latent_frames = int(est_seconds * sr // full_hop)

    output = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        duration=latent_frames,
        steps=nfe,
        cfg_strength=guidance_strength,
        guidance_method=guidance_method,
    )

    waveform = output.waveform.squeeze().detach().cpu().numpy()
    return (sr, waveform)
108
+
109
+
110
@spaces.GPU
def generate_voice_clone(
    text: str,
    prompt_text: str,
    prompt_audio: tuple[int, np.ndarray] | str | None,
    guidance_method: str,
    nfe: int,
    guidance_strength: float,
    seed: int,
) -> tuple[int, np.ndarray]:
    """Clone the voice in *prompt_audio* and speak *text* with it.

    Args:
        text: Target text to synthesize.
        prompt_text: Transcription of the prompt audio.
        prompt_audio: Either ``(sample_rate, samples)`` as returned by
            ``gr.Audio(type="numpy")``, or a filesystem path to an audio file.
        guidance_method: Guidance scheme passed to the model ("cfg" or "apg").
        nfe: Number of function evaluations (diffusion steps).
        guidance_strength: Classifier-free guidance strength.
        seed: RNG seed for reproducible sampling.

    Returns:
        ``(sample_rate, waveform)`` suitable for ``gr.Audio``.

    Raises:
        gr.Error: If the prompt audio is missing, or either text is empty.
    """
    if prompt_audio is None:
        raise gr.Error("Prompt audio is required.")

    # Validate text up front so we fail fast before any GPU-side work.
    text = normalize_text(text)
    if not text:
        raise gr.Error("Text is empty (or contains only whitespace/quotes).")
    prompt_text = normalize_text(prompt_text)
    if not prompt_text:
        raise gr.Error("Prompt text is empty (or contains only whitespace/quotes).")

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    sr = model.config.sampling_rate
    full_hop = model.config.latent_hop
    max_duration = model.config.max_wav_duration

    # Load prompt audio. gr.Audio(type="numpy") yields (sample_rate, ndarray),
    # but the declared type also allows a file path — the original code crashed
    # on the path case, so handle both here.
    if isinstance(prompt_audio, str):
        audio_np, input_sr = librosa.load(prompt_audio, sr=None, mono=True)
    else:
        input_sr, audio_np = prompt_audio
        if audio_np.ndim > 1:
            audio_np = audio_np.mean(axis=-1)  # downmix to mono
        audio_np = audio_np.astype(np.float32)
    # Peak-normalize integer-range audio into [-1, 1]; guard the empty case.
    peak = float(np.abs(audio_np).max()) if audio_np.size else 0.0
    if peak > 1.0:
        audio_np = audio_np / peak
    if input_sr != sr:
        audio_np = librosa.resample(audio_np, orig_sr=input_sr, target_sr=sr)

    prompt_wav = torch.from_numpy(audio_np).unsqueeze(0).unsqueeze(0)  # (1, 1, T)

    # encode_prompt_audio handles VAE padding/encoding/trimming internally.
    _, prompt_dur = model.encode_prompt_audio(prompt_wav)

    full_text = f"{prompt_text} {text}"
    inputs = tokenizer([full_text], padding="longest", return_tensors="pt")

    # Duration estimation: scale the text-length estimate by how much slower
    # the speaker talks than the heuristic predicts (clipped to 1.0-1.5x).
    prompt_time = prompt_dur * full_hop / sr
    # Guard: a prompt longer than the model cap would make the budget negative.
    remaining = max(0.0, max_duration - prompt_time)
    dur_sec = approx_duration_from_text(text, max_duration=remaining)
    approx_pd = approx_duration_from_text(prompt_text, max_duration=max_duration)
    ratio = float(np.clip(prompt_time / approx_pd, 1.0, 1.5)) if approx_pd > 0 else 1.0
    dur_sec = dur_sec * ratio
    duration = int(dur_sec * sr // full_hop)
    duration = min(duration + prompt_dur, int(max_duration * sr // full_hop))

    output = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_audio=prompt_wav,
        duration=duration,
        steps=nfe,
        cfg_strength=guidance_strength,
        guidance_method=guidance_method,
    )

    wav = output.waveform.squeeze().detach().cpu().numpy()
    return (sr, wav)
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # UI
180
+ # ---------------------------------------------------------------------------
181
+
182
# Gradio UI: two tabs (plain TTS and zero-shot voice cloning) sharing one set
# of advanced generation settings.
with gr.Blocks() as demo:
    gr.Markdown("# LongCat-AudioDiT")
    gr.Markdown(
        "Diffusion-based text-to-speech with zero-shot voice cloning. "
        "Based on [meituan-longcat/LongCat-AudioDiT](https://github.com/meituan-longcat/LongCat-AudioDiT)."
    )

    with gr.Tabs():
        with gr.Tab("TTS"):
            with gr.Row():
                with gr.Column():
                    tts_text = gr.Textbox(
                        label="Text",
                        lines=5,
                        placeholder="Enter text to synthesize...",
                    )
                    tts_btn = gr.Button("Generate")
                with gr.Column():
                    tts_output = gr.Audio(label="Output")
            gr.Examples(
                examples=[
                    [
                        "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells."
                    ],
                    ["今天晴暖转阴雨,空气质量优至良,空气相对湿度较低。"],  # noqa: RUF001 — Chinese punctuation
                ],
                inputs=tts_text,
            )

        with gr.Tab("Voice Cloning"):
            with gr.Row():
                with gr.Column():
                    # type="numpy" makes gr.Audio deliver (sample_rate, ndarray)
                    # to generate_voice_clone.
                    vc_prompt_audio = gr.Audio(label="Prompt Audio", type="numpy")
                    vc_prompt_text = gr.Textbox(
                        label="Prompt Text",
                        lines=2,
                        placeholder="Transcription of the prompt audio...",
                    )
                    vc_text = gr.Textbox(
                        label="Text to Synthesize",
                        lines=3,
                        placeholder="Enter text to synthesize in the cloned voice...",
                    )
                    vc_btn = gr.Button("Generate")
                with gr.Column():
                    vc_output = gr.Audio(label="Output")

    # Generation knobs shared by both tabs.
    with gr.Accordion("Advanced Settings", open=False):
        guidance_method = gr.Radio(
            label="Guidance",
            choices=["cfg", "apg"],
            value="cfg",
        )
        nfe = gr.Slider(label="NFE Steps", minimum=1, maximum=64, step=1, value=16)
        guidance_strength = gr.Slider(
            label="Guidance Strength",
            minimum=0.0,
            maximum=10.0,
            step=0.1,
            value=4.0,
        )
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=MAX_SEED,
            step=1,
            value=1024,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

    # Each button first resolves the effective seed (queue=False so it runs
    # immediately and writes the chosen value back into the seed slider),
    # then runs generation with that seed.
    tts_btn.click(
        fn=get_seed,
        inputs=[randomize_seed, seed],
        outputs=seed,
        queue=False,
    ).then(
        fn=generate_tts,
        inputs=[tts_text, guidance_method, nfe, guidance_strength, seed],
        outputs=tts_output,
    )
    vc_btn.click(
        fn=get_seed,
        inputs=[randomize_seed, seed],
        outputs=seed,
        queue=False,
    ).then(
        fn=generate_voice_clone,
        inputs=[
            vc_text,
            vc_prompt_text,
            vc_prompt_audio,
            guidance_method,
            nfe,
            guidance_strength,
            seed,
        ],
        outputs=vc_output,
    )

if __name__ == "__main__":
    demo.launch()
pyproject.toml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "longcat-audiodit-3-5b"
3
+ version = "0.1.0"
4
+ description = "Gradio demo for LongCat-AudioDiT TTS"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "einops>=0.8.2",
9
+ "gradio>=6.10.0",
10
+ "librosa>=0.11.0",
11
+ "numpy>=2.4.4",
12
+ "safetensors>=0.7.0",
13
+ "soundfile>=0.13.1",
14
+ "spaces>=0.48.1",
15
+ "torch==2.9.1",
16
+ "torchaudio>=2.11.0",
17
+ "transformers>=5.4.0",
18
+ ]
19
+
20
+ [tool.ruff]
21
+ line-length = 119
22
+ extend-exclude = ["vendor"]
23
+
24
+ [tool.ruff.lint]
25
+ select = ["ALL"]
26
+ ignore = [
27
+ "COM812", # missing-trailing-comma
28
+ "D203", # one-blank-line-before-class
29
+ "D213", # multi-line-summary-second-line
30
+ "E501", # line-too-long
31
+ "SIM117", # multiple-with-statements
32
+ #
33
+ "D100", # undocumented-public-module
34
+ "D101", # undocumented-public-class
35
+ "D102", # undocumented-public-method
36
+ "D103", # undocumented-public-function
37
+ "D104", # undocumented-public-package
38
+ "D105", # undocumented-magic-method
39
+ "D107", # undocumented-public-init
40
+ "EM101", # raw-string-in-exception
41
+ "FBT001", # boolean-type-hint-positional-argument
42
+ "FBT002", # boolean-default-value-positional-argument
43
+ "ISC001", # single-line-implicit-string-concatenation
44
+ "PGH003", # blanket-type-ignore
45
+ "PLR0913", # too-many-arguments
46
+ "PLR0915", # too-many-statements
47
+ "TRY003", # raise-vanilla-args
48
+ ]
49
+ unfixable = [
50
+ "F401", # unused-import
51
+ ]
52
+
53
+ [tool.ruff.lint.pydocstyle]
54
+ convention = "google"
55
+
56
+ [tool.ruff.format]
57
+ docstring-code-format = true
58
+
59
+ [dependency-groups]
60
+ dev = [
61
+ "ruff>=0.15.8",
62
+ ]
63
+ hf-spaces = ["datasets"]
requirements.txt ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv export --no-hashes --no-dev --group hf-spaces --no-emit-package typer-slim --no-emit-package spaces -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ aiohappyeyeballs==2.6.1
6
+ # via aiohttp
7
+ aiohttp==3.13.4
8
+ # via fsspec
9
+ aiosignal==1.4.0
10
+ # via aiohttp
11
+ annotated-doc==0.0.4
12
+ # via
13
+ # fastapi
14
+ # typer
15
+ annotated-types==0.7.0
16
+ # via pydantic
17
+ anyio==4.13.0
18
+ # via
19
+ # gradio
20
+ # httpx
21
+ # starlette
22
+ attrs==26.1.0
23
+ # via aiohttp
24
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
25
+ # via
26
+ # gradio
27
+ # standard-aifc
28
+ # standard-sunau
29
+ audioread==3.1.0
30
+ # via librosa
31
+ brotli==1.2.0
32
+ # via gradio
33
+ certifi==2026.2.25
34
+ # via
35
+ # httpcore
36
+ # httpx
37
+ # requests
38
+ cffi==2.0.0
39
+ # via soundfile
40
+ charset-normalizer==3.4.6
41
+ # via requests
42
+ click==8.3.1
43
+ # via
44
+ # typer
45
+ # uvicorn
46
+ colorama==0.4.6 ; sys_platform == 'win32'
47
+ # via
48
+ # click
49
+ # tqdm
50
+ datasets==4.8.4
51
+ decorator==5.2.1
52
+ # via librosa
53
+ dill==0.4.1
54
+ # via
55
+ # datasets
56
+ # multiprocess
57
+ einops==0.8.2
58
+ # via longcat-audiodit-3-5b
59
+ fastapi==0.135.2
60
+ # via gradio
61
+ ffmpy==1.0.0
62
+ # via gradio
63
+ filelock==3.25.2
64
+ # via
65
+ # datasets
66
+ # huggingface-hub
67
+ # torch
68
+ frozenlist==1.8.0
69
+ # via
70
+ # aiohttp
71
+ # aiosignal
72
+ fsspec==2026.2.0
73
+ # via
74
+ # datasets
75
+ # gradio-client
76
+ # huggingface-hub
77
+ # torch
78
+ gradio==6.10.0
79
+ # via
80
+ # longcat-audiodit-3-5b
81
+ # spaces
82
+ gradio-client==2.4.0
83
+ # via
84
+ # gradio
85
+ # hf-gradio
86
+ groovy==0.1.2
87
+ # via gradio
88
+ h11==0.16.0
89
+ # via
90
+ # httpcore
91
+ # uvicorn
92
+ hf-gradio==0.3.0
93
+ # via gradio
94
+ hf-xet==1.4.2 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
95
+ # via huggingface-hub
96
+ httpcore==1.0.9
97
+ # via httpx
98
+ httpx==0.28.1
99
+ # via
100
+ # datasets
101
+ # gradio
102
+ # gradio-client
103
+ # huggingface-hub
104
+ # safehttpx
105
+ # spaces
106
+ huggingface-hub==1.8.0
107
+ # via
108
+ # datasets
109
+ # gradio
110
+ # gradio-client
111
+ # tokenizers
112
+ # transformers
113
+ idna==3.11
114
+ # via
115
+ # anyio
116
+ # httpx
117
+ # requests
118
+ # yarl
119
+ jinja2==3.1.6
120
+ # via
121
+ # gradio
122
+ # torch
123
+ joblib==1.5.3
124
+ # via
125
+ # librosa
126
+ # scikit-learn
127
+ lazy-loader==0.5
128
+ # via librosa
129
+ librosa==0.11.0
130
+ # via longcat-audiodit-3-5b
131
+ llvmlite==0.46.0
132
+ # via numba
133
+ markdown-it-py==4.0.0
134
+ # via rich
135
+ markupsafe==3.0.3
136
+ # via
137
+ # gradio
138
+ # jinja2
139
+ mdurl==0.1.2
140
+ # via markdown-it-py
141
+ mpmath==1.3.0
142
+ # via sympy
143
+ msgpack==1.1.2
144
+ # via librosa
145
+ multidict==6.7.1
146
+ # via
147
+ # aiohttp
148
+ # yarl
149
+ multiprocess==0.70.19
150
+ # via datasets
151
+ networkx==3.6.1
152
+ # via torch
153
+ numba==0.64.0
154
+ # via librosa
155
+ numpy==2.4.4
156
+ # via
157
+ # datasets
158
+ # gradio
159
+ # librosa
160
+ # longcat-audiodit-3-5b
161
+ # numba
162
+ # pandas
163
+ # scikit-learn
164
+ # scipy
165
+ # soundfile
166
+ # soxr
167
+ # transformers
168
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
169
+ # via
170
+ # nvidia-cudnn-cu12
171
+ # nvidia-cusolver-cu12
172
+ # torch
173
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
174
+ # via torch
175
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
176
+ # via torch
177
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
178
+ # via torch
179
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
180
+ # via torch
181
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
182
+ # via torch
183
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
184
+ # via torch
185
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
186
+ # via torch
187
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
188
+ # via torch
189
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
190
+ # via
191
+ # nvidia-cusolver-cu12
192
+ # torch
193
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
194
+ # via torch
195
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
196
+ # via torch
197
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
198
+ # via
199
+ # nvidia-cufft-cu12
200
+ # nvidia-cusolver-cu12
201
+ # nvidia-cusparse-cu12
202
+ # torch
203
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
204
+ # via torch
205
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
206
+ # via torch
207
+ orjson==3.11.7
208
+ # via gradio
209
+ packaging==26.0
210
+ # via
211
+ # datasets
212
+ # gradio
213
+ # gradio-client
214
+ # huggingface-hub
215
+ # lazy-loader
216
+ # pooch
217
+ # spaces
218
+ # transformers
219
+ pandas==3.0.2
220
+ # via
221
+ # datasets
222
+ # gradio
223
+ pillow==12.1.1
224
+ # via gradio
225
+ platformdirs==4.9.4
226
+ # via pooch
227
+ pooch==1.9.0
228
+ # via librosa
229
+ propcache==0.4.1
230
+ # via
231
+ # aiohttp
232
+ # yarl
233
+ psutil==5.9.8
234
+ # via spaces
235
+ pyarrow==23.0.1
236
+ # via datasets
237
+ pycparser==3.0 ; implementation_name != 'PyPy'
238
+ # via cffi
239
+ pydantic==2.12.5
240
+ # via
241
+ # fastapi
242
+ # gradio
243
+ # spaces
244
+ pydantic-core==2.41.5
245
+ # via pydantic
246
+ pydub==0.25.1
247
+ # via gradio
248
+ pygments==2.20.0
249
+ # via rich
250
+ python-dateutil==2.9.0.post0
251
+ # via pandas
252
+ python-multipart==0.0.22
253
+ # via gradio
254
+ pytz==2026.1.post1
255
+ # via gradio
256
+ pyyaml==6.0.3
257
+ # via
258
+ # datasets
259
+ # gradio
260
+ # huggingface-hub
261
+ # transformers
262
+ regex==2026.3.32
263
+ # via transformers
264
+ requests==2.33.1
265
+ # via
266
+ # datasets
267
+ # pooch
268
+ # spaces
269
+ rich==14.3.3
270
+ # via typer
271
+ safehttpx==0.1.7
272
+ # via gradio
273
+ safetensors==0.7.0
274
+ # via
275
+ # longcat-audiodit-3-5b
276
+ # transformers
277
+ scikit-learn==1.8.0
278
+ # via librosa
279
+ scipy==1.17.1
280
+ # via
281
+ # librosa
282
+ # scikit-learn
283
+ semantic-version==2.10.0
284
+ # via gradio
285
+ setuptools==82.0.1
286
+ # via torch
287
+ shellingham==1.5.4
288
+ # via typer
289
+ six==1.17.0
290
+ # via python-dateutil
291
+ soundfile==0.13.1
292
+ # via
293
+ # librosa
294
+ # longcat-audiodit-3-5b
295
+ soxr==1.0.0
296
+ # via librosa
297
+ standard-aifc==3.13.0 ; python_full_version >= '3.13'
298
+ # via
299
+ # audioread
300
+ # librosa
301
+ standard-chunk==3.13.0 ; python_full_version >= '3.13'
302
+ # via standard-aifc
303
+ standard-sunau==3.13.0 ; python_full_version >= '3.13'
304
+ # via
305
+ # audioread
306
+ # librosa
307
+ starlette==0.52.1
308
+ # via
309
+ # fastapi
310
+ # gradio
311
+ sympy==1.14.0
312
+ # via torch
313
+ threadpoolctl==3.6.0
314
+ # via scikit-learn
315
+ tokenizers==0.22.2
316
+ # via transformers
317
+ tomlkit==0.13.3
318
+ # via gradio
319
+ torch==2.9.1
320
+ # via longcat-audiodit-3-5b
321
+ torchaudio==2.11.0
322
+ # via longcat-audiodit-3-5b
323
+ tqdm==4.67.3
324
+ # via
325
+ # datasets
326
+ # huggingface-hub
327
+ # transformers
328
+ transformers==5.4.0
329
+ # via longcat-audiodit-3-5b
330
+ triton==3.5.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
331
+ # via torch
332
+ typer==0.24.1
333
+ # via
334
+ # gradio
335
+ # hf-gradio
336
+ # huggingface-hub
337
+ # transformers
338
+ typing-extensions==4.15.0
339
+ # via
340
+ # aiosignal
341
+ # anyio
342
+ # fastapi
343
+ # gradio
344
+ # gradio-client
345
+ # huggingface-hub
346
+ # librosa
347
+ # pydantic
348
+ # pydantic-core
349
+ # spaces
350
+ # starlette
351
+ # torch
352
+ # typing-inspection
353
+ typing-inspection==0.4.2
354
+ # via
355
+ # fastapi
356
+ # pydantic
357
+ tzdata==2025.3 ; sys_platform == 'emscripten' or sys_platform == 'win32'
358
+ # via pandas
359
+ urllib3==2.6.3
360
+ # via requests
361
+ uvicorn==0.42.0
362
+ # via gradio
363
+ xxhash==3.6.0
364
+ # via datasets
365
+ yarl==1.23.0
366
+ # via aiohttp
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
vendor/LongCat-AudioDiT ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit eec76e3b0fe5fd9ed6a1f0b990f97bc33cda21ae