GovIndLok committed on
Commit
5b38e09
·
1 Parent(s): fcfec4d

feat: add MiniCPM5-1B model integration and migrate TTS to VoxCPM2 with TorchDynamo compilation fixes

Browse files
Files changed (4) hide show
  1. model.py +70 -0
  2. pyproject.toml +6 -2
  3. tts_model.py +93 -202
  4. uv.lock +0 -0
model.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # VoxCPM2 torch.compiles a submodule that crashes TorchDynamo on this stack
3
+ # ("Cannot construct ConstantVariable for torch.device"); disable compilation so
4
+ # it runs eager. Must be set before torch is imported (via spaces / voxcpm).
5
+ os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
6
+ os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
7
+ import threading
8
+ import spaces
9
+ import torch
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
11
+ import threading
12
+ import spaces
13
+
14
# Hugging Face repo id of the chat model loaded at import time.
MODEL_ID = "openbmb/MiniCPM5-1B"

# Extra keyword arguments forwarded to every ``model.generate`` call made by
# ``generate`` and ``generate_stream``.
# NOTE(review): both helpers unpack ``**GEN`` but the name was never defined,
# so every call raised NameError. An empty dict leaves the model's own
# generation defaults in effect; put sampling options (temperature, top_p, ...)
# here once the intended values are confirmed.
GEN: dict = {}

print(f"[llm] Loading tokenizer for {MODEL_ID} ...", flush=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

print("[llm] Tokenizer loaded in GPU ...", flush=True)

# Load the weights in bfloat16 and move them to the CUDA device, then switch
# the module to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
).to("cuda")
model.eval()
print("[llm] model is ready", flush=True)
29
+
30
def model_input(messages):
    """Tokenize chat messages into model inputs on the model's device.

    Args:
        messages: Chat history in the form accepted by the tokenizer's
            ``apply_chat_template``.

    Returns:
        The encoding returned by ``apply_chat_template`` (dict-style, PyTorch
        tensors), moved to ``model.device``.
    """
    # The keyword is lowercase ``tokenize``; the previous ``Tokenize=True`` was
    # not the tokenization switch at all.
    kw = dict(
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    try:
        enc = tokenizer.apply_chat_template(messages, enable_thinking=False, **kw)
    except TypeError:
        # Retry without ``enable_thinking`` if the call rejects that keyword.
        enc = tokenizer.apply_chat_template(messages, **kw)
    return enc.to(model.device)
40
+
41
@spaces.GPU(duration=120)
def generate(messages, max_new_tokens: int = 100) -> str:
    """Run one full (blocking) chat completion and return the reply text.

    Args:
        messages: Chat history passed to ``model_input``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion with the prompt tokens and special tokens
        removed, stripped of surrounding whitespace.
    """
    inputs = model_input(messages)
    # Prompt length, used below to slice the prompt off the generated sequence.
    in_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        # ``**inputs`` (the encoding), not the builtin ``input``.
        # NOTE(review): ``GEN`` is expected to be a module-level dict of extra
        # generation kwargs - confirm it is defined in this module.
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **GEN,
        )

    return tokenizer.decode(out[0][in_len:], skip_special_tokens=True).strip()
50
+
51
+
52
+ # Test live generation
53
@spaces.GPU(duration=100)
def generate_stream(messages, max_new_tokens: int = 120):
    """Yield pieces of the completion as the model produces them.

    Generation runs in a background daemon thread that feeds a
    ``TextIteratorStreamer``; this generator relays each decoded piece.

    Args:
        messages: Chat history passed to ``model_input``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        Decoded text pieces (prompt and special tokens skipped).
    """
    inputs = model_input(messages)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # NOTE(review): ``GEN`` is expected to be a module-level dict of extra
    # generation kwargs - confirm it is defined in this module.
    kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        **GEN,
    )

    def _run():
        try:
            with torch.no_grad():
                model.generate(**kwargs)
        except Exception:
            # Close the stream so the consumer loop below terminates instead
            # of waiting forever on a worker that has already died.
            streamer.end()
            raise

    threading.Thread(target=_run, daemon=True).start()

    for piece in streamer:
        yield piece
70
+
pyproject.toml CHANGED
@@ -6,12 +6,16 @@ readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "gradio",
9
- "ollama",
10
  "numpy",
11
  "scipy",
12
  "torch",
13
- "kokoro",
 
 
 
 
14
  "soundfile",
 
15
  ]
16
 
17
  [tool.setuptools.packages.find]
 
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "gradio",
 
9
  "numpy",
10
  "scipy",
11
  "torch",
12
+ "torchaudio",
13
+ "tiktoken",
14
+ "sentencepiece",
15
+ "voxcpm>=2.0",
16
+ "transformers",
17
  "soundfile",
18
+ "spaces",
19
  ]
20
 
21
  [tool.setuptools.packages.find]
tts_model.py CHANGED
@@ -1,207 +1,98 @@
1
  from re import split
2
  import os
3
- import torch
 
 
 
 
 
 
 
 
4
  import numpy as np
5
- import time
6
- from typing import Tuple, List
7
-
8
- from kokoro import KModel, KPipeline
9
-
10
- try:
11
- import spaces
12
- except ImportError:
13
- # Mock spaces decorator for local development
14
- class mock_spaces:
15
- @staticmethod
16
- def GPU(duration=None):
17
- def decorator(func):
18
- return func
19
- return decorator
20
- spaces = mock_spaces
21
-
22
- _MODEL: KModel | None = None
23
- _ON_GPU = False
24
-
25
-
26
- @spaces.GPU(duration=None)
27
- def _forword_gpu(ps, ref_s, speed: float):
28
- global _ON_GPU
29
- if not _ON_GPU:
30
- device = "cuda" if torch.cuda.is_available() else "cpu"
31
- _MODEL.to(device)
32
- _ON_GPU = True
33
- return _MODEL(ps, ref_s, speed)
34
-
35
-
36
- class TTSModel:
37
- def __init__(self) -> None:
38
- self.pipeline = {}
39
- # Support voice_t1 directory path, falling back to voice_v1
40
- voice_dir = os.path.join(os.path.dirname(__file__), "voice_t1")
41
- if not os.path.exists(voice_dir):
42
- voice_dir = os.path.join(os.path.dirname(__file__), "voice_v1")
43
- self.voice_dir = voice_dir
44
-
45
- def initialize(self) -> bool:
46
- global _MODEL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  try:
48
- print("Initiazing the model .. .")
49
- _MODEL = KModel().eval()
50
- print("Model initialized")
51
- return True
52
- except Exception as e:
53
- print(f"Error Initiazing model: {str(e)}")
54
- return False
55
-
56
- def _pipeline_for(self, lang_code: str) -> KPipeline:
57
- if lang_code not in self.pipeline:
58
- self.pipeline[lang_code] = KPipeline(lang_code=lang_code, model=False)
59
- return self.pipeline[lang_code]
60
-
61
- def list_voice(self) -> List[str]:
62
- voices = []
63
- if os.path.exists(self.voice_dir):
64
- for file in os.listdir(self.voice_dir):
65
- if file.endswith(".pt"):
66
- voices.append(file[:-3])
67
- return sorted(voices)
68
-
69
- def generate_speech(
70
- self,
71
- text: str,
72
- voice_names: list[str],
73
- speed: float = 1.0,
74
- gpu_timeout: int = 60,
75
- progress_callback=None,
76
- progress_state=None,
77
- progress=None,
78
- ):
79
- try:
80
- start_time = time.time()
81
- if not text or not voice_names:
82
- raise ValueError("Text and voice name are required")
83
-
84
- # Resolve voice names to local paths if they exist locally
85
- resolved_voices = []
86
- for v in voice_names:
87
- if os.path.exists(v):
88
- resolved_voices.append(v)
89
- else:
90
- local_path = os.path.join(self.voice_dir, f"{v}.pt")
91
- if os.path.exists(local_path):
92
- resolved_voices.append(local_path)
93
- else:
94
- resolved_voices.append(v)
95
-
96
- # Extract base names to determine language codes
97
- base_voice_names = []
98
- for v in resolved_voices:
99
- if os.path.exists(v):
100
- base_name = os.path.basename(v)
101
- if base_name.endswith(".pt"):
102
- base_name = base_name[:-3]
103
- base_voice_names.append(base_name)
104
- else:
105
- base_voice_names.append(v)
106
-
107
- lang_codes = {v[0] for v in base_voice_names}
108
- if len(lang_codes) > 1:
109
- raise ValueError(
110
- f"Cannot mix voices from different languages: {sorted(lang_codes)}. "
111
- )
112
- lang_code = base_voice_names[0][0]
113
- pipeline = self._pipeline_for(lang_code)
114
-
115
- voice_name = (
116
- ",".join(resolved_voices) if len(resolved_voices) > 1 else resolved_voices[0]
117
- )
118
- pack = pipeline.load_voice(voice_name)
119
-
120
- processed_text = "\n\n".join(
121
- paragraph.replace("\n", " ").replace(" ", " ").strip()
122
- for paragraph in text.split("\n\n")
123
- )
124
-
125
- audio_chucks = []
126
- chunk_times = []
127
- chunk_sizes = []
128
- total_tokens = 0
129
- total_process_time = 0
130
-
131
- for i, (gs, ps, _) in enumerate(
132
- pipeline(
133
- processed_text,
134
- voice=voice_name,
135
- speed=speed,
136
- split_pattern=r"\n\n+",
137
- )
138
- ):
139
- ref_s = pack[len(ps) - 1].detach()
140
- audio = _forword_gpu(ps, ref_s, speed)
141
- audio = (
142
- audio.cpu().numpy() if hasattr(audio, "cpu") else np.asarray(audio)
143
- )
144
-
145
- chunk_process_time = time.time() - start_time - total_process_time
146
- total_process_time += chunk_process_time
147
- audio_chucks.append(audio)
148
-
149
- chunk_tokens = len(gs)
150
- total_tokens += chunk_tokens
151
- chunk_duration = len(audio) / 24000
152
- tokens_per_sec = (
153
- chunk_tokens / chunk_duration if chunk_duration else 0.0
154
- )
155
- rtf = chunk_process_time / chunk_duration if chunk_duration else 0.0
156
-
157
- chunk_times.append(chunk_process_time)
158
- chunk_sizes.append(chunk_tokens)
159
-
160
- print(f"Chunk {i + 1}:")
161
- print(f" Process time: {chunk_process_time:.2f}s")
162
- print(f" Audio duration: {chunk_duration:.2f}s")
163
- print(f" Tokens/sec: {tokens_per_sec:.1f}")
164
- print(f" Real-time factor: {rtf:.3f}")
165
-
166
- if progress_callback and progress_state:
167
- progress_state.setdefault("tokens_per_sec", []).append(
168
- tokens_per_sec
169
- )
170
- progress_state.setdefault("rtf", []).append(rtf)
171
- progress_state.setdefault("chunk_times", []).append(
172
- chunk_process_time
173
- )
174
- progress_callback(
175
- i + 1,
176
- -1,
177
- tokens_per_sec,
178
- rtf,
179
- progress_state,
180
- start_time,
181
- gpu_timeout,
182
- progress,
183
- )
184
- audio = np.concatenate(audio_chucks)
185
-
186
- return (
187
- audio,
188
- len(audio) / 24000,
189
- {
190
- "chunk_times": chunk_times,
191
- "chunk_sizes": chunk_sizes,
192
- "tokens_per_sec": [
193
- float(x) for x in progress_state["tokens_per_sec"]
194
- ]
195
- if progress_state
196
- else [],
197
- "rtf": [float(x) for x in progress_state["rtf"]]
198
- if progress_state
199
- else [],
200
- "total_tokens": total_tokens,
201
- "total_time": time.time() - start_time,
202
- },
203
- )
204
-
205
  except Exception as e:
206
- print(f"Error in gneration of speech: {str(e)}")
207
- raise
 
1
  from re import split
2
  import os
3
+ # VoxCPM2 torch.compiles a submodule that crashes TorchDynamo on this stack
4
+ # ("Cannot construct ConstantVariable for torch.device"); disable compilation so
5
+ # it runs eager. Must be set before torch is imported (via spaces / voxcpm).
6
+ os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
7
+ os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
8
+
9
+ import re
10
+ import tempfile
11
+ import threading
12
  import numpy as np
13
+ import soundfile as sf
14
+ import spaces
15
+
16
# Hugging Face repo id of the TTS model.
VOICE_MODEL_ID = "openbmb/VoxCPM2"

# Voice key -> parenthesised natural-language description of that voice.
VOICE_DESIGN = {
    "sml":
        "(An incredibly animated, sassy sci-fi droid. A bright, mid-to-high tone, "
        "highly irregular cadence with sudden jumps in pitch. Expressive, sharp, "
        "and conversational, mimicking an opinionated pet communicating with attitude "
        "rather than reading a script.)",

    "chop":
        "(A grumpy, belligerent mechanical gremlin. Low, gravelly, and guttural "
        "mid-range tone, delivered with a muffled, throaty resonance. The cadence "
        "is punchy, argumentative, and filled with aggressive, stubborn muttering.)",

    "agressor":
        "(A cheap, mass-produced military unit. Highly nasally, thin, and tinny "
        "high-mid tone. The cadence is uniform, robotic, and stiff, with an "
        "empty-headed, flat delivery that is completely devoid of natural human flow.)",
}

# Lazily loaded model singleton and the lock guarding its first load.
_model = None
_load_lock = threading.Lock()
# Voice key -> path of its baked reference wav. This must be an annotated
# assignment: the previous ``_refs = dir[str, str]= {}`` tried to assign into
# the builtin ``dir`` and raised TypeError at import time.
_refs: dict[str, str] = {}
_ref_lock = threading.Lock()
# Directory holding the baked reference wavs; created once at import time.
_CACHE_DIR = tempfile.mkdtemp(prefix="ttt_voices_")
41
+
42
def get_model():
    """Return the shared VoxCPM model, loading it on the first call.

    The load happens at most once per process: callers that find the model
    already set return immediately, the rest queue on ``_load_lock``.
    """
    global _model
    if _model is not None:
        return _model

    with _load_lock:
        # Check again now that we hold the lock - another thread may have
        # completed the load while this one was waiting.
        if _model is None:
            # Imported lazily so merely importing this module does not pull
            # in voxcpm.
            from voxcpm import VoxCPM

            print(f"[voice] loading {VOICE_MODEL_ID}...", flush=True)
            _model = VoxCPM.from_pretrained(VOICE_MODEL_ID, load_denoiser=False)
            print("[voice] model ready")
    return _model
52
+
53
def _ref_path(voice_key: str) -> str:
    """Return the cache path of the reference wav for ``voice_key``.

    Each run of non-word characters in the key is collapsed to a single
    underscore to form the file name.
    """
    safe_name = re.sub(r"\W+", "_", voice_key)
    return os.path.join(_CACHE_DIR, safe_name + ".wav")
55
+
56
# Fallbacks used by ``ensure_ref`` for voice keys that have no dedicated entry.
# NOTE(review): ``ensure_ref`` referenced these three names but they were never
# defined (NameError on first use). The values below are neutral placeholders -
# replace them with the intended per-character settings.
DEFAULT_DESIGN = ""
_CALIBRATION: dict[str, str] = {}
_DEFAULT_CALIBRATION = "Hello there. This is how my voice sounds when I am talking to you."


def ensure_ref(voice_key: str) -> str:
    """Bake (once) and return the path of this character's reference voice wav.

    The wav is cached at a deterministic path under the module-level temp
    directory so that later synth calls - including, per the original design
    note, other ZeroGPU worker forks - reuse the file instead of re-designing
    the voice.

    Args:
        voice_key: Key into ``VOICE_DESIGN``; unknown keys use the defaults.

    Returns:
        Filesystem path of the reference wav.
    """
    path = _ref_path(voice_key)
    if os.path.exists(path):
        return path
    with _ref_lock:
        # Re-check under the lock: another thread may have baked it meanwhile.
        if os.path.exists(path):
            return path
        # The loader in this module is ``get_model`` (there is no ``_get_model``).
        m = get_model()
        design = VOICE_DESIGN.get(voice_key, DEFAULT_DESIGN)
        cal = _CALIBRATION.get(voice_key, _DEFAULT_CALIBRATION)
        print(f"[voice] designing voice for {voice_key!r} ...", flush=True)
        wav = m.generate(text=f"{design}{cal}", normalize=True)
        # Write to a private temp name and rename into place so a concurrent
        # ``os.path.exists(path)`` check never observes a half-written file.
        tmp_path = f"{path}.{os.getpid()}.tmp.wav"
        try:
            sf.write(tmp_path, wav, m.tts_model.sample_rate)
            os.replace(tmp_path, path)
        finally:
            # Only present if the write or rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        _refs[voice_key] = path
        return path
77
+
78
@spaces.GPU(duration=50)
def synthesize(text: str, voice_key: str):
    """Synthesize ``text`` in the voice identified by ``voice_key``.

    Returns:
        ``None`` when ``text`` is empty or ``None``; otherwise a
        ``(sample_rate, samples)`` tuple where ``samples`` is a squeezed
        float32 NumPy array.
    """
    # Nothing to say: skip the model entirely.
    if not text:
        return None

    tts = get_model()
    reference_wav = ensure_ref(voice_key)
    raw_audio = tts.generate(text=text, reference_wav_path=reference_wav, normalize=True)
    samples = np.asarray(raw_audio, dtype=np.float32).squeeze()
    sample_rate = int(tts.tts_model.sample_rate)
    return (sample_rate, samples)
88
+
89
@spaces.GPU(duration=150)
def warmup(voice_key=None):
    """Load the model and pre-bake reference voices.

    Args:
        voice_key: A single voice key, an iterable of voice keys, or a falsy
            value (``None``, ``""``, empty list) to warm up every key in
            ``VOICE_DESIGN``.

    A failure for one voice is logged and does not stop the remaining ones.
    """
    if not voice_key:
        keys = list(VOICE_DESIGN)
    elif isinstance(voice_key, str):
        # A lone key must not be iterated character by character.
        keys = [voice_key]
    else:
        keys = list(voice_key)

    get_model()
    for k in keys:
        try:
            ensure_ref(k)
        except Exception as e:
            # Deliberately best-effort: report and continue with the next voice.
            print(f"[voice] warmup failed for {k!r}: {e}", flush=True)
    print("[voice] warmup complete.", flush=True)
uv.lock CHANGED
The diff for this file is too large to render. See raw diff