Spaces:

build-small-hackathon
/

Voinal

Running on Zero

App Files Files Community

GovIndLok committed 20 days ago

Commit

5b38e09

1 Parent(s): fcfec4d

feat: add MiniCPM5-1B model integration and migrate TTS to VoxCPM2 with TorchDynamo compilation fixes

Browse files

Files changed (4) hide show

model.py +70 -0
pyproject.toml +6 -2
tts_model.py +93 -202
uv.lock +0 -0

model.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import os
+# VoxCPM2 torch.compiles a submodule that crashes TorchDynamo on this stack
+# ("Cannot construct ConstantVariable for torch.device"); disable compilation so
+# it runs eager. Must be set before torch is imported (via spaces / voxcpm).
+os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
+import threading
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import threading
+import spaces
+MODEL_ID = "openbmb/MiniCPM5-1B"
+print(f"[llm] Loading tokenizer for {MODEL_ID} ...", flush=True)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+print(f"[llm] Tokenizer loaded in  GPU ...", flush=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+).to("cuda")
+model.eval()
+print("[llm] model is ready", flush=True)
+def model_input(messages):
+    "Tokenize chat messages into model inputs."
+    kw = dict(Tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+    try:
+        enc = tokenizer.apply_chat_template(messages, enable_thinking=False, **kw)
+    except TypeError:
+        enc = tokenizer.apply_chat_template(messages, **kw)
+    return enc.to(model.device)
+@spaces.GPU(duration=120)
+def generate(messages, max_new_tokens: int = 100) -> str:
+    "One full chat completion (use by blocking path)"
+    inputs = model_input(messages)
+    in_len = inputs["input_ids"].shape[-1]
+    with torch.no_grad():
+        out = model.generate(**input, max_new_tokens=max_new_tokens,pad_token_id=tokenizer.eos_token_id, **GEN)
+    return tokenizer.decode(out[0][in_len:], skip_special_tokens=True).strip()
+# Test live generation
+@spaces.GPU(duration=100)
+def generate_stream(messages, max_new_tokens: int = 120):
+    "Generate lines as miniCPM write it"
+    inputs model_input(messages)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens,pad_token_id=tokenizer.eos_token_id, **GEN)
+    def _run():
+        with torch.no_grad():
+            model.generate(**kwargs)
+    threading.Thread(target=_run, daemon=True).start()
+    acc = ""
+    for piece with streamer:
+        acc += piece
+        yield piece

pyproject.toml CHANGED Viewed

@@ -6,12 +6,16 @@ readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "gradio",
-    "ollama",
     "numpy",
     "scipy",
     "torch",
-    "kokoro",
     "soundfile",
 ]
 [tool.setuptools.packages.find]

 requires-python = ">=3.12"
 dependencies = [
     "gradio",
     "numpy",
     "scipy",
     "torch",
+    "torchaudio",
+    "tiktoken",
+    "sentencepiece",
+    "voxcpm>=2.0",
+    "transformers",
     "soundfile",
+    "spaces",
 ]
 [tool.setuptools.packages.find]

tts_model.py CHANGED Viewed

@@ -1,207 +1,98 @@
 from re import split
 import os
-import torch
 import numpy as np
-import time
-from typing import Tuple, List
-from kokoro import KModel, KPipeline
-try:
-    import spaces
-except ImportError:
-    # Mock spaces decorator for local development
-    class mock_spaces:
-        @staticmethod
-        def GPU(duration=None):
-            def decorator(func):
-                return func
-            return decorator
-    spaces = mock_spaces
-_MODEL: KModel | None = None
-_ON_GPU = False
-@spaces.GPU(duration=None)
-def _forword_gpu(ps, ref_s, speed: float):
-    global _ON_GPU
-    if not _ON_GPU:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        _MODEL.to(device)
-        _ON_GPU = True
-    return _MODEL(ps, ref_s, speed)
-class TTSModel:
-    def __init__(self) -> None:
-        self.pipeline = {}
-        # Support voice_t1 directory path, falling back to voice_v1
-        voice_dir = os.path.join(os.path.dirname(__file__), "voice_t1")
-        if not os.path.exists(voice_dir):
-            voice_dir = os.path.join(os.path.dirname(__file__), "voice_v1")
-        self.voice_dir = voice_dir
-    def initialize(self) -> bool:
-        global _MODEL
         try:
-            print("Initiazing the model .. .")
-            _MODEL = KModel().eval()
-            print("Model initialized")
-            return True
-        except Exception as e:
-            print(f"Error Initiazing model: {str(e)}")
-            return False
-    def _pipeline_for(self, lang_code: str) -> KPipeline:
-        if lang_code not in self.pipeline:
-            self.pipeline[lang_code] = KPipeline(lang_code=lang_code, model=False)
-        return self.pipeline[lang_code]
-    def list_voice(self) -> List[str]:
-        voices = []
-        if os.path.exists(self.voice_dir):
-            for file in os.listdir(self.voice_dir):
-                if file.endswith(".pt"):
-                    voices.append(file[:-3])
-        return sorted(voices)
-    def generate_speech(
-        self,
-        text: str,
-        voice_names: list[str],
-        speed: float = 1.0,
-        gpu_timeout: int = 60,
-        progress_callback=None,
-        progress_state=None,
-        progress=None,
-    ):
-        try:
-            start_time = time.time()
-            if not text or not voice_names:
-                raise ValueError("Text and voice name are required")
-            # Resolve voice names to local paths if they exist locally
-            resolved_voices = []
-            for v in voice_names:
-                if os.path.exists(v):
-                    resolved_voices.append(v)
-                else:
-                    local_path = os.path.join(self.voice_dir, f"{v}.pt")
-                    if os.path.exists(local_path):
-                        resolved_voices.append(local_path)
-                    else:
-                        resolved_voices.append(v)
-            # Extract base names to determine language codes
-            base_voice_names = []
-            for v in resolved_voices:
-                if os.path.exists(v):
-                    base_name = os.path.basename(v)
-                    if base_name.endswith(".pt"):
-                        base_name = base_name[:-3]
-                    base_voice_names.append(base_name)
-                else:
-                    base_voice_names.append(v)
-            lang_codes = {v[0] for v in base_voice_names}
-            if len(lang_codes) > 1:
-                raise ValueError(
-                    f"Cannot mix voices from different languages: {sorted(lang_codes)}. "
-                )
-            lang_code = base_voice_names[0][0]
-            pipeline = self._pipeline_for(lang_code)
-            voice_name = (
-                ",".join(resolved_voices) if len(resolved_voices) > 1 else resolved_voices[0]
-            )
-            pack = pipeline.load_voice(voice_name)
-            processed_text = "\n\n".join(
-                paragraph.replace("\n", " ").replace("  ", " ").strip()
-                for paragraph in text.split("\n\n")
-            )
-            audio_chucks = []
-            chunk_times = []
-            chunk_sizes = []
-            total_tokens = 0
-            total_process_time = 0
-            for i, (gs, ps, _) in enumerate(
-                pipeline(
-                    processed_text,
-                    voice=voice_name,
-                    speed=speed,
-                    split_pattern=r"\n\n+",
-                )
-            ):
-                ref_s = pack[len(ps) - 1].detach()
-                audio = _forword_gpu(ps, ref_s, speed)
-                audio = (
-                    audio.cpu().numpy() if hasattr(audio, "cpu") else np.asarray(audio)
-                )
-                chunk_process_time = time.time() - start_time - total_process_time
-                total_process_time += chunk_process_time
-                audio_chucks.append(audio)
-                chunk_tokens = len(gs)
-                total_tokens += chunk_tokens
-                chunk_duration = len(audio) / 24000
-                tokens_per_sec = (
-                    chunk_tokens / chunk_duration if chunk_duration else 0.0
-                )
-                rtf = chunk_process_time / chunk_duration if chunk_duration else 0.0
-                chunk_times.append(chunk_process_time)
-                chunk_sizes.append(chunk_tokens)
-                print(f"Chunk {i + 1}:")
-                print(f"  Process time: {chunk_process_time:.2f}s")
-                print(f"  Audio duration: {chunk_duration:.2f}s")
-                print(f"  Tokens/sec: {tokens_per_sec:.1f}")
-                print(f"  Real-time factor: {rtf:.3f}")
-                if progress_callback and progress_state:
-                    progress_state.setdefault("tokens_per_sec", []).append(
-                        tokens_per_sec
-                    )
-                    progress_state.setdefault("rtf", []).append(rtf)
-                    progress_state.setdefault("chunk_times", []).append(
-                        chunk_process_time
-                    )
-                    progress_callback(
-                        i + 1,
-                        -1,
-                        tokens_per_sec,
-                        rtf,
-                        progress_state,
-                        start_time,
-                        gpu_timeout,
-                        progress,
-                    )
-            audio = np.concatenate(audio_chucks)
-            return (
-                audio,
-                len(audio) / 24000,
-                {
-                    "chunk_times": chunk_times,
-                    "chunk_sizes": chunk_sizes,
-                    "tokens_per_sec": [
-                        float(x) for x in progress_state["tokens_per_sec"]
-                    ]
-                    if progress_state
-                    else [],
-                    "rtf": [float(x) for x in progress_state["rtf"]]
-                    if progress_state
-                    else [],
-                    "total_tokens": total_tokens,
-                    "total_time": time.time() - start_time,
-                },
-            )
         except Exception as e:
-            print(f"Error in gneration of speech: {str(e)}")
-            raise

 from re import split
 import os
+# VoxCPM2 torch.compiles a submodule that crashes TorchDynamo on this stack
+# ("Cannot construct ConstantVariable for torch.device"); disable compilation so
+# it runs eager. Must be set before torch is imported (via spaces / voxcpm).
+os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
+import re
+import tempfile
+import threading
 import numpy as np
+import soundfile as sf
+import spaces
+VOICE_MODEL_ID = "openbmb/VoxCPM2"
+VOICE_DESIGN = {
+    "sml":
+        "(An incredibly animated, sassy sci-fi droid. A bright, mid-to-high tone, "
+        "highly irregular cadence with sudden jumps in pitch. Expressive, sharp, "
+        "and conversational, mimicking an opinionated pet communicating with attitude "
+        "rather than reading a script.)",
+    "chop":
+        "(A grumpy, belligerent mechanical gremlin. Low, gravelly, and guttural "
+        "mid-range tone, delivered with a muffled, throaty resonance. The cadence "
+        "is punchy, argumentative, and filled with aggressive, stubborn muttering.)",
+    "agressor":
+        "(A cheap, mass-produced military unit. Highly nasally, thin, and tinny "
+        "high-mid tone. The cadence is uniform, robotic, and stiff, with an "
+        "empty-headed, flat delivery that is completely devoid of natural human flow.)",
+}
+_model = None
+_load_lock = threading.Lock()
+_refs = dir[str, str]= {}
+_ref_lock = threading.Lock()
+_CACHE_DIR = tempfile.mkdtemp(prefix="ttt_voices_")
+def get_model():
+    global _model
+    if _model is None:
+        with _load_lock:
+            if _model is None:
+                from voxcpm import VoxCPM
+                print(f"[voice] loading {VOICE_MODEL_ID}...", flush=True)
+                _model = VoxCPM.from_pretrained(VOICE_MODEL_ID, load_denoiser=False)
+                print(f"[voice] model ready")
+    return _model
+def _ref_path(voice_key: str) -> str:
+    return os.path.join(_CACHE_DIR, re.sub(r"\W+", "_", voice_key) + ".wav")
+def ensure_ref(voice_key: str) -> str:
+    """Bake (once) and return this character's reference voice wav.
+    Cached to a DETERMINISTIC path under a module-level temp dir (created in the main
+    process), so every ZeroGPU worker fork sees the same file on disk — whichever
+    worker bakes it first, all later synth calls reuse it instead of re-designing the
+    voice. This matters for Option C, which makes several synth calls per beat.
+    """
+    path = _ref_path(voice_key)
+    if os.path.exists(path):
+        return path
+    with _ref_lock:
+        if os.path.exists(path):
+            return path
+        m = _get_model()
+        design = VOICE_DESIGN.get(voice_key, DEFAULT_DESIGN)
+        cal = _CALIBRATION.get(voice_key, _DEFAULT_CALIBRATION)
+        print(f"[voice] designing voice for {voice_key!r} ...", flush=True)
+        wav = m.generate(text=f"{design}{cal}", normalize=True)
+        sf.write(path, wav, m.tts_model.sample_rate)
+        _refs[voice_key] = path
+        return path
+@spaces.GPU(duration=50)
+def synthesize(text: str, voice_key: str):
+    speech = text or ""
+    if not speech:
+        return None
+    m = get_model()
+    ref = ensure_ref(voice_key)
+    wav = m.generate(text=speech, reference_wav_path=ref, normalize=True)
+    wav = np.asarray(wav, dtype=np.float32).squeeze()
+    return (int(m.tts_model.sample_rate), wav)
+@space.GPU(duration=150)
+def warmup(voice_key=None):
+    keys = list(voice_keys) if voice_keys else list(VOICE_DESIGN.keys())
+    get_model()
+    for k in keys:
         try:
+            ensure_ref(k)
         except Exception as e:
+            print(f"[voice] warmup failed for {k!r}: {e}", flush=True)
+    print("[voice] warmup complete.", flush=True)

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff