Spaces:

maya-research
/

maya1

Running on Zero

App Files Files Community

Veena committed on Nov 7

Commit

e5b76b7

1 Parent(s): d1c3c57

Remove maya1 directory (using transformers)

Browse files

Files changed (8) hide show

maya1/__init__.py +0 -7
maya1/api_v2.py +0 -342
maya1/constants.py +0 -95
maya1/model_loader.py +0 -145
maya1/pipeline.py +0 -128
maya1/prompt_builder.py +0 -31
maya1/snac_decoder.py +0 -515
maya1/streaming_pipeline.py +0 -159

maya1/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-"""
-Maya1 TTS Inference System
-Open-source inference for description-conditioned TTS with emotion control.
-"""
-__version__ = "1.0.0"
-__author__ = "Maya Research AI"

maya1/api_v2.py DELETED Viewed

@@ -1,342 +0,0 @@
-import os
-import io
-import wave
-import time
-from typing import Optional
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import StreamingResponse
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field
-from dotenv import load_dotenv
-from .model_loader import Maya1Model
-from .prompt_builder import Maya1PromptBuilder
-from .snac_decoder import SNACDecoder
-from .pipeline import Maya1Pipeline
-from .streaming_pipeline import Maya1SlidingWindowPipeline
-from .constants import (
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_P,
-    DEFAULT_MAX_TOKENS,
-    DEFAULT_REPETITION_PENALTY,
-    AUDIO_SAMPLE_RATE,
-)
-# Timeout settings (seconds)
-# Applied to non-streaming generation through asyncio.wait_for in _generate_tts_complete.
-GENERATE_TIMEOUT = 60
-# Load environment variables
-load_dotenv()
-# Initialize FastAPI app
-# docs_url and redoc_url are None; per the FastAPI docs this disables the interactive docs pages.
-app = FastAPI(
-    title="Maya1 TTS API",
-    description="Open source TTS inference for Maya1",
-    version="1.0.0",
-    docs_url=None,
-    redoc_url=None,
-)
-# NOTE(review): the FastAPI CORS documentation says allow_origins, allow_methods and
-# allow_headers should not be ["*"] when allow_credentials=True. Confirm whether
-# credentialed cross-origin requests are needed here at all.
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Global state
-# All five names are assigned by startup_event() and remain None until it has run.
-model = None
-prompt_builder = None
-snac_decoder = None
-pipeline = None
-streaming_pipeline = None
-# ============================================================================
-# Startup/Shutdown
-# ============================================================================
-@app.on_event("startup")
-async def startup_event():
-    """Create the model, prompt builder, SNAC decoder and both pipelines, storing them in module globals."""
-    global model, prompt_builder, snac_decoder, pipeline, streaming_pipeline
-    rule = "="*60
-    print("\n" + rule)
-    print(" Starting Maya1 TTS API Server")
-    print(rule + "\n")
-    # The prompt builder needs both the tokenizer and the model wrapper itself.
-    model = Maya1Model()
-    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)
-    # The decoder is created with batching on, so its background task is started here.
-    snac_decoder = SNACDecoder(
-        enable_batching=True,
-        max_batch_size=64,
-        batch_timeout_ms=15,
-    )
-    await snac_decoder.start_batch_processor()
-    # Both pipelines share the same model, prompt builder and decoder instances.
-    shared_components = (model, prompt_builder, snac_decoder)
-    pipeline = Maya1Pipeline(*shared_components)
-    streaming_pipeline = Maya1SlidingWindowPipeline(*shared_components)
-    print("\n" + rule)
-    print("Maya1 TTS API Server Ready")
-    print(rule + "\n")
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Stop the SNAC decoder's batch processor, if one exists and is running."""
-    print("\nShutting down Maya1 TTS API Server")
-    decoder = snac_decoder
-    # Nothing to stop when startup never created a decoder or its processor is idle.
-    if not decoder or not decoder.is_running:
-        return
-    await decoder.stop_batch_processor()
-# ============================================================================
-# Utility Functions
-# ============================================================================
-def create_wav_header(sample_rate: int = 24000, channels: int = 1, bits_per_sample: int = 16, data_size: int = 0) -> bytes:
-    """Build the 44-byte RIFF/WAVE header for uncompressed PCM audio.
-    Args:
-        sample_rate: Samples per second written to the fmt chunk.
-        channels: Number of interleaved channels.
-        bits_per_sample: Bit depth of one sample.
-        data_size: Size in bytes of the PCM payload that follows the header.
-            Values too large for the 32-bit size fields are clamped to
-            0xFFFFFFFF instead of raising struct.error.
-    Returns:
-        The packed little-endian header (44 bytes).
-    """
-    import struct
-    max_u32 = 0xFFFFFFFF
-    byte_rate = sample_rate * channels * bits_per_sample // 8
-    block_align = channels * bits_per_sample // 8
-    # Both size fields are unsigned 32-bit; clamp so oversized payloads still
-    # produce a header rather than an exception.
-    riff_size = min(36 + data_size, max_u32)
-    data_field = min(data_size, max_u32)
-    header = struct.pack(
-        '<4sI4s4sIHHIIHH4sI',
-        b'RIFF',
-        riff_size,
-        b'WAVE',
-        b'fmt ',
-        16,  # size of the PCM fmt chunk body
-        1,  # audio format 1 = PCM
-        channels,
-        sample_rate,
-        byte_rate,
-        block_align,
-        bits_per_sample,
-        b'data',
-        data_field,
-    )
-    return header
-# ============================================================================
-# Request/Response Models
-# ============================================================================
-class TTSRequest(BaseModel):
-    """Request body for POST /v1/tts/generate.
-    `description` and `text` are required. The sampling fields default to the
-    values imported from the constants module. `stream` selects between a
-    streamed response and a complete WAV file.
-    """
-    # NOTE(review): the Optional[...] sampling fields accept an explicit null, and
-    # generate_tts forwards the value unchanged; confirm the pipelines tolerate None.
-    description: str = Field(
-        ...,
-        description="Voice description (e.g., 'Male voice in their 30s with american accent')"
-    )
-    text: str = Field(
-        ...,
-        description="Text to synthesize (can include <emotion> tags)"
-    )
-    temperature: Optional[float] = Field(
-        default=DEFAULT_TEMPERATURE,
-        description="Sampling temperature"
-    )
-    top_p: Optional[float] = Field(
-        default=DEFAULT_TOP_P,
-        description="Nucleus sampling"
-    )
-    max_tokens: Optional[int] = Field(
-        default=DEFAULT_MAX_TOKENS,
-        description="Maximum tokens to generate"
-    )
-    repetition_penalty: Optional[float] = Field(
-        default=DEFAULT_REPETITION_PENALTY,
-        description="Repetition penalty"
-    )
-    # Only seed carries a validation constraint (non-negative).
-    seed: Optional[int] = Field(
-        default=None,
-        description="Random seed for reproducibility",
-        ge=0,
-    )
-    stream: bool = Field(
-        default=False,
-        description="Stream audio (True) or return complete WAV (False)"
-    )
-# ============================================================================
-# Endpoints
-# ============================================================================
-@app.get("/")
-async def root():
-    """Return static service metadata and the list of public endpoints."""
-    endpoints = {
-        "generate": "/v1/tts/generate (POST)",
-        "health": "/health (GET)",
-    }
-    service_info = {
-        "service": "Maya1 TTS API",
-        "version": "1.0.0",
-        "status": "running",
-        "model": "Maya1-Voice (open source)",
-        "endpoints": endpoints,
-    }
-    return service_info
-@app.get("/health")
-async def health_check():
-    """Report a fixed "healthy" status together with the current Unix timestamp."""
-    now = time.time()
-    return dict(status="healthy", model="Maya1-Voice", timestamp=now)
-# ============================================================================
-# TTS Generation Endpoint
-# ============================================================================
-@app.post("/v1/tts/generate")
-async def generate_tts(request: TTSRequest):
-    """Dispatch a TTS request to the streaming or the complete-WAV handler.
-    HTTPException raised by a handler is passed through unchanged; any other
-    exception is printed and converted to a 500 response.
-    """
-    try:
-        # Both handlers take the same keyword arguments, so only the target differs.
-        handler = _generate_tts_streaming if request.stream else _generate_tts_complete
-        return await handler(
-            description=request.description,
-            text=request.text,
-            temperature=request.temperature,
-            top_p=request.top_p,
-            max_tokens=request.max_tokens,
-            repetition_penalty=request.repetition_penalty,
-            seed=request.seed,
-        )
-    except HTTPException:
-        raise
-    except Exception as e:
-        print(f" Error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-async def _generate_tts_complete(
-    description: str,
-    text: str,
-    temperature: float,
-    top_p: float,
-    max_tokens: int,
-    repetition_penalty: float,
-    seed: Optional[int],
-):
-    """Run the non-streaming pipeline and return the result as a WAV attachment.
-    Generation is limited to GENERATE_TIMEOUT seconds; a timeout becomes a 504.
-    A None result from the pipeline raises a plain Exception.
-    """
-    import asyncio
-    try:
-        generation = pipeline.generate_speech(
-            description=description,
-            text=text,
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens,
-            repetition_penalty=repetition_penalty,
-            seed=seed,
-        )
-        audio_bytes = await asyncio.wait_for(generation, timeout=GENERATE_TIMEOUT)
-        if audio_bytes is None:
-            raise Exception("Audio generation failed")
-        # Wrap the raw PCM in a mono, 16-bit WAV container held in memory.
-        wav_buffer = io.BytesIO()
-        with wave.open(wav_buffer, 'wb') as wav_file:
-            wav_file.setnchannels(1)
-            wav_file.setsampwidth(2)
-            wav_file.setframerate(AUDIO_SAMPLE_RATE)
-            wav_file.writeframes(audio_bytes)
-        wav_buffer.seek(0)
-        response_headers = {"Content-Disposition": "attachment; filename=output.wav"}
-        return StreamingResponse(
-            wav_buffer,
-            media_type="audio/wav",
-            headers=response_headers,
-        )
-    except asyncio.TimeoutError:
-        raise HTTPException(status_code=504, detail="Generation timeout")
-async def _generate_tts_streaming(
-    description: str,
-    text: str,
-    temperature: float,
-    top_p: float,
-    max_tokens: int,
-    repetition_penalty: float,
-    seed: Optional[int],
-):
-    """Return a StreamingResponse that emits a WAV header followed by audio chunks.
-    The header is created with the default data_size of 0 because the total
-    length is not known up front. Time to the first audio chunk is printed once.
-    """
-    start_time = time.time()
-    first_audio_time = None
-    generation_kwargs = dict(
-        description=description,
-        text=text,
-        temperature=temperature,
-        top_p=top_p,
-        max_tokens=max_tokens,
-        repetition_penalty=repetition_penalty,
-        seed=seed,
-    )
-    async def audio_stream_generator():
-        """Yield the WAV header, then each chunk produced by the streaming pipeline."""
-        nonlocal first_audio_time
-        yield create_wav_header(sample_rate=AUDIO_SAMPLE_RATE, channels=1, bits_per_sample=16)
-        async for audio_chunk in streaming_pipeline.generate_speech_stream(**generation_kwargs):
-            # Record latency only for the very first chunk.
-            if first_audio_time is None:
-                first_audio_time = time.time()
-                ttfb_ms = (first_audio_time - start_time) * 1000
-                print(f"⏱️  TTFB: {ttfb_ms:.1f}ms")
-            yield audio_chunk
-    try:
-        body = audio_stream_generator()
-        return StreamingResponse(
-            body,
-            media_type="audio/wav",
-            headers={"Cache-Control": "no-cache"},
-        )
-    except Exception as e:
-        print(f"Streaming error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-# Script entry point: serve the app with uvicorn on all interfaces, port 8000.
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

maya1/constants.py DELETED Viewed

@@ -1,95 +0,0 @@
-"""
-Maya1 Constants
-Token IDs and special tokens used in the model.
-Matches training configuration exactly.
-"""
-# Turn-delimiter control tokens
-SOH_ID = 128259  # Start of Human turn
-EOH_ID = 128260  # End of Human turn
-SOA_ID = 128261  # Start of AI turn
-EOA_ID = 128262  # End of AI turn (not used in maya1)
-PAD_ID = 128263  # Padding token
-# Text tokens
-BOS_ID = 128000  # Begin of sequence (Llama BOS)
-TEXT_EOT_ID = 128009  # End of text (appears in prefix, not a stop token!)
-# Audio tokens
-CODE_START_TOKEN_ID = 128257  # SOS - Start of Speech
-CODE_END_TOKEN_ID = 128258   # EOS - End of Speech (audio stop token)
-CODE_TOKEN_OFFSET = 128266   # Start of SNAC codes
-# SNAC token range: 7 slots of 4096 codes each, starting at the code offset
-SNAC_MIN_ID = CODE_TOKEN_OFFSET  # 128266
-SNAC_MAX_ID = CODE_TOKEN_OFFSET + (7 * 4096) - 1  # 156937
-# Stop tokens for generation
-# CRITICAL: Only use CODE_END_TOKEN_ID (128258) for audio generation
-# TEXT_EOT_ID (128009) appears in prefix and should NOT stop generation
-TRAINING_STOP_TOKEN_IDS = [CODE_END_TOKEN_ID]  # [128258]
-ALL_POSSIBLE_STOP_TOKENS = [TEXT_EOT_ID, CODE_END_TOKEN_ID]  # For reference only
-# 20 Extended Emotion Tags (must be single tokens), kept in alphabetical order
-ALL_EMOTION_TAGS = [
-    f'<{tag_name}>'
-    for tag_name in (
-        'angry', 'appalled', 'chuckle', 'cry', 'curious',
-        'disappointed', 'excited', 'exhale', 'gasp', 'giggle',
-        'gulp', 'laugh', 'laugh_harder', 'mischievous', 'sarcastic',
-        'scream', 'sigh', 'sing', 'snort', 'whisper',
-    )
-]
-# Model configuration
-# NOTE(review): Maya1Model in model_loader.py does not read DEFAULT_MODEL_PATH; it falls back
-# to the MAYA1_MODEL_PATH environment variable and then ~/models/maya1-voice. Confirm intent.
-DEFAULT_MODEL_PATH = "maya-research/maya1"
-DEFAULT_CHECKPOINT = "checkpoint-25000"
-DEFAULT_MAX_MODEL_LEN = 8192
-# SNAC configuration
-SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
-SNAC_SAMPLE_RATE = 24000
-SNAC_TOKENS_PER_FRAME = 7
-SNAC_LEVELS = 3
-# Audio configuration
-AUDIO_SAMPLE_RATE = 24000
-AUDIO_CHANNELS = 1
-AUDIO_BITS_PER_SAMPLE = 16
-# Generation defaults
-DEFAULT_TEMPERATURE = 0.4  # Lower temp for more stable generation
-DEFAULT_TOP_P = 0.9
-DEFAULT_MAX_TOKENS = 2048  # Reasonable default for most use cases
-DEFAULT_MIN_TOKENS = 28  # At least 4 SNAC frames
-DEFAULT_REPETITION_PENALTY = 1.1
-DEFAULT_SEED = None  # None = random, set integer for reproducibility
-# IMPORTANT: Emotion tags consume audio time!
-# <laugh> = ~4-6 seconds (~300-400 tokens)
-# <excited>, <chuckle> = ~1-2 seconds (~50-150 tokens)
-# Recommended max_tokens by use case:
-# - Short phrases (< 10 words): 150-250 tokens (~3-5s)
-# - Medium text (10-30 words): 250-500 tokens (~5-10s)
-# - Long text (30+ words): 500-1500 tokens (~10-30s)
-# - Very long text: 1500-2000 tokens (~30-42s)
-# Note: 1 second ≈ 48 tokens (7 tokens/frame * 6.86 frames/sec)
-# Streaming configuration
-STREAM_BUFFER_SIZE = 28  # 4 frames (process every 28 tokens)
-# NOTE(review): api_v2.startup_event passes the literals 64 and 15 to SNACDecoder instead of
-# importing the two values below; keep them in sync or switch the caller to these constants.
-SNAC_BATCH_SIZE = 64
-SNAC_BATCH_TIMEOUT_MS = 15

maya1/model_loader.py DELETED Viewed

@@ -1,145 +0,0 @@
-"""
-Maya1 Model Loader
-Loads Maya1 model with vLLM engine and validates emotion tags.
-"""
-import os
-from transformers import AutoTokenizer
-from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
-from .constants import (
-    ALL_EMOTION_TAGS,
-    DEFAULT_MAX_MODEL_LEN,
-    SOH_ID, EOH_ID, SOA_ID, BOS_ID, TEXT_EOT_ID, CODE_START_TOKEN_ID,
-)
-class Maya1Model:
-    """Maya1 TTS model: a Hugging Face tokenizer paired with a vLLM async engine.
-    Construction loads the tokenizer, checks that every emotion tag encodes to a
-    single token, caches the special-token strings used for prompt building and
-    then creates the AsyncLLMEngine.
-    """
-    def __init__(
-        self,
-        model_path: str = None,
-        dtype: str = "bfloat16",
-        max_model_len: int = DEFAULT_MAX_MODEL_LEN,
-        gpu_memory_utilization: float = 0.85,
-        tensor_parallel_size: int = 1,
-        **engine_kwargs
-    ):
-        """
-        Initialize Maya1 model with vLLM.
-        Args:
-            model_path: Path to checkpoint (local or HF repo). When None, the
-                MAYA1_MODEL_PATH environment variable is used, falling back
-                to ~/models/maya1-voice.
-            dtype: Model precision (bfloat16 recommended)
-            max_model_len: Maximum sequence length
-            gpu_memory_utilization: GPU memory fraction
-            tensor_parallel_size: Number of GPUs
-            **engine_kwargs: Extra keyword arguments forwarded to AsyncEngineArgs
-        Raises:
-            AssertionError: If any emotion tag is not a single token.
-        """
-        # Use provided path or environment variable or default
-        if model_path is None:
-            model_path = os.environ.get(
-                'MAYA1_MODEL_PATH',
-                os.path.expanduser('~/models/maya1-voice')
-            )
-        self.model_path = model_path
-        self.dtype = dtype
-        print("Initializing Maya1 Model")
-        print(f"Model: {model_path}")
-        # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True,
-        )
-        print(f"Tokenizer loaded: {len(self.tokenizer)} tokens")
-        # Fail fast, before the engine is created, if the tokenizer lacks the tags.
-        self._validate_emotion_tags()
-        # Precompute special token strings
-        self._init_special_tokens()
-        # Initialize vLLM engine
-        print("Initializing vLLM engine...")
-        engine_args = AsyncEngineArgs(
-            model=model_path,
-            tokenizer=model_path,
-            dtype=dtype,
-            max_model_len=max_model_len,
-            gpu_memory_utilization=gpu_memory_utilization,
-            tensor_parallel_size=tensor_parallel_size,
-            trust_remote_code=True,
-            disable_log_stats=False,
-            **engine_kwargs
-        )
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-        print("Maya1 Model ready\n")
-    def _validate_emotion_tags(self):
-        """Check that every tag in ALL_EMOTION_TAGS encodes to exactly one token.
-        Raises:
-            AssertionError: Listing each offending tag with its token count.
-        """
-        failed_tags = []
-        for tag in ALL_EMOTION_TAGS:
-            token_ids = self.tokenizer.encode(tag, add_special_tokens=False)
-            if len(token_ids) != 1:
-                failed_tags.append((tag, len(token_ids)))
-        if failed_tags:
-            # Name the offending tags so the failure can be diagnosed from the log alone.
-            details = ", ".join(f"{tag} -> {count} tokens" for tag, count in failed_tags)
-            print(f"ERROR: {len(failed_tags)} emotion tags are NOT single tokens! ({details})")
-            raise AssertionError(f"Emotion tags validation failed: {details}")
-        print(f"All {len(ALL_EMOTION_TAGS)} emotion tags validated")
-    def _init_special_tokens(self):
-        """Precompute special token strings for fast prefix building."""
-        self.soh_token = self.tokenizer.decode([SOH_ID])
-        self.bos_token = self.tokenizer.bos_token
-        self.eot_token = self.tokenizer.decode([TEXT_EOT_ID])
-        self.eoh_token = self.tokenizer.decode([EOH_ID])
-        self.soa_token = self.tokenizer.decode([SOA_ID])
-        self.sos_token = self.tokenizer.decode([CODE_START_TOKEN_ID])
-    @staticmethod
-    def _new_request_id() -> str:
-        """Return a request id that is unique for every call."""
-        # id(prompt) can repeat (ids are reused, and one string object may be
-        # submitted twice), so a random UUID is used instead.
-        import uuid
-        return f"req_{uuid.uuid4().hex}"
-    async def generate(self, prompt: str, sampling_params: SamplingParams):
-        """
-        Generate tokens from prompt (non-streaming).
-        Args:
-            prompt: Input prompt
-            sampling_params: vLLM sampling parameters
-        Returns:
-            A one-element list holding the last output yielded by the engine,
-            or an empty list if the engine yielded nothing.
-        """
-        request_id = self._new_request_id()
-        # Only the final item yielded by the engine is kept.
-        final_output = None
-        async for output in self.engine.generate(
-            prompt=prompt,
-            sampling_params=sampling_params,
-            request_id=request_id
-        ):
-            final_output = output
-        return [final_output] if final_output else []
-    async def generate_stream(self, prompt: str, sampling_params: SamplingParams):
-        """
-        Generate tokens from prompt (streaming).
-        Args:
-            prompt: Input prompt
-            sampling_params: vLLM sampling parameters
-        Yields:
-            Each output produced by the vLLM engine, in order.
-        """
-        request_id = self._new_request_id()
-        # Stream from engine
-        async for output in self.engine.generate(
-            prompt=prompt,
-            sampling_params=sampling_params,
-            request_id=request_id
-        ):
-            yield output

maya1/pipeline.py DELETED Viewed

@@ -1,128 +0,0 @@
-"""
-Maya1 Generation Pipeline
-End-to-end pipeline for TTS generation (non-streaming).
-"""
-import asyncio
-from typing import Optional, List
-from vllm import SamplingParams
-from .constants import (
-    CODE_END_TOKEN_ID,
-    CODE_START_TOKEN_ID,
-    SNAC_MIN_ID,
-    SNAC_MAX_ID,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_P,
-    DEFAULT_MAX_TOKENS,
-    DEFAULT_MIN_TOKENS,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_SEED,
-)
-class Maya1Pipeline:
-    """Non-streaming TTS pipeline: prompt building, token generation, SNAC decoding."""
-    def __init__(self, model, prompt_builder, snac_decoder):
-        """
-        Store the three collaborators used by generate_speech.
-        Args:
-            model: Maya1Model instance
-            prompt_builder: Maya1PromptBuilder instance
-            snac_decoder: SNACDecoder instance
-        """
-        self.model, self.prompt_builder, self.snac_decoder = model, prompt_builder, snac_decoder
-        print("✅ Maya1Pipeline initialized")
-    async def generate_speech(
-        self,
-        description: str,
-        text: str,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        max_tokens: int = DEFAULT_MAX_TOKENS,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        seed: Optional[int] = None,
-    ) -> Optional[bytes]:
-        """
-        Synthesize speech for one request and return it in a single piece.
-        Args:
-            description: Voice description
-            text: Text to synthesize (may include <emotion> tags)
-            temperature: Sampling temperature
-            top_p: Nucleus sampling
-            max_tokens: Max SNAC tokens to generate
-            repetition_penalty: Prevent loops
-            seed: Random seed; DEFAULT_SEED is used when None
-        Returns:
-            Whatever snac_decoder.decode_single_async returns for the extracted
-            codes, or None when the model produced no output or no SNAC codes.
-        """
-        prompt = self.prompt_builder.build_prefix(description, text)
-        effective_seed = DEFAULT_SEED if seed is None else seed
-        sampling_params = SamplingParams(
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens,
-            min_tokens=DEFAULT_MIN_TOKENS,
-            repetition_penalty=repetition_penalty,
-            stop_token_ids=[CODE_END_TOKEN_ID],
-            seed=effective_seed,
-        )
-        outputs = await self.model.generate(prompt, sampling_params)
-        if not outputs:
-            return None
-        generated_token_ids = outputs[0].outputs[0].token_ids
-        snac_codes = self._extract_snac_codes(generated_token_ids)
-        if not snac_codes:
-            return None
-        audio_bytes = await self.snac_decoder.decode_single_async(snac_codes)
-        if audio_bytes:
-            # 7 tokens per frame; 6.86 frames per second is the rate used for the log estimate.
-            frames = len(snac_codes) // 7
-            duration_sec = frames / 6.86
-            print(f" Generated {frames} frames (~{duration_sec:.1f}s audio)")
-        return audio_bytes
-    def _extract_snac_codes(self, token_ids: List[int]) -> List[int]:
-        """Return the SNAC-range token IDs found between the first SOS and the first EOS.
-        A missing SOS means the window starts at the beginning; a missing EOS
-        means it runs to the end.
-        """
-        if CODE_START_TOKEN_ID in token_ids:
-            sos_idx = token_ids.index(CODE_START_TOKEN_ID)
-        else:
-            sos_idx = -1
-        if CODE_END_TOKEN_ID in token_ids:
-            eos_idx = token_ids.index(CODE_END_TOKEN_ID)
-        else:
-            eos_idx = len(token_ids)
-        # With sos_idx == -1 the slice start becomes 0, i.e. "everything before EOS".
-        window = token_ids[sos_idx + 1:eos_idx]
-        return [tok for tok in window if SNAC_MIN_ID <= tok <= SNAC_MAX_ID]

maya1/prompt_builder.py DELETED Viewed

@@ -1,31 +0,0 @@
-"""
-Maya1 Prompt Builder
-Builds formatted prompts for description-conditioned TTS.
-Format: <SOH><BOS><description="..."> text<EOT><EOH><SOA><SOS>
-"""
-from .constants import ALL_EMOTION_TAGS
-class Maya1PromptBuilder:
-    """Assembles the text prefix that the Maya1 model expects before audio generation."""
-    def __init__(self, tokenizer, model):
-        """Keep references to the tokenizer and to the model that holds the special-token strings."""
-        self.tokenizer = tokenizer
-        self.model = model
-    def build_prefix(self, description: str, text: str) -> str:
-        """Return <SOH><BOS><description="..."> text<EOT><EOH><SOA><SOS> as one string."""
-        source = self.model
-        pieces = (
-            source.soh_token,
-            source.bos_token,
-            f'<description="{description}"> {text}',
-            source.eot_token,
-            source.eoh_token,
-            source.soa_token,
-            source.sos_token,
-        )
-        return "".join(pieces)

maya1/snac_decoder.py DELETED Viewed

@@ -1,515 +0,0 @@
-import torch
-import numpy as np
-import asyncio
-from typing import List, Optional, Tuple
-from snac import SNAC
-from .constants import (
-    CODE_END_TOKEN_ID,
-    CODE_TOKEN_OFFSET,
-    SNAC_MODEL_NAME,
-    SNAC_SAMPLE_RATE,
-    SNAC_TOKENS_PER_FRAME,
-)
-class SNACDecoder:
-    """
-    SNAC Decoder for maya1.
-    Unpacks 7-token SNAC frames and decodes to audio waveforms.
-    Unpacking logic is the EXACT INVERSE of training preprocessing.
-    Supports async batching for concurrent requests.
-    CRITICAL: Any mismatch in unpacking will produce garbage audio.
-    """
-    def __init__(
-        self,
-        device: str = "cuda",
-        compile_decoder: bool = False,
-        enable_batching: bool = False,
-        max_batch_size: int = 64,
-        batch_timeout_ms: int = 15,
-    ):
-        """
-        Load the SNAC model and, optionally, prepare compilation and batching.
-        Args:
-            device: Device the SNAC model is moved to (cuda/cpu)
-            compile_decoder: Run _compile_model() after loading
-            enable_batching: Create the request queue and batch-processor state
-            max_batch_size: Max sequences to batch together
-            batch_timeout_ms: Max wait time before processing batch
-        """
-        self.device = device
-        self.enable_batching = enable_batching
-        self.max_batch_size = max_batch_size
-        self.batch_timeout_ms = batch_timeout_ms
-        print(f"Loading SNAC 24kHz model to {device}...")
-        loaded = SNAC.from_pretrained(SNAC_MODEL_NAME)
-        self.snac_model = loaded.eval().to(device)
-        if compile_decoder:
-            print("Compiling SNAC decoder with torch.compile...")
-            self._compile_model()
-        # The queue, task handle and running flag only exist when batching is on;
-        # is_running checks enable_batching before touching them.
-        if enable_batching:
-            self.request_queue = asyncio.Queue()
-            self.batch_processor_task = None
-            self._running = False
-            print(f"Batching enabled (max_batch={max_batch_size}, timeout={batch_timeout_ms}ms)")
-        print("SNAC decoder initialized")
-    def _compile_model(self):
-        """Run a few dummy decodes, then wrap the decoder and quantizer with torch.compile."""
-        # Dummy passes at 4, 16 and 32 frames; the three code levels hold 1x, 2x and 4x entries.
-        for base_frames in (4, 16, 32):
-            warmup_codes = [
-                torch.randint(0, 4096, (1, base_frames * scale), device=self.device)
-                for scale in (1, 2, 4)
-            ]
-            with torch.inference_mode():
-                latent = self.snac_model.quantizer.from_codes(warmup_codes)
-                self.snac_model.decoder(latent)
-        compiled_decoder = torch.compile(self.snac_model.decoder, mode="max-autotune")
-        self.snac_model.decoder = compiled_decoder
-        compiled_quantizer = torch.compile(self.snac_model.quantizer, mode="reduce-overhead")
-        self.snac_model.quantizer = compiled_quantizer
-        print("SNAC decoder compiled")
-    def unpack_snac_from_7(self, vocab_ids: List[int]) -> List[List[int]]:
-        """
-        Split a flat list of 7-token SNAC frames into the three code levels.
-        Described in the original as the exact inverse of the training-side
-        `pack_snac_to_7_and_offset()`.
-        Slot layout of frame i:
-        - slot0 -> L1[i]
-        - slot1 -> L2[2*i],   slot4 -> L2[2*i + 1]
-        - slot2 -> L3[4*i],   slot3 -> L3[4*i + 1]
-        - slot5 -> L3[4*i + 2], slot6 -> L3[4*i + 3]
-        Args:
-            vocab_ids: SNAC token IDs. A trailing CODE_END_TOKEN_ID is dropped
-                       and any incomplete final frame is ignored.
-        Returns:
-            [L1, L2, L3] holding n, 2n and 4n codes for n complete frames;
-            three empty lists when there is no complete frame.
-        """
-        if vocab_ids and vocab_ids[-1] == CODE_END_TOKEN_ID:
-            vocab_ids = vocab_ids[:-1]
-        n_frames = len(vocab_ids) // SNAC_TOKENS_PER_FRAME
-        if n_frames == 0:
-            return [[], [], []]
-        usable = vocab_ids[:n_frames * SNAC_TOKENS_PER_FRAME]
-        # Remove the vocabulary offset, then reduce modulo 4096 to recover the codebook index.
-        codes = [(tok - CODE_TOKEN_OFFSET) % 4096 for tok in usable]
-        coarse, medium, fine = [], [], []
-        for start in range(0, n_frames * 7, 7):
-            s0, s1, s2, s3, s4, s5, s6 = codes[start:start + 7]
-            coarse.append(s0)
-            medium += [s1, s4]
-            fine += [s2, s3, s5, s6]
-        return [coarse, medium, fine]
-    @torch.inference_mode()
-    def decode(
-        self,
-        snac_tokens: List[int],
-        trim_warmup: bool = True,
-        trim_amount: Optional[int] = None,
-        use_sliding_window: bool = False
-    ) -> Optional[np.ndarray]:
-        """
-        Decode SNAC tokens to an audio waveform.
-        Args:
-            snac_tokens: List of SNAC token IDs (7 per frame)
-            trim_warmup: In standard mode, drop leading samples
-            trim_amount: How many leading samples to drop; 2048 when None
-            use_sliding_window: Return only samples 2048..4095 when at least
-                4096 samples were decoded; takes precedence over trimming
-        Returns:
-            1-D numpy array taken from output[0, 0] of the SNAC decoder,
-            or None when fewer than one complete frame is available.
-        """
-        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
-            print(f"Not enough SNAC tokens: {len(snac_tokens)} < {SNAC_TOKENS_PER_FRAME}")
-            return None
-        levels = self.unpack_snac_from_7(snac_tokens)
-        if not levels[0]:
-            return None
-        # One (1, length) long tensor per level, on the decoder's device.
-        codes = []
-        for level in levels:
-            level_tensor = torch.tensor(level, dtype=torch.long, device=self.device)
-            codes.append(level_tensor.unsqueeze(0))
-        latent = self.snac_model.quantizer.from_codes(codes)
-        waveform = self.snac_model.decoder(latent)[0, 0].cpu().numpy()
-        if use_sliding_window:
-            # Shorter outputs (e.g. a final chunk) are returned whole.
-            return waveform[2048:4096] if len(waveform) >= 4096 else waveform
-        if not trim_warmup:
-            return waveform
-        if trim_amount is None:
-            trim_amount = 2048
-        # Only trim when something would be left afterwards.
-        if len(waveform) > trim_amount:
-            waveform = waveform[trim_amount:]
-        return waveform
-    def decode_to_bytes(
-        self,
-        snac_tokens: List[int],
-        trim_warmup: bool = True,
-        use_sliding_window: bool = False
-    ) -> Optional[bytes]:
-        """
-        Decode SNAC tokens to audio bytes (int16 PCM).
-        Args:
-            snac_tokens: List of SNAC token IDs
-            trim_warmup: Forwarded to decode()
-            use_sliding_window: Forwarded to decode()
-        Returns:
-            The waveform from decode() scaled by 32767 and stored as int16 bytes,
-            or None if decode() returned None.
-        """
-        audio = self.decode(snac_tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window)
-        if audio is None:
-            return None
-        # Clip to [-1, 1] first: a sample outside that range would otherwise
-        # overflow the int16 range during the cast.
-        audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
-        return audio_int16.tobytes()
-    def validate_tokens(self, snac_tokens: List[int]) -> bool:
-        """
-        Check a token list before decoding, printing the reason for any failure.
-        Args:
-            snac_tokens: List of SNAC token IDs
-        Returns:
-            False when there are fewer tokens than one frame or any token lies
-            outside [CODE_TOKEN_OFFSET, 156937]; True otherwise. A length that
-            is not a multiple of 7 only produces a warning.
-        """
-        token_count = len(snac_tokens)
-        if token_count < SNAC_TOKENS_PER_FRAME:
-            print(f"Too few tokens: {len(snac_tokens)}")
-            return False
-        if token_count % SNAC_TOKENS_PER_FRAME:
-            print(f"  Warning: Token count {len(snac_tokens)} not divisible by 7")
-            print(f"   Will truncate to {(len(snac_tokens) // 7) * 7}")
-        # Stop at the first out-of-range token.
-        for i, token_id in enumerate(snac_tokens):
-            if CODE_TOKEN_OFFSET <= token_id <= 156937:
-                continue
-            print(f" Invalid token at position {i}: {token_id}")
-            print(f"   Expected range: [{CODE_TOKEN_OFFSET}, 156937]")
-            return False
-        return True
-    # ========== Async Batching Methods ==========
-    @property
-    def is_running(self) -> bool:
-        """Report whether the batch processor is active; always False when batching is disabled."""
-        if not self.enable_batching:
-            return False
-        return self._running
-    async def start_batch_processor(self):
-        """
-        Launch the background task that runs _batch_processor_loop().
-        Does nothing when batching is disabled. If the processor is already
-        running, prints a notice and leaves the existing task untouched.
-        """
-        if self.enable_batching:
-            if self._running:
-                print("Batch processor already running")
-            else:
-                # Raise the flag before scheduling so the loop's `while self._running` holds on entry
-                self._running = True
-                loop_coro = self._batch_processor_loop()
-                self.batch_processor_task = asyncio.create_task(loop_coro)
-                print("Batch processor started")
-    async def stop_batch_processor(self):
-        """
-        Cancel the background batch processor task and wait for it to finish.
-        Does nothing when batching is disabled or the processor is not running.
-        """
-        if not (self.enable_batching and self._running):
-            return
-        # Lower the flag first so the loop also exits on its own next check
-        self._running = False
-        task = self.batch_processor_task
-        if task:
-            task.cancel()
-            try:
-                await task
-            except asyncio.CancelledError:
-                # Expected outcome of awaiting a task we just cancelled
-                pass
-        print("Batch processor stopped")
-    async def decode_single_async(
-        self,
-        snac_tokens: List[int],
-        trim_warmup: bool = True,
-        use_sliding_window: bool = False
-    ) -> Optional[bytes]:
-        """
-        Async decode for batching support.
-        With batching enabled, the request is placed on request_queue together
-        with a future, and this coroutine waits on that future. With batching
-        disabled it calls decode_to_bytes() directly (synchronously).
-        NOTE: nothing here checks that the batch processor is running; if no
-        consumer resolves the future, the await below never completes.
-        Args:
-            snac_tokens: List of SNAC token IDs
-            trim_warmup: Whether to trim SNAC warmup samples (default: True)
-            use_sliding_window: If True, only return middle 2048 samples (for sliding window streaming)
-        Returns:
-            Audio bytes or None if decode fails
-        """
-        if not self.enable_batching:
-            # Fallback to synchronous decode
-            return self.decode_to_bytes(snac_tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window)
-        # Create the result future on the loop that is running this coroutine
-        # (the documented way to obtain a Future)
-        result_future = asyncio.get_running_loop().create_future()
-        # Queue entry layout: (tokens, trim_warmup, use_sliding_window, future)
-        await self.request_queue.put((snac_tokens, trim_warmup, use_sliding_window, result_future))
-        # Wait for the consumer to set a result or exception
-        return await result_future
-    async def _batch_processor_loop(self):
-        """
-        Background task body: collect and process batches while _running is set.
-        Cancellation ends the loop. Any other exception is printed with its
-        traceback and the loop carries on with the next batch.
-        """
-        while self._running:
-            try:
-                pending = await self._collect_batch()
-                # An empty batch means the collect step timed out with nothing queued
-                if pending:
-                    await self._process_batch(pending)
-            except asyncio.CancelledError:
-                break
-            except Exception as exc:
-                print(f"Batch processor error: {exc}")
-                import traceback
-                traceback.print_exc()
-    async def _collect_batch(self) -> List[Tuple[List[int], bool, bool, asyncio.Future]]:
-        """
-        Collect requests into a batch.
-        Waits up to batch_timeout_ms for a first request. Once one arrives,
-        keeps collecting until the batch holds max_batch_size items or a
-        single batch_timeout_ms window (measured from the first arrival) has
-        elapsed, so a slow trickle of requests cannot stretch the wait to
-        max_batch_size timeouts.
-        Returns:
-            List of (tokens, trim_warmup, use_sliding_window, future) tuples;
-            empty if no request arrived within the timeout
-        """
-        timeout_sec = self.batch_timeout_ms / 1000.0
-        try:
-            # Wait for the request that opens the batch
-            first_item = await asyncio.wait_for(
-                self.request_queue.get(),
-                timeout=timeout_sec
-            )
-        except asyncio.TimeoutError:
-            # No requests in timeout period
-            return []
-        batch = [first_item]
-        # One deadline shared by every further item in this batch
-        loop = asyncio.get_running_loop()
-        deadline = loop.time() + timeout_sec
-        while len(batch) < self.max_batch_size:
-            remaining = deadline - loop.time()
-            if remaining <= 0:
-                break
-            try:
-                item = await asyncio.wait_for(
-                    self.request_queue.get(),
-                    timeout=remaining
-                )
-            except asyncio.TimeoutError:
-                break  # Window closed, process what we have
-            batch.append(item)
-        return batch
-    async def _process_batch(self, batch: List[Tuple[List[int], bool, bool, asyncio.Future]]):
-        """
-        Process a batch of decode requests and resolve their futures.
-        When the batch has more than one request and all token lists have the
-        same length, they are decoded together via _decode_batch_same_length().
-        Otherwise each request is decoded on its own with decode_to_bytes().
-        A decode error is delivered through the affected future(s) rather than
-        raised from this method.
-        Inference mode is entered explicitly around each decode call. Used as
-        a decorator on an `async def`, torch.inference_mode() only wraps the
-        call that creates the coroutine, so the body would run with it off.
-        Args:
-            batch: List of (tokens, trim_warmup, use_sliding_window, future) tuples
-        """
-        if not batch:
-            return
-        # Extract components
-        token_sequences = [item[0] for item in batch]
-        trim_warmup_flags = [item[1] for item in batch]
-        sliding_window_flags = [item[2] for item in batch]
-        futures = [item[3] for item in batch]
-        can_batch_efficiently = len({len(tokens) for tokens in token_sequences}) == 1
-        if can_batch_efficiently and len(batch) > 1:
-            # Efficient batching: all same length
-            try:
-                # _decode_batch_same_length() has no suspension points, so the
-                # thread-local mode is not held while other tasks run
-                with torch.inference_mode():
-                    audio_bytes_list = await self._decode_batch_same_length(
-                        token_sequences, trim_warmup_flags, sliding_window_flags
-                    )
-                # Set results
-                for future, audio_bytes in zip(futures, audio_bytes_list):
-                    if not future.done():
-                        future.set_result(audio_bytes)
-            except Exception as e:
-                # One failure fails every request that shared the batch
-                for future in futures:
-                    if not future.done():
-                        future.set_exception(e)
-        else:
-            # Sequential decode (different lengths or single item)
-            for tokens, trim_warmup, use_sliding_window, future in batch:
-                try:
-                    with torch.inference_mode():
-                        audio_bytes = self.decode_to_bytes(
-                            tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window
-                        )
-                    if not future.done():
-                        future.set_result(audio_bytes)
-                except Exception as e:
-                    if not future.done():
-                        future.set_exception(e)
-    async def _decode_batch_same_length(
-        self,
-        token_sequences: List[List[int]],
-        trim_warmup_flags: List[bool],
-        sliding_window_flags: List[bool]
-    ) -> List[Optional[bytes]]:
-        """
-        Decode multiple sequences with same length in one model call.
-        Each sequence is unpacked with unpack_snac_from_7(). Sequences whose
-        first code level comes back empty are skipped and get None in the
-        result. The rest are stacked per level, decoded together under
-        torch.inference_mode(), then trimmed per request and converted to
-        int16 PCM (clamped to [-1.0, 1.0] before scaling).
-        Args:
-            token_sequences: List of token sequences (all same length)
-            trim_warmup_flags: List of trim_warmup flags for each sequence
-            sliding_window_flags: List of use_sliding_window flags for each sequence
-        Returns:
-            List of audio bytes, aligned with token_sequences (None where a
-            sequence could not be unpacked)
-        """
-        if not token_sequences:
-            return []
-        # Unpack all sequences
-        unpacked_list = [self.unpack_snac_from_7(tokens) for tokens in token_sequences]
-        # Keep only the sequences that produced at least one frame
-        valid_indices = [i for i, levels in enumerate(unpacked_list) if levels[0]]
-        if not valid_indices:
-            return [None] * len(token_sequences)
-        # The caller's decorator does not cover this coroutine body, so enter
-        # inference mode here for all tensor work
-        with torch.inference_mode():
-            # Build batched codes [batch, frames], [batch, 2*frames], [batch, 4*frames]
-            codes = [
-                torch.stack([
-                    torch.tensor(unpacked_list[i][level_idx], dtype=torch.long, device=self.device)
-                    for i in valid_indices
-                ], dim=0)
-                for level_idx in range(3)
-            ]
-            # Batched decode
-            z_q = self.snac_model.quantizer.from_codes(codes)
-            audio_batch = self.snac_model.decoder(z_q)  # [batch, 1, samples]
-            # Single device-to-host copy for the whole batch
-            waveforms = audio_batch[:, 0].detach().cpu().numpy()
-        # Extract and convert to bytes
-        audio_bytes_list = [None] * len(token_sequences)
-        for batch_idx, orig_idx in enumerate(valid_indices):
-            audio = waveforms[batch_idx]
-            # Apply sliding window or trim warmup based on flags
-            if sliding_window_flags[orig_idx]:
-                # Sliding window mode: keep middle 2048 samples only
-                if len(audio) >= 4096:
-                    audio = audio[2048:4096]
-            else:
-                # Standard mode: trim warm-up if requested
-                if trim_warmup_flags[orig_idx] and len(audio) > 2048:
-                    audio = audio[2048:]
-            # Clamp before scaling so out-of-range samples saturate instead of
-            # wrapping around on the int16 cast
-            audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
-            audio_bytes_list[orig_idx] = audio_int16.tobytes()
-        return audio_bytes_list

maya1/streaming_pipeline.py DELETED Viewed

@@ -1,159 +0,0 @@
-"""
-Maya1 Streaming Pipeline - Sliding Window Approach
-Implements sliding window technique for smooth streaming without artifacts.
-"""
-import asyncio
-from typing import AsyncGenerator, Optional
-from vllm import SamplingParams
-from .constants import (
-    CODE_END_TOKEN_ID,
-    SNAC_MIN_ID,
-    SNAC_MAX_ID,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_P,
-    DEFAULT_MAX_TOKENS,
-    DEFAULT_MIN_TOKENS,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_SEED,
-)
-class Maya1SlidingWindowPipeline:
-    """
-    Streaming TTS pipeline using sliding window approach.
-    Decodes overlapping 28-token windows (4 frames), advancing one frame per
-    decode, and emits a 2048-sample slice from the middle of each decoded
-    window. When generation ends, the audio that follows that slice in the
-    last decoded window is emitted as a final chunk.
-    """
-    # Sliding window configuration
-    WINDOW_SIZE = 28  # 4 frames (7 tokens per frame)
-    YIELD_STRIDE = 7  # Yield every 1 frame
-    MIDDLE_SAMPLES = 2048  # Keep middle 2048 samples from each decode
-    # Output format (int16 PCM, 24kHz mono): used for byte offsets and the duration log
-    BYTES_PER_SAMPLE = 2
-    SAMPLE_RATE = 24000
-    def __init__(self, model, prompt_builder, snac_decoder):
-        """
-        Initialize sliding window streaming pipeline.
-        Args:
-            model: Maya1Model instance
-            prompt_builder: Maya1PromptBuilder instance
-            snac_decoder: SNACDecoder instance
-        """
-        self.model = model
-        self.prompt_builder = prompt_builder
-        self.snac_decoder = snac_decoder
-        print("Sliding window pipeline initialized")
-    async def generate_speech_stream(
-        self,
-        description: str,
-        text: str,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        max_tokens: int = DEFAULT_MAX_TOKENS,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        seed: Optional[int] = None,
-    ) -> AsyncGenerator[bytes, None]:
-        """
-        Generate speech audio with sliding window streaming.
-        Args:
-            description: Voice description
-            text: Text to synthesize (may include <emotion> tags)
-            temperature: Sampling temperature
-            top_p: Nucleus sampling
-            max_tokens: Max SNAC tokens to generate
-            repetition_penalty: Prevent loops
-            seed: Random seed (DEFAULT_SEED is used when None)
-        Yields:
-            Audio bytes (int16 PCM, 24kHz mono)
-        """
-        # Build prompt
-        prompt = self.prompt_builder.build_prefix(description, text)
-        # Configure sampling
-        sampling_params = SamplingParams(
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens,
-            min_tokens=DEFAULT_MIN_TOKENS,
-            repetition_penalty=repetition_penalty,
-            stop_token_ids=[CODE_END_TOKEN_ID],
-            seed=seed if seed is not None else DEFAULT_SEED,
-        )
-        # Streaming state
-        snac_buffer = []
-        last_yield_position = 0
-        chunk_count = 0
-        total_tokens_seen = 0
-        streamed_bytes = 0
-        # Audio that follows the emitted slice in the most recent decode;
-        # flushed after generation ends so the tail is not dropped
-        pending_tail = b""
-        async for output in self.model.generate_stream(prompt, sampling_params):
-            # Get latest generated tokens (cumulative list)
-            generated_token_ids = output.outputs[0].token_ids
-            # Process only NEW tokens since last iteration
-            new_tokens = generated_token_ids[total_tokens_seen:]
-            total_tokens_seen = len(generated_token_ids)
-            # Collect SNAC codes from new tokens
-            finished = False
-            for token_id in new_tokens:
-                # Stop if we hit EOS
-                if token_id == CODE_END_TOKEN_ID:
-                    finished = True
-                    break
-                # Only collect valid SNAC tokens
-                if SNAC_MIN_ID <= token_id <= SNAC_MAX_ID:
-                    snac_buffer.append(token_id)
-            # Decode every full window that is now available
-            while len(snac_buffer) >= last_yield_position + self.WINDOW_SIZE:
-                window_start = last_yield_position
-                window = snac_buffer[window_start:window_start + self.WINDOW_SIZE]
-                # NOTE(review): called with the decoder defaults (trim_warmup=True,
-                # use_sliding_window=False), so the "middle" below is taken from
-                # audio the decoder may already have trimmed - confirm this is
-                # the intended slice
-                audio_bytes = await self.snac_decoder.decode_single_async(window)
-                pending_tail = b""
-                if audio_bytes:
-                    # Locate the middle slice; clamp at 0 for a decode shorter than the slice
-                    audio_samples = len(audio_bytes) // self.BYTES_PER_SAMPLE
-                    middle_start_sample = max(0, (audio_samples - self.MIDDLE_SAMPLES) // 2)
-                    middle_end_sample = middle_start_sample + self.MIDDLE_SAMPLES
-                    # Convert to byte positions
-                    middle_start_byte = middle_start_sample * self.BYTES_PER_SAMPLE
-                    middle_end_byte = middle_end_sample * self.BYTES_PER_SAMPLE
-                    audio_chunk = audio_bytes[middle_start_byte:middle_end_byte]
-                    pending_tail = audio_bytes[middle_end_byte:]
-                    chunk_count += 1
-                    streamed_bytes += len(audio_chunk)
-                    if chunk_count == 1:
-                        print(f" First chunk ready")
-                    yield audio_chunk
-                # Move forward by stride
-                last_yield_position += self.YIELD_STRIDE
-            # Check if generation is done
-            if finished:
-                break
-        # Final chunk. The loop above only exits once fewer than WINDOW_SIZE
-        # tokens remain past last_yield_position, so there is never another
-        # full window to decode here; emit the tail kept from the last decode
-        if pending_tail:
-            chunk_count += 1
-            streamed_bytes += len(pending_tail)
-            yield pending_tail
-        # Duration of the audio actually streamed (int16 mono at SAMPLE_RATE)
-        duration = streamed_bytes / (self.BYTES_PER_SAMPLE * self.SAMPLE_RATE)
-        print(f"Streamed {chunk_count} chunks (~{duration:.1f}s audio)")