daksh-neo commited on
Commit
53a0ef9
·
verified ·
1 Parent(s): 07acd5c

Initial model upload with complete configuration and weights

Browse files
Files changed (4) hide show
  1. README.md +27 -0
  2. benchmark.py +72 -0
  3. optimize_tts.py +234 -0
  4. requirements.txt +80 -4
README.md CHANGED
@@ -24,6 +24,12 @@ language:
24
  - el
25
  - tr
26
  ---
 
 
 
 
 
 
27
  # MOSS-TTS Family
28
 
29
 
@@ -49,6 +55,27 @@ language:
49
  </div>
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  ## Overview
53
  MOSS‑TTS Family is an open‑source **speech and sound generation model family** from [MOSI.AI](https://mosi.cn/#hero) and the [OpenMOSS team](https://www.open-moss.com/). It is designed for **high‑fidelity**, **high‑expressiveness**, and **complex real‑world scenarios**, covering stable long‑form speech, multi‑speaker dialogue, voice/character design, environmental sound effects, and real‑time streaming TTS.
54
 
 
24
  - el
25
  - tr
26
  ---
27
+ # MOSS-TTS (CPU Optimized)
28
+
29
+ > **Notice**: This is the **CPU-Optimized** version of MOSS-TTS. It includes high-performance inference scripts and has been validated for efficient execution on CPU-only environments using dynamic quantization.
30
+
31
+ ---
32
+
33
  # MOSS-TTS Family
34
 
35
 
 
55
  </div>
56
 
57
 
58
+ ### CPU Optimized Inference
59
+ This version contains specific optimizations for CPU environments.
60
+
61
+ 1. **Installation**:
62
+ ```bash
63
+ pip install -r requirements.txt
64
+ ```
65
+
66
+ 2. **Run Optimized Inference**:
67
+ Use the `optimize_tts.py` script included in this repository:
68
+ ```bash
69
+ python optimize_tts.py --mode int8 --text "Generating speech on CPU."
70
+ ```
71
+
72
+ 3. **Optimization Details**:
73
+ - Runtime Dynamic INT8 Quantization.
74
+ - Forced Float32 for stability on CPU.
75
+ - Multi-threaded CPU performance scaling.
76
+
77
+ ---
78
+
79
  ## Overview
80
  MOSS‑TTS Family is an open‑source **speech and sound generation model family** from [MOSI.AI](https://mosi.cn/#hero) and the [OpenMOSS team](https://www.open-moss.com/). It is designed for **high‑fidelity**, **high‑expressiveness**, and **complex real‑world scenarios**, covering stable long‑form speech, multi‑speaker dialogue, voice/character design, environmental sound effects, and real‑time streaming TTS.
81
 
benchmark.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import sys
3
+ import os
4
+ import json
5
+ import logging
6
+
7
+ logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
8
+
9
def get_python_exe():
    """Detect the best python executable to use (prefers a local venv).

    Probes a project-local virtual environment in both the POSIX
    (``venv/bin/python3``) and Windows (``venv/Scripts/python.exe``) layouts,
    falling back to the interpreter currently running this script.

    Returns:
        str: Path to the python executable to invoke for subprocesses.
    """
    # Prefer an explicit project-local venv over the ambient interpreter.
    candidates = (
        os.path.join(os.getcwd(), "venv", "bin", "python3"),
        os.path.join(os.getcwd(), "venv", "Scripts", "python.exe"),  # Windows layout
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return sys.executable
16
+
17
def run_benchmark():
    """Benchmark every quantization mode of the optimizer script.

    Each mode runs in its own subprocess so peak-RAM measurements are
    isolated per run. Per-mode metrics are read back from the JSON files
    the optimizer writes, printed as a summary table, and persisted to
    ``results/benchmark_summary.json``.
    """
    python_exe = get_python_exe()
    logging.info(f"Using Python executable: {python_exe}")

    modes = ["fp32", "int8", "selective"]
    summary = {}

    os.makedirs("results", exist_ok=True)
    os.makedirs("outputs", exist_ok=True)

    # BUGFIX: the optimizer ships at the repo root (see README usage:
    # `python optimize_tts.py ...`); only fall back to the legacy src/ layout.
    script_path = "optimize_tts.py"
    if not os.path.exists(script_path):
        script_path = os.path.join("src", "optimize_tts.py")

    for mode in modes:
        logging.info(f"=== BENCHMARKING MODE: {mode} ===")
        output_file = f"results/results_{mode}.json"

        # Run in subprocess to ensure isolated memory measurement
        cmd = [
            python_exe, script_path,
            "--mode", mode,
            "--output_json", output_file,
            "--text", "This is a benchmarking sample for CPU optimized MOSS TTS. It tests end-to-end latency."
        ]

        try:
            # Capture stdout/stderr so failures can be reported below.
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                if os.path.exists(output_file):
                    with open(output_file, "r") as f:
                        summary[mode] = json.load(f)
                    logging.info(f"Success: {mode}")
                else:
                    logging.error(f"Output file {output_file} not found for mode {mode}")
            else:
                logging.error(f"Failed to benchmark {mode}. Return code: {result.returncode}")
                logging.error(f"STDERR: {result.stderr}")
        except Exception as e:
            logging.error(f"Subprocess error for {mode}: {e}")

    # Print Summary Table
    print("\n" + "="*80)
    print(f"{'Quantization Mode':<20} | {'RAM (MB)':<12} | {'Latency (ms)':<15} | {'Load (s)':<10}")
    print("-" * 80)
    for mode in modes:
        if mode in summary:
            data = summary[mode]
            print(f"{mode:<20} | {data['peak_ram_mb']:<12.2f} | {data['latency_ms']:<15.2f} | {data['load_time_sec']:<10.2f}")
        else:
            print(f"{mode:<20} | {'FAILED':<12} | {'N/A':<15} | {'N/A':<10}")
    print("="*80 + "\n")

    with open("results/benchmark_summary.json", "w") as f:
        json.dump(summary, f, indent=4)
    logging.info("Benchmark summary saved to results/benchmark_summary.json")

if __name__ == "__main__":
    run_benchmark()
optimize_tts.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import json
5
+ import logging
6
+ import argparse
7
+ import psutil
8
+ import torch
9
+ import torchaudio
10
+ from transformers import AutoProcessor, AutoModel
11
+
12
def setup_logging():
    """
    Configure and return the shared "MOSS-TTS-Opt" logger.

    On the first call this attaches a stdout stream handler and a file
    handler writing to logs/inference.log; later calls return the already
    configured logger untouched (no duplicate handlers).

    Returns:
        logging.Logger: The configured logger instance.
    """
    log = logging.getLogger("MOSS-TTS-Opt")
    if log.handlers:
        # Already configured by a previous call — reuse as-is.
        return log

    log.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')

    # Console output
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(fmt)
    log.addHandler(console)

    # Persistent file output
    os.makedirs("logs", exist_ok=True)
    file_out = logging.FileHandler("logs/inference.log")
    file_out.setFormatter(fmt)
    log.addHandler(file_out)

    return log
36
+
37
class MOSSInferenceEngine:
    """
    A high-performance inference engine for MOSS-TTS optimized for CPU execution.

    This engine handles model loading with float32 enforcement, dynamic INT8
    quantization, and optimized audio generation specifically for CPU-only
    environments.
    """

    def __init__(self, model_id: str = "OpenMOSS-Team/MOSS-TTS", device: str = "cpu"):
        """
        Initializes the inference engine.

        Args:
            model_id (str): Hugging Face model repository ID.
            device (str): Device to run inference on (default is 'cpu').
        """
        self.model_id = model_id
        self.device = device
        self.model = None
        self.processor = None
        self.logger = setup_logging()

        # Optimize CPU threading for PyTorch.
        # BUGFIX: os.cpu_count() can return None on exotic platforms, and
        # torch.set_num_threads(None) would raise — fall back to 1 thread.
        self.threads = os.cpu_count() or 1
        torch.set_num_threads(self.threads)
        self.logger.info(f"Engine: Initialized with {self.threads} CPU threads.")

    def load(self, trust_remote_code: bool = True):
        """
        Loads the model and processor from the Hugging Face Hub.
        Enforces float32 to ensure compatibility with CPU quantization and avoid dtype mismatches.

        Args:
            trust_remote_code (bool): Whether to trust remote code from the model repository.

        Raises:
            Exception: Re-raises any loading failure after logging it.
        """
        self.logger.info(f"Engine: Loading model and processor: {self.model_id}")
        start_time = time.time()

        try:
            self.processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=trust_remote_code)

            # Implementation Note: We explicitly use torch_dtype=torch.float32 to avoid
            # BFloat16/Float16 weight mismatches during torch.ao.quantization.quantize_dynamic calls on CPU.
            self.model = AutoModel.from_pretrained(
                self.model_id,
                trust_remote_code=trust_remote_code,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            ).to(self.device)

            # Defensive cast to ensure all parameters are indeed float32
            self.model = self.model.float()
            self.model.eval()
            self.logger.info(f"Engine: Load complete in {time.time() - start_time:.2f}s")
        except Exception as e:
            self.logger.error(f"Engine: Model loading failed: {e}")
            raise

    def quantize(self, mode: str = "int8"):
        """
        Applies a dynamic quantization strategy to the model.

        Args:
            mode (str): Quantization strategy - 'fp32' (none), 'int8' (full), or 'selective'.
        """
        if mode == "fp32":
            self.logger.info("Engine: Operating in FP32 mode (No quantization).")
            return

        start_q = time.time()
        # Use the torch.ao.quantization namespace directly (torch.quantization
        # is its deprecated alias) — consistent with the note in load().
        if mode == "int8":
            self.logger.info("Engine: Applying full Dynamic INT8 quantization to Linear layers...")
            self.model = torch.ao.quantization.quantize_dynamic(
                self.model, {torch.nn.Linear}, dtype=torch.qint8
            )
        elif mode == "selective":
            self.logger.info("Engine: Applying selective Dynamic INT8 quantization (Backbone only)...")
            # Target the heavy language model backbone
            if hasattr(self.model, 'language_model'):
                self.model.language_model = torch.ao.quantization.quantize_dynamic(
                    self.model.language_model, {torch.nn.Linear}, dtype=torch.qint8
                )
            # Target the output heads if present
            if hasattr(self.model, 'lm_heads'):
                self.model.lm_heads = torch.ao.quantization.quantize_dynamic(
                    self.model.lm_heads, {torch.nn.Linear}, dtype=torch.qint8
                )
        self.logger.info(f"Engine: Quantization ({mode}) completed in {time.time() - start_q:.2f}s.")

    def generate(self, text: str, max_new_tokens: int = 50, output_wav: str = None) -> dict:
        """
        Synthesizes speech from text and saves the output to a WAV file.

        Args:
            text (str): Input text to synthesize.
            max_new_tokens (int): Maximum generation length.
            output_wav (str): File path to save the generated audio (skipped if None).

        Returns:
            dict: Latency metadata ({"latency_ms": float}).
        """
        self.logger.info(f"Engine: Generating for text sample: '{text[:50]}...'")

        conversations = [{"role": "user", "content": text}]
        inputs = self.processor(conversations=conversations, return_tensors="pt").to(self.device)

        start_inf = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        latency = (time.time() - start_inf) * 1000

        self.logger.info(f"Engine: Generation finished in {latency:.2f}ms")

        if output_wav:
            self._save_audio(outputs, output_wav)

        return {"latency_ms": latency}

    def _save_audio(self, outputs, output_path: str):
        """Helper to extract and save audio from model outputs.

        Accepts a raw tensor, a dict with a "waveform" key, or an object with
        a ``waveform`` attribute; normalizes shape to [channels, time] before
        saving. Errors are logged, not raised (best-effort save).
        """
        try:
            waveform = None
            if isinstance(outputs, torch.Tensor):
                waveform = outputs
            elif isinstance(outputs, dict) and "waveform" in outputs:
                waveform = outputs["waveform"]
            elif hasattr(outputs, "waveform"):
                waveform = outputs.waveform

            if waveform is not None:
                waveform = waveform.detach().cpu().float()
                if waveform.dim() == 1:
                    waveform = waveform.unsqueeze(0)
                elif waveform.dim() == 3:  # Case: [batch, channel, time]
                    waveform = waveform.squeeze(0)

                # Retrieve sample rate from model config or default to 24000
                sr = getattr(self.model.config, "sampling_rate", 24000)
                # BUGFIX: os.makedirs("") raises when the path has no directory
                # component (bare filename) — only create a dir if there is one.
                out_dir = os.path.dirname(output_path)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)
                torchaudio.save(output_path, waveform, sr)
                self.logger.info(f"Engine: Audio saved to {output_path}")
            else:
                self.logger.warning("Engine: No waveform found in model outputs.")
        except Exception as e:
            self.logger.error(f"Engine: Audio saving error: {e}")
181
+
182
def get_current_ram():
    """Return the resident set size of this process, in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 ** 2)
185
+
186
def main():
    """Main entry point for the CLI tool.

    Parses arguments, loads and optionally quantizes the model, runs one
    generation, and writes performance metrics to a JSON file. Exits with
    status 1 on any failure.
    """
    parser = argparse.ArgumentParser(description="Production-grade MOSS-TTS Optimizer for CPU")
    parser.add_argument("--mode", type=str, choices=["fp32", "int8", "selective"], default="fp32",
                        help="Quantization mode (fp32, int8, selective).")
    parser.add_argument("--text", type=str, default="Validating the optimized CPU inference pipeline for MOSS TTS.",
                        help="Text string to synthesize.")
    parser.add_argument("--output_json", type=str, default="results/metrics.json",
                        help="Path to save performance metrics (JSON).")
    parser.add_argument("--output_wav", type=str, default="outputs/generated_audio.wav",
                        help="Path to save the generated audio (WAV).")
    args = parser.parse_args()

    logger = setup_logging()
    initial_ram = get_current_ram()

    try:
        engine = MOSSInferenceEngine()

        load_start = time.time()
        engine.load()
        load_time = time.time() - load_start

        engine.quantize(mode=args.mode)
        peak_ram = get_current_ram()

        # Adjust wav path to include mode
        wav_path = args.output_wav.replace(".wav", f"_{args.mode}.wav")
        res = engine.generate(args.text, output_wav=wav_path)

        final_stats = {
            "mode": args.mode,
            "load_time_sec": load_time,
            "peak_ram_mb": peak_ram,
            "ram_usage_delta_mb": peak_ram - initial_ram,
            "latency_ms": res["latency_ms"]
        }

        # BUGFIX: os.makedirs("") raises when --output_json is a bare filename
        # (dirname is empty) — only create the directory when one is present.
        json_dir = os.path.dirname(args.output_json)
        if json_dir:
            os.makedirs(json_dir, exist_ok=True)
        with open(args.output_json, "w") as f:
            json.dump(final_stats, f, indent=4)

        logger.info(f"Success: Mode={args.mode} | RAM={peak_ram:.2f}MB | Latency={res['latency_ms']:.2f}ms")
    except Exception as e:
        # Include the traceback so failures are diagnosable from the log file.
        logger.error(f"Execution failed: {e}", exc_info=True)
        sys.exit(1)

if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,7 +1,83 @@
1
- transformers>=4.40.0
2
- torch
3
- torchaudio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  huggingface_hub
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  psutil
6
- accelerate>=0.26.0
 
7
  pypinyin
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Jinja2==3.1.6
2
+ MarkupSafe==3.0.3
3
+ PyYAML==6.0.3
4
+ Pygments==2.19.2
5
+ accelerate==1.12.0
6
+ accelerate>=0.26.0
7
+ annotated-doc==0.0.4
8
+ anyio==4.12.1
9
+ audioread==3.1.0
10
+ certifi==2026.1.4
11
+ cffi==2.0.0
12
+ charset-normalizer==3.4.4
13
+ click==8.3.1
14
+ cuda-bindings==12.9.4
15
+ cuda-pathfinder==1.3.4
16
+ decorator==5.2.1
17
+ filelock==3.24.3
18
+ fsspec==2026.2.0
19
+ h11==0.16.0
20
+ hf-xet==1.2.0
21
+ httpcore==1.0.9
22
+ httpx==0.28.1
23
  huggingface_hub
24
+ huggingface_hub==1.4.1
25
+ idna==3.11
26
+ joblib==1.5.3
27
+ lazy_loader==0.4
28
+ librosa==0.11.0
29
+ llvmlite==0.46.0
30
+ markdown-it-py==4.0.0
31
+ mdurl==0.1.2
32
+ mpmath==1.3.0
33
+ msgpack==1.1.2
34
+ networkx==3.6.1
35
+ numba==0.64.0
36
+ numpy==2.4.2
37
+ nvidia-cublas-cu12==12.8.4.1
38
+ nvidia-cuda-cupti-cu12==12.8.90
39
+ nvidia-cuda-nvrtc-cu12==12.8.93
40
+ nvidia-cuda-runtime-cu12==12.8.90
41
+ nvidia-cudnn-cu12==9.10.2.21
42
+ nvidia-cufft-cu12==11.3.3.83
43
+ nvidia-cufile-cu12==1.13.1.3
44
+ nvidia-curand-cu12==10.3.9.90
45
+ nvidia-cusolver-cu12==11.7.3.90
46
+ nvidia-cusparse-cu12==12.5.8.93
47
+ nvidia-cusparselt-cu12==0.7.1
48
+ nvidia-nccl-cu12==2.27.5
49
+ nvidia-nvjitlink-cu12==12.8.93
50
+ nvidia-nvshmem-cu12==3.4.5
51
+ nvidia-nvtx-cu12==12.8.90
52
+ packaging==26.0
53
+ platformdirs==4.9.2
54
+ pooch==1.9.0
55
  psutil
56
+ psutil==7.2.2
57
+ pycparser==3.0
58
  pypinyin
59
+ regex==2026.2.19
60
+ requests==2.32.5
61
+ rich==14.3.3
62
+ safetensors==0.7.0
63
+ scikit-learn==1.8.0
64
+ scipy==1.17.1
65
+ setuptools==82.0.0
66
+ shellingham==1.5.4
67
+ soundfile==0.13.1
68
+ soxr==1.0.0
69
+ sympy==1.14.0
70
+ threadpoolctl==3.6.0
71
+ tokenizers==0.22.2
72
+ torch
73
+ torch==2.10.0
74
+ torchaudio
75
+ torchaudio==2.10.0
76
+ tqdm==4.67.3
77
+ transformers==5.2.0
78
+ transformers>=4.40.0
79
+ triton==3.6.0
80
+ typer-slim==0.24.0
81
+ typer==0.24.1
82
+ typing_extensions==4.15.0
83
+ urllib3==2.6.3