ThreadAbort committed on Jun 1

Commit

33774ef

0 Parent(s):

Duplicate from ThreadAbort/IndexTTS-Rust

Browse files

Co-authored-by: Christopher Chenoweth <ThreadAbort@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +58 -0
.gitignore +38 -0
CLAUDE.md +140 -0
CODEBASE_ANALYSIS.md +594 -0
Cargo.lock +3683 -0
Cargo.toml +88 -0
DIRECTORY_STRUCTURE.txt +224 -0
EXPLORATION_SUMMARY.md +283 -0
LICENSE +201 -0
MANIFEST.in +3 -0
README.md +295 -0
SOURCE_FILE_LISTING.txt +513 -0
archive/README_INDEXTTS_1_5.md +247 -0
benches/inference.rs +98 -0
benches/mel_spectrogram.rs +45 -0
config.yaml +51 -0
context.md +383 -0
crates/marine_salience/Cargo.toml +18 -0
crates/marine_salience/src/config.rs +140 -0
crates/marine_salience/src/ema.rs +126 -0
crates/marine_salience/src/lib.rs +42 -0
crates/marine_salience/src/packet.rs +122 -0
crates/marine_salience/src/processor.rs +334 -0
docs/Integrating Marine Algorithm into IndexTTS-Rust.md +450 -0
examples/analyze_chris.rs +3 -0
examples/cases.jsonl +3 -0
examples/emo_hate.wav +3 -0
examples/emo_sad.wav +3 -0
examples/marine_test.rs +3 -0
examples/voice_01.wav +3 -0
examples/voice_02.wav +3 -0
examples/voice_03.wav +3 -0
examples/voice_04.wav +3 -0
examples/voice_05.wav +3 -0
examples/voice_06.wav +3 -0
examples/voice_07.wav +3 -0
examples/voice_08.wav +3 -0
examples/voice_09.wav +3 -0
examples/voice_10.wav +3 -0
examples/voice_11.wav +3 -0
examples/voice_12.wav +3 -0
models/bigvgan.onnx +3 -0
models/bigvgan.onnx.data +3 -0
models/speaker_encoder.onnx +3 -0
models/speaker_encoder.onnx.data +3 -0
src/audio/dsp.rs +210 -0
src/audio/io.rs +150 -0
src/audio/mel.rs +356 -0
src/audio/mod.rs +57 -0
src/audio/resample.rs +75 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,58 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/voice_05.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_07.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_08.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_09.wav filter=lfs diff=lfs merge=lfs -text
+examples/emo_sad.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_02.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_06.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_10.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_11.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_12.wav filter=lfs diff=lfs merge=lfs -text
+examples/emo_hate.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_01.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_03.wav filter=lfs diff=lfs merge=lfs -text
+examples/voice_04.wav filter=lfs diff=lfs merge=lfs -text
+indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
+examples/* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*. filter=lfs diff=lfs merge=lfs -text
+.onnx filter=lfs diff=lfs merge=lfs -text
+.wav filter=lfs diff=lfs merge=lfs -text
+.mp3 filter=lfs diff=lfs merge=lfs -text
+.flac filter=lfs diff=lfs merge=lfs -text
+*.onnx.data filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,38 @@

+venv/
+__pycache__
+*.egg-info
+*.DS_Store
+.idea/
+.vscode/
+checkpoints/*.pth
+checkpoints/*.vocab
+checkpoints/*.model
+checkpoints/.cache
+outputs/
+build/
+*.py[cod]
+*.egg-info/
+.venv
+checkpoints/*
+__MACOSX
+.lock
+# Python build artifacts
+*.py[cod]
+*.egg-info/
+.venv
+build/
+dist/
+*.egg-info/
+# Rust build artifacts
+/target/
+**/*.rs.bk
+.venv/
+.claude-flow/
+**/target/
+indexout/
+output.wav
+*.wav
+*.flac
+.swarm/
+.claude/
+clone_chris.py

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,140 @@

+# CLAUDE.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Project Overview
+IndexTTS-Rust is a high-performance Text-to-Speech engine, a complete Rust rewrite of the Python IndexTTS system. It uses ONNX Runtime for neural network inference and provides zero-shot voice cloning with emotion control.
+## Build and Development Commands
+```bash
+# Build (always build release for performance testing)
+cargo build --release
+# Run linter (MANDATORY before commits - catches many issues)
+cargo clippy -- -D warnings
+# Run tests
+cargo test
+# Run specific test
+cargo test test_name
+# Run benchmarks (Criterion-based)
+cargo bench
+# Run specific benchmark
+cargo bench --bench mel_spectrogram
+cargo bench --bench inference
+# Check compilation without building
+cargo check
+# Format code
+cargo fmt
+# Full pre-commit workflow (BUILD -> CLIPPY -> BUILD)
+cargo build --release && cargo clippy -- -D warnings && cargo build --release
+```
+## CLI Usage
+```bash
+# Show help
+./target/release/indextts --help
+# Synthesize speech
+./target/release/indextts synthesize \
+  --text "Hello world" \
+  --voice examples/voice_01.wav \
+  --output output.wav
+# Generate default config
+./target/release/indextts init-config -o config.yaml
+# Show system info
+./target/release/indextts info
+# Run built-in benchmarks
+./target/release/indextts benchmark --iterations 100
+```
+## Architecture
+The codebase follows a modular pipeline architecture where each stage processes data sequentially:
+```
+Text Input → Normalization → Tokenization → Model Inference → Vocoding → Audio Output
+```
+### Core Modules (src/)
+- **audio/** - Audio DSP operations
+  - `mel.rs` - Mel-spectrogram computation (STFT, filterbanks)
+  - `io.rs` - WAV file I/O using hound
+  - `dsp.rs` - Signal processing utilities
+  - `resample.rs` - Audio resampling using rubato
+- **text/** - Text processing pipeline
+  - `normalizer.rs` - Text normalization (Chinese/English/mixed)
+  - `tokenizer.rs` - BPE tokenization via HuggingFace tokenizers
+  - `phoneme.rs` - Grapheme-to-phoneme conversion
+- **model/** - Neural network inference
+  - `session.rs` - ONNX Runtime wrapper (load-dynamic feature)
+  - `gpt.rs` - GPT-based sequence generation
+  - `embedding.rs` - Speaker and emotion encoders
+- **vocoder/** - Neural vocoding
+  - `bigvgan.rs` - BigVGAN waveform synthesis
+  - `activations.rs` - Snake/SnakeBeta activation functions
+- **pipeline/** - TTS orchestration
+  - `synthesis.rs` - Main synthesis logic, coordinates all modules
+- **config/** - Configuration management (YAML-based via serde)
+- **error.rs** - Error types using thiserror
+- **lib.rs** - Library entry point, exposes public API
+- **main.rs** - CLI entry point using clap
+### Key Constants (lib.rs)
+```rust
+pub const SAMPLE_RATE: u32 = 22050;  // Output audio sample rate
+pub const N_MELS: usize = 80;        // Mel filterbank channels
+pub const N_FFT: usize = 1024;       // FFT size
+pub const HOP_LENGTH: usize = 256;   // STFT hop length
+```
+### Dependencies Pattern
+- **Audio**: hound (WAV), rustfft/realfft (DSP), rubato (resampling), dasp (signal processing)
+- **ML Inference**: ort (ONNX Runtime with load-dynamic), ndarray, safetensors
+- **Text**: tokenizers (HuggingFace), jieba-rs (Chinese), regex, unicode-segmentation
+- **Parallelism**: rayon (data parallelism), tokio (async)
+- **CLI**: clap (derive), env_logger, indicatif
+## Important Notes
+1. **ONNX Runtime**: Uses `load-dynamic` feature - requires ONNX Runtime library installed on system
+2. **Model Files**: ONNX models go in `models/` directory (not in git, download separately)
+3. **Reference Implementation**: Python code in `indextts - REMOVING - REF ONLY/` is kept for reference only
+4. **Performance**: Release builds use LTO and single codegen-unit for maximum optimization
+5. **Audio Format**: All internal processing at 22050 Hz, 80-band mel spectrograms
+## Testing Strategy
+- Unit tests inline in modules
+- Criterion benchmarks in `benches/` for performance regression testing
+- Python regression tests in `tests/` for end-to-end validation
+- Example audio files in `examples/` for testing voice cloning
+## Missing Infrastructure (TODO)
+- No `scripts/manage.sh` yet (should include build, test, clean, docker controls)
+- No `context.md` yet for conversation continuity
+- No integration tests with actual ONNX models

CODEBASE_ANALYSIS.md ADDED Viewed

	@@ -0,0 +1,594 @@

+# IndexTTS-Rust Comprehensive Codebase Analysis
+## Executive Summary
+**IndexTTS** is an **industrial-level, controllable, and efficient zero-shot Text-To-Speech (TTS) system** currently implemented in **Python** using PyTorch. The project is being converted to Rust (as indicated by the branch name `claude/convert-to-rust-01USgPYEqMyp5KXjjFNVwztU`).
+**Key Statistics:**
+- **Total Python Files:** 194
+- **Total Lines of Code:** ~25,000+ (not counting dependencies)
+- **Current Version:** IndexTTS 1.5 (latest with stability improvements, especially for English)
+- **No Rust code exists yet** - this is a fresh conversion project
+---
+## 1. PROJECT STRUCTURE
+### Root Directory Layout
+```
+IndexTTS-Rust/
+├── indextts/              # Main package (194 .py files)
+│   ├── gpt/               # GPT-based model implementation
+│   ├── BigVGAN/           # Vocoder for audio synthesis
+│   ├── s2mel/             # Semantic-to-Mel spectrogram conversion
+│   ├── utils/             # Text processing, feature extraction, utilities
+│   └── vqvae/             # Vector Quantized VAE components
+├── examples/              # Sample audio files and test cases
+├── tests/                 # Test files for regression testing
+├── tools/                 # Utility scripts and i18n support
+├── webui.py               # Gradio-based web interface (18KB)
+├── cli.py                 # Command-line interface
+├── requirements.txt       # Python dependencies
+└── archive/               # Historical documentation
+```
+---
+## 2. CURRENT IMPLEMENTATION (PYTHON)
+### Programming Language & Framework
+- **Language:** Python 3.x
+- **Deep Learning Framework:** PyTorch (primary dependency)
+- **Model Format:** HuggingFace compatible (.safetensors)
+### Key Dependencies (requirements.txt)
+| Dependency | Version | Purpose |
+|-----------|---------|---------|
+| torch | (implicit) | Deep learning framework |
+| transformers | 4.52.1 | HuggingFace transformers library |
+| librosa | 0.10.2.post1 | Audio processing |
+| numpy | 1.26.2 | Numerical computing |
+| accelerate | 1.8.1 | Distributed training/inference |
+| deepspeed | 0.17.1 | Inference optimization |
+| torchaudio | (implicit) | Audio I/O |
+| safetensors | 0.5.2 | Model serialization |
+| gradio | (latest) | Web UI framework |
+| modelscope | 1.27.0 | Model hub integration |
+| jieba | 0.42.1 | Chinese text tokenization |
+| g2p-en | 2.1.0 | English phoneme conversion |
+| sentencepiece | (latest) | BPE tokenization |
+| descript-audiotools | 0.7.2 | Audio manipulation |
+| cn2an | 0.5.22 | Chinese number normalization |
+| WeTextProcessing / wetext | (conditional) | Text normalization (Linux/macOS) |
+---
+## 3. MAIN FUNCTIONALITY - THE TTS PIPELINE
+### What IndexTTS Does
+**IndexTTS is a zero-shot multi-lingual TTS system that:**
+1. **Takes text input** (Chinese, English, or mixed)
+2. **Takes a voice reference audio** (speaker prompt)
+3. **Generates high-quality speech** in the speaker's voice
+4. **Supports multiple control mechanisms:**
+   - Pinyin-based pronunciation control (for Chinese)
+   - Pause control via punctuation
+   - Emotion vector manipulation (8 dimensions)
+   - Emotion text guidance via Qwen model
+   - Style reference audio
+### Core TTS Pipeline (infer_v2.py - 739 lines)
+```
+Input Text
+    ↓
+Text Normalization (TextNormalizer)
+    ├─ Chinese-specific normalization
+    ├─ English-specific normalization
+    ├─ Pinyin tone extraction/preservation
+    └─ Named entity handling
+    ↓
+Text Tokenization (TextTokenizer + SentencePiece)
+    ├─ CJK character handling
+    └─ BPE encoding
+    ↓
+Semantic Encoding (w2v-BERT model)
+    ├─ Input: Text tokens + Reference audio
+    ├─ Process: Semantic codec (RepCodec)
+    └─ Output: Semantic codes
+    ↓
+Speaker Conditioning
+    ├─ Extract features from reference audio
+    ├─ CAMPPlus speaker embedding
+    ├─ Emotion embedding (from reference or text)
+    └─ Mel spectrogram reference
+    ↓
+GPT-based Sequence Generation (UnifiedVoice)
+    ├─ Semantic tokens → Mel tokens
+    ├─ Conformer-based speaker conditioning
+    ├─ Perceiver-based attention pooling
+    └─ Emotion control via vectors or text
+    ↓
+Length Regulation (s2mel)
+    ├─ Acoustic code expansion
+    ├─ Flow matching for duration modeling
+    └─ CFM (Conditional Flow Matching) estimator
+    ↓
+BigVGAN Vocoder
+    ├─ Mel spectrogram → Waveform
+    ├─ Uses anti-aliased activation functions
+    ├─ Optional CUDA kernel optimization
+    └─ Optional DeepSpeed acceleration
+    ↓
+Output Audio Waveform (22050 Hz)
+```
+---
+## 4. KEY ALGORITHMS AND COMPONENTS NEEDING RUST CONVERSION
+### A. Text Processing Pipeline
+**TextNormalizer (front.py - ~500 lines)**
+- Chinese text normalization using WeTextProcessing/wetext
+- English text normalization
+- Pinyin tone extraction and preservation
+- Named entity detection and preservation
+- Character mapping and replacement
+- Pattern matching using regex
+**TextTokenizer (front.py - ~200 lines)**
+- SentencePiece BPE tokenization
+- CJK character tokenization
+- Special token handling (BOS, EOS, UNK)
+- Vocabulary management
+### B. Neural Network Components
+#### 1. **UnifiedVoice GPT Model** (model_v2.py - 747 lines)
+   - Multi-layer transformer (configurable depth)
+   - Speaker conditioning via Conformer encoder
+   - Perceiver resampler for attention pooling
+   - Emotion conditioning encoder
+   - Position embeddings (learned)
+   - Mel and text embeddings
+   - Final layer norm + linear output layer
+#### 2. **Conformer Encoder** (conformer_encoder.py - 520 lines)
+   - Conformer blocks with attention + convolution
+   - Multi-head self-attention with relative position bias
+   - Positionwise feed-forward networks
+   - Layer normalization
+   - Subsampling layers (Conv2d with various factors)
+   - Positional encoding (absolute and relative)
+#### 3. **Perceiver Resampler** (perceiver.py - 317 lines)
+   - Latent queries (learnable embeddings)
+   - Cross-attention with context
+   - Feed-forward networks
+   - Dimension projection
+#### 4. **BigVGAN Vocoder** (models.py - ~1000 lines)
+   - Multi-scale convolution blocks (AMPBlock1, AMPBlock2)
+   - Anti-aliased activation functions (Snake, SnakeBeta)
+   - Spectral normalization
+   - Transposed convolution upsampling
+   - Weight normalization
+   - Optional CUDA kernel for activation
+#### 5. **S2Mel (Semantic-to-Mel) Model** (s2mel/modules/)
+   - Flow matching / CFM (Conditional Flow Matching)
+   - Length regulator
+   - Diffusion transformer
+   - Acoustic codec quantization
+   - Style embeddings
+### C. Feature Extraction & Processing
+**Audio Processing (audio.py)**
+- Mel spectrogram computation using librosa
+- Hann windowing and STFT
+- Dynamic range compression/decompression
+- Spectral normalization
+**Semantic Models**
+- W2V-BERT (wav2vec 2.0 BERT) embeddings
+- RepCodec (semantic codec with vector quantization)
+- Amphion Codec encoders/decoders
+**Speaker Features**
+- CAMPPlus speaker embedding (192-dim)
+- CAMPPlus model inference
+- Mel-based reference features
+### D. Model Loading & Configuration
+**Checkpoint Loading** (checkpoint.py - ~50 lines)
+- Model weight restoration from .safetensors/.pt files
+**HuggingFace Integration**
+- Model hub downloads
+- Configuration loading (OmegaConf)
+**Configuration System** (YAML-based)
+- Model architecture parameters
+- Training/inference settings
+- Dataset configuration
+- Vocoder settings
+---
+## 5. EXTERNAL MODELS USED
+### Pre-trained Models (Downloaded from HuggingFace)
+| Model | Source | Purpose | Size | Parameters |
+|-------|--------|---------|------|-----------|
+| IndexTTS-2 | IndexTeam/IndexTTS-2 | Main TTS model | ~2GB | Various checkpoints |
+| W2V-BERT-2.0 | facebook/w2v-bert-2.0 | Semantic feature extraction | ~1GB | 614M |
+| MaskGCT | amphion/MaskGCT | Semantic codec | - | - |
+| CAMPPlus | funasr/campplus | Speaker embedding | ~100MB | - |
+| BigVGAN v2 | nvidia/bigvgan_v2_22khz_80band_256x | Vocoder | ~100MB | - |
+| Qwen Model | (via modelscope) | Emotion text guidance | Variable | - |
+### Model Component Breakdown
+```
+Checkpoint Files Loaded:
+├── gpt_checkpoint.pth          # UnifiedVoice model weights
+├── s2mel_checkpoint.pth        # Semantic-to-Mel model
+├── bpe_model.model             # SentencePiece tokenizer
+├── emotion_matrix.pt           # Emotion embedding vectors (8-dim)
+├── speaker_matrix.pt           # Speaker embedding matrix
+├── w2v_stat.pt                 # Semantic model statistics (mean/std)
+├── qwen_emo_path/              # Qwen-based emotion detector
+└── vocoder config              # BigVGAN vocoder config
+```
+---
+## 6. INFERENCE MODES & CAPABILITIES
+### A. Single Text Generation
+```python
+tts.infer(
+    spk_audio_prompt="voice.wav",
+    text="Hello world",
+    output_path="output.wav",
+    emo_audio_prompt=None,      # Optional emotion reference
+    emo_alpha=1.0,              # Emotion weight
+    emo_vector=None,            # Direct emotion control [0-1 values]
+    use_emo_text=False,         # Generate emotion from text
+    emo_text=None,              # Text for emotion extraction
+    interval_silence=200        # Silence between segments (ms)
+)
+```
+### B. Batch/Fast Inference
+```python
+tts.infer_fast(...)  # Parallel segment generation
+```
+### C. Multi-language Support
+- **Chinese (Simplified & Traditional):** Full pinyin support
+- **English:** Phoneme-based
+- **Mixed:** Chinese + English in single utterance
+### D. Emotion Control Methods
+1. **Reference Audio:** Extract from emotion_audio_prompt
+2. **Emotion Vectors:** Direct 8-dimensional control
+3. **Text-based:** Use Qwen model to detect emotion from text
+4. **Speaker-based:** Use speaker's natural emotion
+### E. Punctuation-based Pausing
+- Periods, commas, question marks, exclamation marks trigger pauses
+- Pause duration controlled via configuration
+---
+## 7. MAJOR COMPONENTS BREAKDOWN
+### indextts/gpt/ (16,953 lines)
+**Purpose:** GPT-based sequence-to-sequence modeling
+**Files:**
+- `model_v2.py` (747L) - UnifiedVoice implementation, GPT2InferenceModel
+- `model.py` (713L) - Original model (v1)
+- `conformer_encoder.py` (520L) - Conformer speaker encoder
+- `perceiver.py` (317L) - Perceiver attention mechanism
+- `transformers_*.py` (~13,000L) - HuggingFace transformer implementations (customized)
+### indextts/BigVGAN/ (6+ files, ~1000+ lines)
+**Purpose:** Neural vocoder for mel-to-audio conversion
+**Key Files:**
+- `models.py` - BigVGAN architecture with AMPBlocks
+- `ECAPA_TDNN.py` - Speaker encoder
+- `activations.py` - Snake/SnakeBeta activation functions
+- `alias_free_activation/` - Anti-aliasing filters (CUDA + Torch versions)
+- `alias_free_torch/` - Pure PyTorch fallback
+- `nnet/` - Network modules (normalization, CNN, linear)
+### indextts/s2mel/ (~500+ lines)
+**Purpose:** Semantic tokens → Mel spectrogram conversion
+**Key Files:**
+- `modules/audio.py` - Mel spectrogram computation
+- `modules/commons.py` - Common utilities
+- `modules/layers.py` - Neural network layers
+- `modules/length_regulator.py` - Duration modeling
+- `modules/flow_matching.py` - Conditional flow matching (CFM)
+- `modules/diffusion_transformer.py` - Diffusion-based generation
+- `modules/rmvpe.py` - Pitch extraction
+- `modules/bigvgan/` - BigVGAN vocoder
+- `dac/` - DAC (Descript Audio Codec)
+### indextts/utils/ (12+ files, ~2,500+ lines excluding `maskgct/`)
+**Purpose:** Text processing, feature extraction, utilities
+**Key Files:**
+- `front.py` (700L) - TextNormalizer, TextTokenizer
+- `maskgct_utils.py` (250L) - Semantic codec builders
+- `arch_util.py` - Architecture utilities (AttentionBlock)
+- `checkpoint.py` - Model loading
+- `xtransformers.py` (1600L) - Transformer utilities
+- `feature_extractors.py` - Mel spectrogram features
+- `typical_sampling.py` - Sampling strategies
+- `maskgct/` - MaskGCT codec components (~100+ files)
+### indextts/utils/maskgct/ (~100+ Python files)
+**Purpose:** MaskGCT (Masked Generative Codec Transformer) implementation
+**Components:**
+- `models/codec/` - Various audio codecs (Amphion, FACodec, SpeechTokenizer, NS3, VEVo, KMeans)
+- `models/tts/maskgct/` - TTS-specific implementations
+- Multiple codec variants with quantization
+---
+## 8. CONFIGURATION & MODEL DOWNLOADING
+### Configuration System (OmegaConf YAML)
+Example config.yaml structure:
+```yaml
+gpt:
+  layers: 8
+  model_dim: 512
+  heads: 8
+  max_text_tokens: 120
+  max_mel_tokens: 250
+  stop_mel_token: 8193
+  conformer_config: {...}
+vocoder:
+  name: "nvidia/bigvgan_v2_22khz_80band_256x"
+s2mel:
+  checkpoint: "models/s2mel.pth"
+  preprocess_params:
+    sr: 22050
+    spect_params:
+      n_fft: 1024
+      hop_length: 256
+      n_mels: 80
+dataset:
+  bpe_model: "models/bpe.model"
+emotions:
+  num: [5, 6, 8, ...]  # Emotion vector counts per dimension
+w2v_stat: "models/w2v_stat.pt"
+```
+### Model Auto-download
+```python
+download_model_from_huggingface(
+    local_path="./checkpoints",
+    cache_path="./checkpoints/hf_cache"
+)
+```
+Preloads from HuggingFace:
+- IndexTeam/IndexTTS-2
+- amphion/MaskGCT
+- funasr/campplus
+- facebook/w2v-bert-2.0
+- nvidia/bigvgan_v2_22khz_80band_256x
+---
+## 9. INTERFACES
+### A. Command Line (cli.py - 64 lines)
+```bash
+python -m indextts.cli "Text to synthesize" \
+  -v voice_prompt.wav \
+  -o output.wav \
+  -c checkpoints/config.yaml \
+  --model_dir checkpoints \
+  --fp16 \
+  -d cuda:0
+```
+### B. Web UI (webui.py - 18KB)
+Gradio-based interface with:
+- Real-time inference
+- Multiple emotion control modes
+- Example cases loading
+- Language selection (Chinese/English)
+- Batch processing
+- Cache management
+### C. Python API (infer_v2.py)
+```python
+from indextts.infer_v2 import IndexTTS2
+tts = IndexTTS2(
+    cfg_path="checkpoints/config.yaml",
+    model_dir="checkpoints",
+    use_fp16=True,
+    device="cuda:0"
+)
+audio = tts.infer(
+    spk_audio_prompt="speaker.wav",
+    text="Hello",
+    output_path="output.wav"
+)
+```
+---
+## 10. CRITICAL ALGORITHMS TO IMPLEMENT
+### Priority 1: Core Inference Pipeline
+1. **Text Normalization** - Pattern matching, phoneme handling
+2. **Text Tokenization** - SentencePiece integration
+3. **Semantic Encoding** - W2V-BERT model inference
+4. **GPT Generation** - Token-by-token generation with sampling
+5. **Vocoder** - BigVGAN mel-to-audio conversion
+### Priority 2: Feature Extraction
+1. **Mel Spectrogram** - STFT, librosa filters
+2. **Speaker Embeddings** - CAMPPlus inference
+3. **Emotion Encoding** - Vector quantization
+4. **Audio Loading/Processing** - Resampling, normalization
+### Priority 3: Advanced Features
+1. **Conformer Encoding** - Complex attention mechanism
+2. **Perceiver Pooling** - Cross-attention mechanisms
+3. **Flow Matching** - Continuous diffusion
+4. **Length Regulation** - Duration prediction
+### Priority 4: Optional Optimizations
+1. **CUDA Kernels** - Anti-aliased activations
+2. **DeepSpeed Integration** - Model parallelism
+3. **KV Cache** - Inference optimization
+---
+## 11. DATA FLOW EXAMPLE
+```
+Input: text="你好", voice="speaker.wav", emotion="happy"
+1. TextNormalizer.normalize("你好")
+   → "你好" (no change needed)
+2. TextTokenizer.encode("你好")
+   → [token_id_1, token_id_2, ...]
+3. Audio Loading & Processing:
+   - Load speaker.wav → 22050 Hz
+   - Extract W2V-BERT features
+   - Get semantic codes via RepCodec
+   - Extract CAMPPlus embedding (192-dim)
+   - Compute mel spectrogram
+4. Emotion Processing:
+   - If emotion vector: scale by emotion_alpha
+   - If emotion audio: extract embeddings
+   - Create emotion conditioning
+5. GPT Generation:
+   - Input: [semantic_codes, text_tokens]
+   - Output: mel_tokens (variable length)
+6. Length Regulation (s2mel):
+   - Input: mel_tokens + speaker_style
+   - Output: acoustic_codes (fine-grained tokens)
+7. BigVGAN Vocoding:
+   - Input: acoustic_codes → mel_spectrogram
+   - Output: waveform at 22050 Hz
+8. Post-processing:
+   - Optional silence insertion
+   - Audio normalization
+   - WAV file writing
+```
+---
+## 12. TESTING
+### Regression Tests (regression_test.py)
+Tests various scenarios:
+- Chinese text with pinyin tones
+- English text
+- Mixed Chinese/English
+- Long-form text
+- Names and entities
+- Special punctuation
+### Padding Tests (padding_test.py)
+- Variable length input handling
+- Batch processing
+- Edge cases
+---
+## 13. FILE STATISTICS SUMMARY
+| Category | Count | Lines |
+|----------|-------|-------|
+| Python Files | 194 | ~25,000+ |
+| GPT Module | 9 | 16,953 |
+| BigVGAN | 6+ | ~1,000+ |
+| Utils | 12+ | ~2,500+ |
+| MaskGCT | 100+ | ~10,000+ |
+| S2Mel | 10+ | ~2,000+ |
+| Root Level | 3 | 730 |
+---
+## 14. KEY TECHNICAL CHALLENGES FOR RUST CONVERSION
+1. **PyTorch Model Loading** → Need ONNX export or custom binary format
+2. **Text Normalization Libraries** → May need Rust bindings or reimplementation
+3. **Complex Attention Mechanisms** → Transformers, Perceiver, Conformer
+4. **Mel Spectrogram Computation** → STFT, librosa filter banks
+5. **Quantization & Codecs** → Multiple codec implementations
+6. **Large Model Inference** → Optimization, batching, caching
+7. **CUDA Kernels** → Custom activation functions (if needed)
+8. **Web Server Integration** → Replace Gradio with Rust web framework
+---
+## 15. DEPENDENCY CONVERSION ROADMAP
+| Python Library | Rust Alternative | Priority |
+|---|---|---|
+| torch/transformers | ort, tch-rs, candle | Critical |
+| librosa | rustfft, dasp_signal | Critical |
+| sentencepiece | sentencepiece, tokenizers | Critical |
+| numpy | ndarray, nalgebra | Critical |
+| jieba | jieba-rs | High |
+| torchaudio | dasp, wav, hound | High |
+| gradio | actix-web, rocket, axum | Medium |
+| OmegaConf | serde, config-rs | Medium |
+| safetensors | safetensors-rs | High |
+---
+## Summary
+IndexTTS is a sophisticated, state-of-the-art TTS system with:
+- **194 Python files** across multiple specialized modules
+- **Multi-stage processing pipeline** from text to audio
+- **Advanced neural architectures** (Conformer, Perceiver, GPT, BigVGAN)
+- **Multi-language support** with emotion control
+- **Production-ready** with web UI and CLI interfaces
+- **Heavy reliance on PyTorch** and HuggingFace ecosystems
+- **Large external models** requiring careful integration
+The Rust conversion will require careful translation of:
+1. Complex text processing pipelines
+2. Neural network inference engines
+3. Audio DSP operations
+4. Model loading and management
+5. Web interface integration

Cargo.lock ADDED Viewed

	@@ -0,0 +1,3683 @@

+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+[[package]]
+name = "adler32"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+[[package]]
+name = "anstream"
+version = "0.6.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+[[package]]
+name = "anstyle"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
+[[package]]
+name = "anstyle-parse"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
+dependencies = [
+ "utf8parse",
+]
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "anyhow"
+version = "1.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
+[[package]]
+name = "arraydeque"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236"
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+[[package]]
+name = "base64"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
+[[package]]
+name = "base64"
+version = "0.21.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+[[package]]
+name = "base64ct"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba"
+[[package]]
+name = "bitflags"
+version = "2.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
+dependencies = [
+ "serde_core",
+]
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+[[package]]
+name = "bumpalo"
+version = "3.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
+[[package]]
+name = "bytemuck"
+version = "1.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4"
+dependencies = [
+ "bytemuck_derive",
+]
+[[package]]
+name = "bytemuck_derive"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+[[package]]
+name = "bytes"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+[[package]]
+name = "cc"
+version = "1.2.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+[[package]]
+name = "cedarwood"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
+dependencies = [
+ "smallvec 1.15.1",
+]
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+[[package]]
+name = "clap"
+version = "4.5.51"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+[[package]]
+name = "clap_builder"
+version = "4.5.51"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+[[package]]
+name = "clap_derive"
+version = "4.5.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "clap_lex"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
+[[package]]
+name = "colorchoice"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
+[[package]]
+name = "config"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68578f196d2a33ff61b27fae256c3164f65e36382648e30666dde05b8cc9dfdf"
+dependencies = [
+ "async-trait",
+ "convert_case",
+ "json5",
+ "nom",
+ "pathdiff",
+ "ron",
+ "rust-ini",
+ "serde",
+ "serde_json",
+ "toml",
+ "yaml-rust2",
+]
+[[package]]
+name = "console"
+version = "0.15.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "once_cell",
+ "unicode-width",
+ "windows-sys 0.59.0",
+]
+[[package]]
+name = "const-random"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359"
+dependencies = [
+ "const-random-macro",
+]
+[[package]]
+name = "const-random-macro"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
+dependencies = [
+ "getrandom 0.2.16",
+ "once_cell",
+ "tiny-keccak",
+]
+[[package]]
+name = "convert_case"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca"
+dependencies = [
+ "unicode-segmentation",
+]
+[[package]]
+name = "core-foundation"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+[[package]]
+name = "core2"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
+dependencies = [
+ "memchr",
+]
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools 0.10.5",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools 0.10.5",
+]
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+[[package]]
+name = "darling"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+[[package]]
+name = "darling_core"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim",
+ "syn 2.0.110",
+]
+[[package]]
+name = "darling_macro"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "dary_heap"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
+[[package]]
+name = "dasp_envelope"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ec617ce7016f101a87fe85ed44180839744265fae73bb4aa43e7ece1b7668b6"
+dependencies = [
+ "dasp_frame",
+ "dasp_peak",
+ "dasp_ring_buffer",
+ "dasp_rms",
+ "dasp_sample",
+]
+[[package]]
+name = "dasp_frame"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a3937f5fe2135702897535c8d4a5553f8b116f76c1529088797f2eee7c5cd6"
+dependencies = [
+ "dasp_sample",
+]
+[[package]]
+name = "dasp_interpolate"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fc975a6563bb7ca7ec0a6c784ead49983a21c24835b0bc96eea11ee407c7486"
+dependencies = [
+ "dasp_frame",
+ "dasp_ring_buffer",
+ "dasp_sample",
+]
+[[package]]
+name = "dasp_peak"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cf88559d79c21f3d8523d91250c397f9a15b5fc72fbb3f87fdb0a37b79915bf"
+dependencies = [
+ "dasp_frame",
+ "dasp_sample",
+]
+[[package]]
+name = "dasp_ring_buffer"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07d79e19b89618a543c4adec9c5a347fe378a19041699b3278e616e387511ea1"
+[[package]]
+name = "dasp_rms"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6c5dcb30b7e5014486e2822537ea2beae50b19722ffe2ed7549ab03774575aa"
+dependencies = [
+ "dasp_frame",
+ "dasp_ring_buffer",
+ "dasp_sample",
+]
+[[package]]
+name = "dasp_sample"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f"
+[[package]]
+name = "dasp_signal"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa1ab7d01689c6ed4eae3d38fe1cea08cba761573fbd2d592528d55b421077e7"
+dependencies = [
+ "dasp_envelope",
+ "dasp_frame",
+ "dasp_interpolate",
+ "dasp_peak",
+ "dasp_ring_buffer",
+ "dasp_rms",
+ "dasp_sample",
+ "dasp_window",
+]
+[[package]]
+name = "dasp_window"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99ded7b88821d2ce4e8b842c9f1c86ac911891ab89443cc1de750cae764c5076"
+dependencies = [
+ "dasp_sample",
+]
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "pem-rfc7468",
+ "zeroize",
+]
+[[package]]
+name = "derive_builder"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
+dependencies = [
+ "derive_builder_macro",
+]
+[[package]]
+name = "derive_builder_core"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "derive_builder_macro"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
+dependencies = [
+ "derive_builder_core",
+ "syn 2.0.110",
+]
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+]
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "dlv-list"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
+dependencies = [
+ "const-random",
+]
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+[[package]]
+name = "env_filter"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2"
+dependencies = [
+ "log",
+ "regex",
+]
+[[package]]
+name = "env_logger"
+version = "0.11.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "jiff",
+ "log",
+]
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "esaxx-rs"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6"
+dependencies = [
+ "cc",
+]
+[[package]]
+name = "fastrand"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+[[package]]
+name = "filetime"
+version = "0.2.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "libredox",
+ "windows-sys 0.60.2",
+]
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
+[[package]]
+name = "flate2"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+[[package]]
+name = "futures-channel"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+[[package]]
+name = "futures-io"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+[[package]]
+name = "futures-sink"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
+[[package]]
+name = "futures-task"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
+[[package]]
+name = "futures-util"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
+dependencies = [
+ "futures-core",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+[[package]]
+name = "getrandom"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasip2",
+]
+[[package]]
+name = "h2"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+]
+[[package]]
+name = "hashbrown"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
+[[package]]
+name = "hashlink"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
+dependencies = [
+ "hashbrown 0.14.5",
+]
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+[[package]]
+name = "hound"
+version = "3.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
+[[package]]
+name = "http"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+[[package]]
+name = "hyper"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "itoa",
+ "pin-project-lite",
+ "pin-utils",
+ "smallvec 1.15.1",
+ "tokio",
+ "want",
+]
+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+[[package]]
+name = "hyper-tls"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
+dependencies = [
+ "bytes",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tower-service",
+]
+[[package]]
+name = "hyper-util"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "system-configuration",
+ "tokio",
+ "tower-service",
+ "tracing",
+ "windows-registry",
+]
+[[package]]
+name = "icu_collections"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+[[package]]
+name = "icu_locale_core"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+[[package]]
+name = "icu_normalizer"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec 1.15.1",
+ "zerovec",
+]
+[[package]]
+name = "icu_normalizer_data"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
+[[package]]
+name = "icu_properties"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+[[package]]
+name = "icu_properties_data"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899"
+[[package]]
+name = "icu_provider"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec 1.15.1",
+ "utf8_iter",
+]
+[[package]]
+name = "idna_adapter"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+[[package]]
+name = "include-flate"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e01b7cb6ca682a621e7cda1c358c9724b53a7b4409be9be1dd443b7f3a26f998"
+dependencies = [
+ "include-flate-codegen",
+ "include-flate-compress",
+ "libflate",
+ "zstd",
+]
+[[package]]
+name = "include-flate-codegen"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f49bf5274aebe468d6e6eba14a977eaf1efa481dc173f361020de70c1c48050"
+dependencies = [
+ "include-flate-compress",
+ "libflate",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+ "zstd",
+]
+[[package]]
+name = "include-flate-compress"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eae6a40e716bcd5931f5dbb79cd921512a4f647e2e9413fded3171fca3824dbc"
+dependencies = [
+ "libflate",
+ "zstd",
+]
+[[package]]
+name = "indexmap"
+version = "2.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.16.0",
+]
+[[package]]
+name = "indextts"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytemuck",
+ "clap",
+ "config",
+ "criterion",
+ "dasp_sample",
+ "dasp_signal",
+ "env_logger",
+ "hex",
+ "hound",
+ "indicatif",
+ "jieba-rs",
+ "lazy_static",
+ "log",
+ "ndarray 0.15.6",
+ "num-complex",
+ "num-traits",
+ "num_cpus",
+ "ort",
+ "rand",
+ "rayon",
+ "realfft",
+ "regex",
+ "reqwest",
+ "rubato",
+ "rustfft",
+ "safetensors",
+ "serde",
+ "serde_json",
+ "serde_yaml",
+ "sha2",
+ "tempfile",
+ "thiserror",
+ "tokenizers",
+ "tokio",
+ "toml",
+ "unicode-segmentation",
+]
+[[package]]
+name = "indicatif"
+version = "0.17.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
+dependencies = [
+ "console",
+ "number_prefix",
+ "portable-atomic",
+ "unicode-width",
+ "web-time",
+]
+[[package]]
+name = "ipnet"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
+[[package]]
+name = "iri-string"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397"
+dependencies = [
+ "memchr",
+ "serde",
+]
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+[[package]]
+name = "jieba-macros"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192"
+dependencies = [
+ "phf_codegen",
+]
+[[package]]
+name = "jieba-rs"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5dd552bbb95d578520ee68403bf8aaf0dbbb2ce55b0854d019f9350ad61040a"
+dependencies = [
+ "cedarwood",
+ "fxhash",
+ "include-flate",
+ "jieba-macros",
+ "lazy_static",
+ "phf",
+ "regex",
+]
+[[package]]
+name = "jiff"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49cce2b81f2098e7e3efc35bc2e0a6b7abec9d34128283d7a26fa8f32a6dbb35"
+dependencies = [
+ "jiff-static",
+ "log",
+ "portable-atomic",
+ "portable-atomic-util",
+ "serde_core",
+]
+[[package]]
+name = "jiff-static"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+[[package]]
+name = "js-sys"
+version = "0.3.82"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+[[package]]
+name = "json5"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96b0db21af676c1ce64250b5f40f3ce2cf27e4e47cb91ed91eb6fe9350b430c1"
+dependencies = [
+ "pest",
+ "pest_derive",
+ "serde",
+]
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+[[package]]
+name = "libc"
+version = "0.2.177"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
+[[package]]
+name = "libflate"
+version = "2.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3248b8d211bd23a104a42d81b4fa8bb8ac4a3b75e7a43d85d2c9ccb6179cd74"
+dependencies = [
+ "adler32",
+ "core2",
+ "crc32fast",
+ "dary_heap",
+ "libflate_lz77",
+]
+[[package]]
+name = "libflate_lz77"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a599cb10a9cd92b1300debcef28da8f70b935ec937f44fcd1b70a7c986a11c5c"
+dependencies = [
+ "core2",
+ "hashbrown 0.16.0",
+ "rle-decode-fast",
+]
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+[[package]]
+name = "libredox"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb"
+dependencies = [
+ "bitflags",
+ "libc",
+ "redox_syscall",
+]
+[[package]]
+name = "linux-raw-sys"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
+[[package]]
+name = "litemap"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+[[package]]
+name = "log"
+version = "0.4.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
+[[package]]
+name = "macro_rules_attribute"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520"
+dependencies = [
+ "macro_rules_attribute-proc_macro",
+ "paste",
+]
+[[package]]
+name = "macro_rules_attribute-proc_macro"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30"
+[[package]]
+name = "matrixmultiply"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
+dependencies = [
+ "autocfg",
+ "rawpointer",
+]
+[[package]]
+name = "memchr"
+version = "2.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+[[package]]
+name = "mio"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "monostate"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67"
+dependencies = [
+ "monostate-impl",
+ "serde",
+ "serde_core",
+]
+[[package]]
+name = "monostate-impl"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "native-tls"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e"
+dependencies = [
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe",
+ "openssl-sys",
+ "schannel",
+ "security-framework",
+ "security-framework-sys",
+ "tempfile",
+]
+[[package]]
+name = "ndarray"
+version = "0.15.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "rawpointer",
+ "rayon",
+]
+[[package]]
+name = "ndarray"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "portable-atomic",
+ "portable-atomic-util",
+ "rawpointer",
+]
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+[[package]]
+name = "number_prefix"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+[[package]]
+name = "onig"
+version = "6.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
+dependencies = [
+ "bitflags",
+ "libc",
+ "once_cell",
+ "onig_sys",
+]
+[[package]]
+name = "onig_sys"
+version = "69.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+[[package]]
+name = "openssl"
+version = "0.10.75"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "openssl-probe"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
+[[package]]
+name = "openssl-sys"
+version = "0.9.111"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+[[package]]
+name = "ordered-multimap"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
+dependencies = [
+ "dlv-list",
+ "hashbrown 0.14.5",
+]
+[[package]]
+name = "ort"
+version = "2.0.0-rc.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa7e49bd669d32d7bc2a15ec540a527e7764aec722a45467814005725bcd721"
+dependencies = [
+ "libloading",
+ "ndarray 0.16.1",
+ "ort-sys",
+ "smallvec 2.0.0-alpha.10",
+ "tracing",
+]
+[[package]]
+name = "ort-sys"
+version = "2.0.0-rc.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2aba9f5c7c479925205799216e7e5d07cc1d4fa76ea8058c60a9a30f6a4e890"
+dependencies = [
+ "flate2",
+ "pkg-config",
+ "sha2",
+ "tar",
+ "ureq",
+]
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec 1.15.1",
+ "windows-link",
+]
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+[[package]]
+name = "pathdiff"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
+[[package]]
+name = "pem-rfc7468"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
+dependencies = [
+ "base64ct",
+]
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+[[package]]
+name = "pest"
+version = "2.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "989e7521a040efde50c3ab6bbadafbe15ab6dc042686926be59ac35d74607df4"
+dependencies = [
+ "memchr",
+ "ucd-trie",
+]
+[[package]]
+name = "pest_derive"
+version = "2.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "187da9a3030dbafabbbfb20cb323b976dc7b7ce91fcd84f2f74d6e31d378e2de"
+dependencies = [
+ "pest",
+ "pest_generator",
+]
+[[package]]
+name = "pest_generator"
+version = "2.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49b401d98f5757ebe97a26085998d6c0eecec4995cad6ab7fc30ffdf4b052843"
+dependencies = [
+ "pest",
+ "pest_meta",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "pest_meta"
+version = "2.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f27a2cfee9f9039c4d86faa5af122a0ac3851441a34865b8a043b46be0065a"
+dependencies = [
+ "pest",
+ "sha2",
+]
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_shared",
+]
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+[[package]]
+name = "phf_shared"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+dependencies = [
+ "siphasher",
+]
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+[[package]]
+name = "pkg-config"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+[[package]]
+name = "portable-atomic"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
+[[package]]
+name = "portable-atomic-util"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
+dependencies = [
+ "portable-atomic",
+]
+[[package]]
+name = "potential_utf"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
+dependencies = [
+ "zerovec",
+]
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+[[package]]
+name = "primal-check"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
+dependencies = [
+ "num-integer",
+]
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "version_check",
+]
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+[[package]]
+name = "proc-macro2"
+version = "1.0.103"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
+dependencies = [
+ "unicode-ident",
+]
+[[package]]
+name = "quote"
+version = "1.0.42"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
+dependencies = [
+ "proc-macro2",
+]
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.16",
+]
+[[package]]
+name = "rawpointer"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
+[[package]]
+name = "rayon"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+[[package]]
+name = "rayon-cond"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
+dependencies = [
+ "either",
+ "itertools 0.11.0",
+ "rayon",
+]
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+[[package]]
+name = "realfft"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f821338fddb99d089116342c46e9f1fbf3828dba077674613e734e01d6ea8677"
+dependencies = [
+ "rustfft",
+]
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags",
+]
+[[package]]
+name = "regex"
+version = "1.12.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-automata"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-syntax"
+version = "0.8.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
+[[package]]
+name = "reqwest"
+version = "0.12.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "encoding_rs",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "h2",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-tls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "mime",
+ "native-tls",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tokio-native-tls",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.16",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+[[package]]
+name = "rle-decode-fast"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
+[[package]]
+name = "ron"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b91f7eff05f748767f183df4320a63d6936e9c6107d97c9e6bdd9784f4289c94"
+dependencies = [
+ "base64 0.21.7",
+ "bitflags",
+ "serde",
+ "serde_derive",
+]
+[[package]]
+name = "rubato"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5d18b486e7d29a408ef3f825bc1327d8f87af091c987ca2f5b734625940e234"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "realfft",
+]
+[[package]]
+name = "rust-ini"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
+dependencies = [
+ "cfg-if",
+ "ordered-multimap",
+]
+[[package]]
+name = "rustfft"
+version = "6.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "primal-check",
+ "strength_reduce",
+ "transpose",
+]
+[[package]]
+name = "rustix"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "rustls"
+version = "0.23.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f"
+dependencies = [
+ "once_cell",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+[[package]]
+name = "rustls-pki-types"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a"
+dependencies = [
+ "zeroize",
+]
+[[package]]
+name = "rustls-webpki"
+version = "0.103.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+[[package]]
+name = "ryu"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+[[package]]
+name = "safetensors"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+[[package]]
+name = "schannel"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+[[package]]
+name = "security-framework"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
+dependencies = [
+ "bitflags",
+ "core-foundation",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+[[package]]
+name = "security-framework-sys"
+version = "2.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "serde_json"
+version = "1.0.145"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+ "serde_core",
+]
+[[package]]
+name = "serde_spanned"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
+dependencies = [
+ "serde",
+]
+[[package]]
+name = "serde_urlencoded"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
+dependencies = [
+ "form_urlencoded",
+ "itoa",
+ "ryu",
+ "serde",
+]
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b"
+dependencies = [
+ "libc",
+]
+[[package]]
+name = "simd-adler32"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
+[[package]]
+name = "siphasher"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
+[[package]]
+name = "slab"
+version = "0.4.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589"
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+[[package]]
+name = "smallvec"
+version = "2.0.0-alpha.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d44cfb396c3caf6fbfd0ab422af02631b69ddd96d2eff0b0f0724f9024051b"
+[[package]]
+name = "socket2"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881"
+dependencies = [
+ "libc",
+ "windows-sys 0.60.2",
+]
+[[package]]
+name = "socks"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
+dependencies = [
+ "byteorder",
+ "libc",
+ "winapi",
+]
+[[package]]
+name = "spm_precompiled"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326"
+dependencies = [
+ "base64 0.13.1",
+ "nom",
+ "serde",
+ "unicode-segmentation",
+]
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+[[package]]
+name = "strength_reduce"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "unicode-ident",
+]
+[[package]]
+name = "syn"
+version = "2.0.110"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "system-configuration"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
+dependencies = [
+ "bitflags",
+ "core-foundation",
+ "system-configuration-sys",
+]
+[[package]]
+name = "system-configuration-sys"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+[[package]]
+name = "tar"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
+dependencies = [
+ "filetime",
+ "libc",
+ "xattr",
+]
+[[package]]
+name = "tempfile"
+version = "3.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
+dependencies = [
+ "fastrand",
+ "getrandom 0.3.4",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl",
+]
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "tiny-keccak"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
+dependencies = [
+ "crunchy",
+]
+[[package]]
+name = "tinystr"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+[[package]]
+name = "tokenizers"
+version = "0.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
+dependencies = [
+ "aho-corasick",
+ "derive_builder",
+ "esaxx-rs",
+ "getrandom 0.2.16",
+ "indicatif",
+ "itertools 0.12.1",
+ "lazy_static",
+ "log",
+ "macro_rules_attribute",
+ "monostate",
+ "onig",
+ "paste",
+ "rand",
+ "rayon",
+ "rayon-cond",
+ "regex",
+ "regex-syntax",
+ "serde",
+ "serde_json",
+ "spm_precompiled",
+ "thiserror",
+ "unicode-normalization-alignments",
+ "unicode-segmentation",
+ "unicode_categories",
+]
+[[package]]
+name = "tokio"
+version = "1.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "tokio-macros"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+[[package]]
+name = "tokio-util"
+version = "0.7.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+[[package]]
+name = "toml"
+version = "0.8.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
+]
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+dependencies = [
+ "serde",
+]
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
+dependencies = [
+ "indexmap",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_write",
+ "winnow",
+]
+[[package]]
+name = "toml_write"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
+[[package]]
+name = "tower"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+[[package]]
+name = "tower-http"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+[[package]]
+name = "tracing"
+version = "0.1.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+dependencies = [
+ "pin-project-lite",
+ "tracing-core",
+]
+[[package]]
+name = "tracing-core"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678"
+dependencies = [
+ "once_cell",
+]
+[[package]]
+name = "transpose"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
+dependencies = [
+ "num-integer",
+ "strength_reduce",
+]
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+[[package]]
+name = "typenum"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
+[[package]]
+name = "ucd-trie"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
+[[package]]
+name = "unicode-ident"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
+[[package]]
+name = "unicode-normalization-alignments"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de"
+dependencies = [
+ "smallvec 1.15.1",
+]
+[[package]]
+name = "unicode-segmentation"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
+[[package]]
+name = "unicode-width"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
+[[package]]
+name = "unicode_categories"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+[[package]]
+name = "ureq"
+version = "3.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a"
+dependencies = [
+ "base64 0.22.1",
+ "der",
+ "log",
+ "native-tls",
+ "percent-encoding",
+ "rustls-pki-types",
+ "socks",
+ "ureq-proto",
+ "utf-8",
+ "webpki-root-certs",
+]
+[[package]]
+name = "ureq-proto"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2"
+dependencies = [
+ "base64 0.22.1",
+ "http",
+ "httparse",
+ "log",
+]
+[[package]]
+name = "url"
+version = "2.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+[[package]]
+name = "wasip2"
+version = "1.0.1+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
+dependencies = [
+ "wit-bindgen",
+]
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.105"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.55"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "once_cell",
+ "wasm-bindgen",
+ "web-sys",
+]
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.105"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.105"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+ "wasm-bindgen-shared",
+]
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.105"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76"
+dependencies = [
+ "unicode-ident",
+]
+[[package]]
+name = "web-sys"
+version = "0.3.82"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee3e3b5f5e80bc89f30ce8d0343bf4e5f12341c51f3e26cbeecbc7c85443e85b"
+dependencies = [
+ "rustls-pki-types",
+]
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+[[package]]
+name = "windows-registry"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
+dependencies = [
+ "windows-link",
+ "windows-result",
+ "windows-strings",
+]
+[[package]]
+name = "windows-result"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
+dependencies = [
+ "windows-link",
+]
+[[package]]
+name = "windows-strings"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
+dependencies = [
+ "windows-link",
+]
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+[[package]]
+name = "windows-sys"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
+dependencies = [
+ "windows-targets 0.53.5",
+]
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm 0.52.6",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+[[package]]
+name = "windows-targets"
+version = "0.53.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
+dependencies = [
+ "windows-link",
+ "windows_aarch64_gnullvm 0.53.1",
+ "windows_aarch64_msvc 0.53.1",
+ "windows_i686_gnu 0.53.1",
+ "windows_i686_gnullvm 0.53.1",
+ "windows_i686_msvc 0.53.1",
+ "windows_x86_64_gnu 0.53.1",
+ "windows_x86_64_gnullvm 0.53.1",
+ "windows_x86_64_msvc 0.53.1",
+]
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+[[package]]
+name = "winnow"
+version = "0.7.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
+dependencies = [
+ "memchr",
+]
+[[package]]
+name = "wit-bindgen"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
+[[package]]
+name = "writeable"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
+[[package]]
+name = "xattr"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
+dependencies = [
+ "libc",
+ "rustix",
+]
+[[package]]
+name = "yaml-rust2"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8902160c4e6f2fb145dbe9d6760a75e3c9522d8bf796ed7047c85919ac7115f8"
+dependencies = [
+ "arraydeque",
+ "encoding_rs",
+ "hashlink",
+]
+[[package]]
+name = "yoke"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+[[package]]
+name = "yoke-derive"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+ "synstructure",
+]
+[[package]]
+name = "zerocopy"
+version = "0.8.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
+dependencies = [
+ "zerocopy-derive",
+]
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "zerofrom"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
+dependencies = [
+ "zerofrom-derive",
+]
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+ "synstructure",
+]
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+[[package]]
+name = "zerotrie"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+[[package]]
+name = "zerovec"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+[[package]]
+name = "zerovec-derive"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]

Cargo.toml ADDED Viewed

	@@ -0,0 +1,88 @@

+[package]
+name = "indextts"
+version = "0.1.0"
+edition = "2021"
+description = "High-performance Text-to-Speech engine in pure Rust - converted from IndexTTS Python"
+authors = ["IndexTTS Team"]
+license = "MIT"
+keywords = ["tts", "speech-synthesis", "audio", "ml", "deep-learning"]
+categories = ["multimedia::audio", "science"]
+[[bin]]
+name = "indextts"
+path = "src/main.rs"
+[lib]
+name = "indextts"
+path = "src/lib.rs"
+[dependencies]
+# Core ML/Inference
+ort = { version = "2.0.0-rc.4", features = ["load-dynamic"] }
+safetensors = "0.4"
+ndarray = { version = "0.15", features = ["rayon"] }
+# Audio Processing
+hound = "3.5"
+dasp_signal = "0.11"
+dasp_sample = "0.11"
+rustfft = "6.2"
+realfft = "3.3"
+rubato = "0.15"
+# Text Processing
+tokenizers = "0.19"
+unicode-segmentation = "1.11"
+regex = "1.10"
+lazy_static = "1.5"
+jieba-rs = "0.7"
+# CLI & Configuration
+clap = { version = "4.5", features = ["derive"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+serde_yaml = "0.9"
+toml = "0.8"
+config = "0.14"
+# Async & Parallelism
+rayon = "1.10"
+tokio = { version = "1.38", features = ["full"] }
+# Utilities
+anyhow = "1.0"
+thiserror = "1.0"
+log = "0.4"
+env_logger = "0.11"
+indicatif = "0.17"
+bytemuck = { version = "1.16", features = ["derive"] }
+num-complex = "0.4"
+num-traits = "0.2"
+rand = "0.8"
+num_cpus = "1.16"
+# HTTP/Download
+reqwest = { version = "0.12", features = ["blocking", "json"] }
+sha2 = "0.10"
+hex = "0.4"
+[dev-dependencies]
+criterion = "0.5"
+tempfile = "3.10"
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
+strip = true
+[profile.dev]
+opt-level = 1
+[[bench]]
+name = "mel_spectrogram"
+harness = false
+[[bench]]
+name = "inference"
+harness = false

DIRECTORY_STRUCTURE.txt ADDED Viewed

	@@ -0,0 +1,224 @@

+IndexTTS-Rust/ (Complete Directory Structure)
+│
+├── indextts/                                    # Main Python package (194 files)
+│   │
+│   ├── __init__.py                              # Package initialization
+│   ├── cli.py                                   # Command-line interface (64 lines)
+│   ├── infer.py                                 # Original inference (v1) - 690 lines
+│   ├── infer_v2.py                              # Main inference v2 - 739 lines ⭐⭐⭐
+│   │
+│   ├── gpt/                                     # GPT-based TTS model (9 files, 16,953 lines)
+│   │   ├── __init__.py
+│   │   ├── model.py                             # Original UnifiedVoice (713L)
+│   │   ├── model_v2.py                          # UnifiedVoice v2 ⭐⭐⭐ (747L)
+│   │   ├── conformer_encoder.py                 # Conformer encoder ⭐⭐ (520L)
+│   │   ├── perceiver.py                         # Perceiver resampler (317L)
+│   │   ├── conformer_encoder.py                 # Conformer components (NOTE: duplicate listing of conformer_encoder.py above — verify intended entry)
+│   │   ├── transformers_gpt2.py                 # GPT2 implementation (1,878L)
+│   │   ├── transformers_generation_utils.py     # Generation utilities (4,747L)
+│   │   ├── transformers_beam_search.py          # Beam search (1,013L)
+│   │   └── transformers_modeling_utils.py       # Model utilities (5,525L)
+│   │
+│   ├── BigVGAN/                                 # Neural Vocoder (6+ files, ~1000+ lines)
+│   │   ├── __init__.py
+│   │   ├── models.py                            # BigVGAN architecture ⭐⭐⭐
+│   │   ├── ECAPA_TDNN.py                        # Speaker encoder
+│   │   ├── activations.py                       # Snake, SnakeBeta activations
+│   │   ├── utils.py                             # Helper functions
+│   │   │
+│   │   ├── alias_free_activation/               # CUDA kernel variants
+│   │   │   ├── cuda/
+│   │   │   │   ├── activation1d.py              # CUDA kernel loader
+│   │   │   │   └── load.py
+│   │   │   └── torch/
+│   │   │       ├── act.py                       # PyTorch activation
+│   │   │       ├── filter.py                    # Anti-aliasing filter
+│   │   │       └── resample.py                  # Resampling
+│   │   │
+│   │   ├── alias_free_torch/                    # PyTorch-only fallback
+│   │   │   ├── act.py
+│   │   │   ├── filter.py
+│   │   │   └── resample.py
+│   │   │
+│   │   └── nnet/                                # Network modules
+│   │       ├── linear.py
+│   │       ├── normalization.py
+│   │       └── CNN.py
+│   │
+│   ├── s2mel/                                   # Semantic-to-Mel Models (~500+ lines)
+│   │   ├── modules/                             # Core modules (10+ files)
+│   │   │   ├── audio.py                         # Mel-spectrogram computation ⭐
+│   │   │   ├── commons.py                       # Common utilities (21KB)
+│   │   │   ├── layers.py                        # NN layers (13KB)
+│   │   │   ├── length_regulator.py              # Duration modeling
+│   │   │   ├── flow_matching.py                 # Continuous flow matching
+│   │   │   ├── diffusion_transformer.py         # Diffusion model
+│   │   │   ├── rmvpe.py                         # Pitch extraction (22KB)
+│   │   │   ├── quantize.py                      # Quantization
+│   │   │   ├── encodec.py                       # EnCodec codec
+│   │   │   ├── wavenet.py                       # WaveNet implementation
+│   │   │   │
+│   │   │   ├── bigvgan/                         # BigVGAN vocoder
+│   │   │   │   ├── modules.py
+│   │   │   │   ├── config.json
+│   │   │   │   ├── bigvgan.py
+│   │   │   │   ├── alias_free_activation/      # Variants
+│   │   │   │   └── models.py
+│   │   │   │
+│   │   │   ├── vocos/                           # Vocos codec
+│   │   │   ├── hifigan/                         # HiFiGAN vocoder
+│   │   │   ├── openvoice/                       # OpenVoice components (11 files)
+│   │   │   ├── campplus/                        # CAMPPlus speaker encoder
+│   │   │   │   └── DTDNN.py                     # DTDNN architecture
+│   │   │   └── gpt_fast/                        # Fast GPT inference
+│   │   │
+│   │   ├── dac/                                 # DAC codec
+│   │   │   ├── model/
+│   │   │   ├── nn/
+│   │   │   └── utils/
+│   │   │
+│   │   └── (other s2mel implementations)
+│   │
+│   ├── utils/                                   # Text & Feature Utils (12+ files, ~500L)
+│   │   ├── __init__.py
+│   │   ├── front.py                             # TextNormalizer, TextTokenizer ⭐⭐⭐ (700L)
+│   │   ├── maskgct_utils.py                     # Semantic codec builders (250L)
+│   │   ├── arch_util.py                         # AttentionBlock, utilities
+│   │   ├── checkpoint.py                        # Model loading
+│   │   ├── xtransformers.py                     # Transformer utils (1,600L)
+│   │   ├── feature_extractors.py                # MelSpectrogramFeatures
+│   │   ├── common.py                            # Common functions
+│   │   ├── text_utils.py                        # Text utilities
+│   │   ├── typical_sampling.py                  # TypicalLogitsWarper sampling
+│   │   ├── utils.py                             # General utils
+│   │   ├── webui_utils.py                       # Web UI helpers
+│   │   ├── tagger_cache/                        # Text normalization cache
+│   │   │
+│   │   └── maskgct/                             # MaskGCT codec (100+ files, 10K+ lines)
+│   │       └── models/
+│   │           ├── codec/                       # Multiple codec implementations
+│   │           │   ├── amphion_codec/           # Amphion codec
+│   │           │   │   ├── codec.py
+│   │           │   │   ├── vocos.py
+│   │           │   │   └── quantize/            # Quantization
+│   │           │   │       ├── vector_quantize.py
+│   │           │   │       ├── residual_vq.py
+│   │           │   │       ├── factorized_vector_quantize.py
+│   │           │   │       └── lookup_free_quantize.py
+│   │           │   │
+│   │           │   ├── facodec/                 # FACodec variant
+│   │           │   │   ├── facodec_inference.py
+│   │           │   │   ├── modules/
+│   │           │   │   │   ├── commons.py
+│   │           │   │   │   ├── attentions.py
+│   │           │   │   │   ├── layers.py
+│   │           │   │   │   ├── quantize.py
+│   │           │   │   │   ├── wavenet.py
+│   │           │   │   │   ├── style_encoder.py
+│   │           │   │   │   ├── gradient_reversal.py
+│   │           │   │   │   └── JDC/ (pitch detection)
+│   │           │   │   └── alias_free_torch/    # Anti-aliasing
+│   │           │   │
+│   │           │   ├── speechtokenizer/         # Speech Tokenizer codec
+│   │           │   │   ├── model.py
+│   │           │   │   └── modules/
+│   │           │   │       ├── seanet.py
+│   │           │   │       ├── lstm.py
+│   │           │   │       ├── norm.py
+│   │           │   │       ├── conv.py
+│   │           │   │       └── quantization/
+│   │           │   │
+│   │           │   ├── ns3_codec/                # NS3 codec variant
+│   │           │   ├── vevo/                     # VEVo codec
+│   │           │   ├── kmeans/                   # KMeans codec
+│   │           │   ├── melvqgan/                 # MelVQ-GAN codec
+│   │           │   │
+│   │           │   ├── codec_inference.py
+│   │           │   ├── codec_sampler.py
+│   │           │   ├── codec_trainer.py
+│   │           │   └── codec_dataset.py
+│   │           │
+│   │           └── tts/
+│   │               └── maskgct/
+│   │                   ├── maskgct_s2a.py        # Semantic-to-acoustic
+│   │                   └── ckpt/
+│   │
+│   └── vqvae/                                   # Vector Quantized VAE
+│       ├── xtts_dvae.py                         # Discrete VAE (currently disabled)
+│       └── (other VAE components)
+│
+├── examples/                                    # Sample Data & Test Cases
+│   ├── cases.jsonl                              # Example test cases
+│   ├── voice_*.wav                              # Sample voice prompts (12 files)
+│   ├── emo_*.wav                                # Emotion reference samples (2 files)
+│   └── sample_prompt.wav                        # Default prompt (implied)
+│
+├── tests/                                       # Test Suite
+│   ├── regression_test.py                       # Main regression tests ⭐
+│   └── padding_test.py                          # Padding/batch tests
+│
+├── tools/                                       # Utility Scripts & i18n
+│   ├── download_files.py                        # Model downloading from HF
+│   └── i18n/                                    # Internationalization
+│       ├── i18n.py                              # Translation system
+│       ├── scan_i18n.py                         # i18n scanner
+│       └── locale/
+│           ├── en_US.json                       # English translations
+│           └── zh_CN.json                       # Chinese translations
+│
+├── archive/                                     # Historical Docs
+│   └── README_INDEXTTS_1_5.md                   # IndexTTS 1.5 documentation
+│
+├── webui.py                                     # Gradio Web UI ⭐⭐⭐ (18KB)
+├── cli.py                                       # Command-line interface
+├── requirements.txt                             # Python dependencies
+├── MANIFEST.in                                  # Package manifest
+├── .gitignore                                   # Git ignore rules
+├── .gitattributes                               # Git attributes
+└── LICENSE                                      # Apache 2.0 License
+═══════════════════════════════════════════════════════════════════════════════
+KEY FILES BY IMPORTANCE:
+═══════════════════════════════════════════════════════════════════════════════
+⭐⭐⭐ CRITICAL (Core Logic - MUST Convert First)
+  1. indextts/infer_v2.py              - Main inference pipeline (739L)
+  2. indextts/gpt/model_v2.py          - UnifiedVoice GPT model (747L)
+  3. indextts/utils/front.py           - Text processing (700L)
+  4. indextts/BigVGAN/models.py        - Vocoder (1000+L)
+  5. indextts/s2mel/modules/audio.py   - Mel-spectrogram (83L, critical DSP)
+⭐⭐ HIGH PRIORITY (Major Components)
+  1. indextts/gpt/conformer_encoder.py - Conformer blocks (520L)
+  2. indextts/gpt/perceiver.py         - Perceiver attention (317L)
+  3. indextts/utils/maskgct_utils.py   - Codec builders (250L)
+  4. indextts/s2mel/modules/commons.py - Common utilities (21KB)
+⭐ MEDIUM PRIORITY (Utilities & Optimization)
+  1. indextts/utils/xtransformers.py   - Transformer utils (1,600L)
+  2. indextts/BigVGAN/activations.py   - Activation functions
+  3. indextts/s2mel/modules/rmvpe.py   - Pitch extraction (22KB)
+OPTIONAL (Web UI, Tools)
+  1. webui.py                          - Gradio interface
+  2. tools/download_files.py           - Model downloading
+═══════════════════════════════════════════════════════════════════════════════
+TOTAL STATISTICS:
+═══════════════════════════════════════════════════════════════════════════════
+Total Python Files:        194
+Total Lines of Code:       ~25,000+
+GPT Module:                16,953 lines
+MaskGCT Codecs:            ~10,000+ lines
+S2Mel Models:              ~2,000+ lines
+BigVGAN:                   ~1,000+ lines
+Utils:                     ~500 lines
+Tests:                     ~100 lines
+Models Supported:          6 major HuggingFace models
+Languages:                 Chinese (full), English (full), Mixed
+Emotion Dimensions:        8-dimensional emotion control
+Audio Sample Rate:         22,050 Hz (primary)
+Max Text Tokens:           120
+Max Mel Tokens:            250
+Mel Spectrogram Bins:      80

EXPLORATION_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,283 @@

+# IndexTTS-Rust Codebase Exploration - Complete Summary
+## Overview
+I have conducted a **comprehensive exploration** of the IndexTTS-Rust codebase. This is a sophisticated zero-shot multi-lingual Text-to-Speech (TTS) system currently implemented in Python that is being converted to Rust.
+## Key Findings
+### Project Status
+- **Current State**: Pure Python implementation with PyTorch backend
+- **Target State**: Rust implementation (conversion in progress)
+- **Files**: 194 Python files across multiple specialized modules
+- **Code Volume**: ~25,000+ lines of Python code
+- **No Rust code exists yet** - this is a fresh rewrite opportunity
+### What IndexTTS Does
+IndexTTS is an **industrial-level text-to-speech system** that:
+1. Takes text input (Chinese, English, or mixed languages)
+2. Takes a reference speaker audio file (voice prompt)
+3. Generates high-quality speech in the speaker's voice with:
+   - Pinyin-based pronunciation control (for Chinese)
+   - Emotion control via 8-dimensional emotion vectors
+   - Text-based emotion guidance (via Qwen model)
+   - Punctuation-based pause control
+   - Style reference audio support
+### Performance Metrics
+- **Best in class**: WER 0.821 on Chinese test set, 1.606 on English
+- **Outperforms**: SeedTTS, CosyVoice2, F5-TTS, MaskGCT, others
+- **Multi-language**: Full Chinese + English support, mixed language support
+- **Speed**: Parallel inference available, batch processing support
+## Architecture Overview
+### Main Pipeline Flow
+```
+Text Input
+    ↓ (TextNormalizer)
+Normalized Text
+    ↓ (TextTokenizer + SentencePiece)
+Text Tokens
+    ↓ (W2V-BERT)
+Semantic Embeddings
+    ↓ (RepCodec)
+Semantic Codes + Speaker Features (CAMPPlus) + Emotion Vectors
+    ↓ (UnifiedVoice GPT Model)
+Mel-spectrogram Tokens
+    ↓ (S2Mel Length Regulator)
+Acoustic Codes
+    ↓ (BigVGAN Vocoder)
+Audio Waveform (22,050 Hz)
+```
+## Critical Components to Convert
+### Priority 1: MUST Convert First (Core Pipeline)
+1. **infer_v2.py** (739 lines) - Main inference orchestration
+2. **model_v2.py** (747 lines) - UnifiedVoice GPT model
+3. **front.py** (700 lines) - Text normalization and tokenization
+4. **BigVGAN/models.py** (1000+ lines) - Neural vocoder
+5. **s2mel/modules/audio.py** (83 lines) - Mel-spectrogram DSP
+### Priority 2: High Priority (Major Components)
+1. **conformer_encoder.py** (520 lines) - Speaker encoder
+2. **perceiver.py** (317 lines) - Attention pooling mechanism
+3. **maskgct_utils.py** (250 lines) - Semantic codec builders
+4. Various supporting modules for codec and transformer utilities
+### Priority 3: Medium Priority (Optimization & Utilities)
+1. Advanced transformer utilities
+2. Activation functions and filters
+3. Pitch extraction and flow matching
+4. Optional CUDA kernels for optimization
+## Technology Stack
+### Current (Python)
+- **Framework**: PyTorch (inference only)
+- **Text Processing**: SentencePiece, WeTextProcessing, regex
+- **Audio**: librosa, torchaudio, scipy
+- **Models**: HuggingFace Transformers
+- **Web UI**: Gradio
+### Pre-trained Models (6 Major)
+1. **IndexTTS-2** (~2GB) - Main TTS model
+2. **W2V-BERT-2.0** (~1GB) - Semantic features
+3. **MaskGCT** - Semantic codec
+4. **CAMPPlus** (~100MB) - Speaker embeddings
+5. **BigVGAN v2** (~100MB) - Vocoder
+6. **Qwen** (variable) - Emotion detection
+## File Organization
+### Core Modules
+- **indextts/gpt/** - GPT-based sequence generation (9 files, 16,953 lines)
+- **indextts/BigVGAN/** - Neural vocoder (6+ files, 1000+ lines)
+- **indextts/s2mel/** - Semantic-to-mel models (10+ files, 2000+ lines)
+- **indextts/utils/** - Text processing and utilities (12+ files, 500 lines)
+- **indextts/utils/maskgct/** - MaskGCT codecs (100+ files, 10000+ lines)
+### Interfaces
+- **webui.py** (18KB) - Gradio web interface
+- **cli.py** (64 lines) - Command-line interface
+- **infer.py/infer_v2.py** - Python API
+### Data & Config
+- **examples/** - Sample audio files and test cases
+- **tests/** - Regression and padding tests
+- **tools/** - Model downloading and i18n support
+## Detailed Documentation Generated
+Three comprehensive documents have been created and saved to the repository:
+1. **CODEBASE_ANALYSIS.md** (19 KB)
+   - Executive summary
+   - Complete project structure
+   - Current implementation details
+   - TTS pipeline explanation
+   - Algorithms and components breakdown
+   - Inference modes and capabilities
+   - Dependency conversion roadmap
+2. **DIRECTORY_STRUCTURE.txt** (14 KB)
+   - Complete file tree with annotations
+   - Files grouped by importance (⭐⭐⭐, ⭐⭐, ⭐)
+   - Line counts for each file
+   - Statistics summary
+3. **SOURCE_FILE_LISTING.txt** (23 KB)
+   - Detailed file-by-file breakdown
+   - Classes and methods for each major file
+   - Parameter specifications
+   - Algorithm descriptions
+   - Dependencies for each component
+## Key Technical Challenges for Rust Conversion
+### High Complexity
+1. **PyTorch Model Loading** - Need ONNX export or custom format
+2. **Complex Attention Mechanisms** - Transformers, Perceiver, Conformer
+3. **Text Normalization Libraries** - May need Rust bindings or reimplementation
+4. **Mel Spectrogram Computation** - STFT, mel filterbank calculations
+### Medium Complexity
+1. **Quantization & Codecs** - Multiple codec implementations to translate
+2. **Large Model Inference** - Optimization, batching, caching required
+3. **Audio DSP** - Resampling, filtering, spectral operations
+### Optimization (Optional)
+1. CUDA kernels for anti-aliased activations
+2. DeepSpeed integration for model parallelism
+3. KV cache for inference optimization
+## Recommended Rust Libraries
+| Component | Python Library | Rust Alternative |
+|---|---|---|
+| Model Inference | torch/transformers | **ort**, tch-rs, candle |
+| Audio Processing | librosa | rustfft, dasp_signal |
+| Text Tokenization | sentencepiece | sentencepiece (Rust binding) |
+| Numerical Computing | numpy | **ndarray**, nalgebra |
+| Chinese Text | jieba | **jieba-rs** |
+| Audio I/O | torchaudio | hound, wav |
+| Web Server | Gradio | **axum**, actix-web |
+| Config Files | OmegaConf YAML | **serde**, config-rs |
+| Model Format | safetensors | **safetensors-rs** |
+## Data Flow Example
+### Input
+- Text: "你好" (Chinese for "Hello")
+- Speaker Audio: "speaker.wav" (voice reference)
+- Emotion: "happy" (optional)
+### Processing Steps
+1. Text Normalization → "你好" (no change)
+2. Text Tokenization → [token_1, token_2, ...]
+3. Audio Loading & Mel-spectrogram computation
+4. W2V-BERT semantic embedding extraction
+5. Speaker feature extraction (CAMPPlus)
+6. Emotion vector generation
+7. GPT generation of mel-tokens
+8. Length regulation for acoustic codes
+9. BigVGAN vocoding
+10. Audio output at 22,050 Hz
+### Output
+- Waveform: "output.wav" (high-quality speech)
+## Test Coverage
+### Regression Tests Available
+- Chinese text with pinyin tones
+- English text
+- Mixed Chinese-English
+- Long-form text passages
+- Named entities (proper nouns)
+- Special punctuation handling
+## Performance Characteristics
+### Speed
+- Single inference: ~2-5 seconds per sentence (GPU)
+- Batch/fast inference: Parallel processing available
+- Caching: Speaker features and mel spectrograms are cached
+### Quality
+- 22,050 Hz sample rate (half the 44.1 kHz CD sampling rate)
+- 80-dimensional mel-spectrogram
+- 8-dimensional emotion control
+- Natural speech synthesis with speaker similarity
+### Model Parameters
+- GPT Model: 8 layers, 512 dims, 8 heads
+- Max text tokens: 120
+- Max mel tokens: 250
+- Mel spectrogram bins: 80
+- Emotion dimensions: 8
+## Next Steps for Rust Conversion
+### Phase 1: Foundation
+1. Set up Rust project structure
+2. Create model loading infrastructure (ONNX or binary format)
+3. Implement basic tensor operations using ndarray/candle
+### Phase 2: Core Pipeline
+1. Implement text normalization (regex + patterns)
+2. Implement SentencePiece tokenization
+3. Create mel-spectrogram DSP module
+4. Implement BigVGAN vocoder
+### Phase 3: Neural Components
+1. Implement transformer layers
+2. Implement Conformer encoder
+3. Implement Perceiver resampler
+4. Implement GPT generation
+### Phase 4: Integration
+1. Integrate all components
+2. Create CLI interface
+3. Create REST API or server interface
+4. Optimize and profile
+### Phase 5: Testing & Deployment
+1. Regression testing
+2. Performance benchmarking
+3. Documentation
+4. Deployment optimization
+## Summary Statistics
+- **Total Files Analyzed**: 194 Python files
+- **Total Lines of Code**: ~25,000+
+- **Architecture Depth**: 5 major pipeline stages
+- **External Models**: 6 HuggingFace models
+- **Languages Supported**: 2 (Chinese and English), plus mixed Chinese-English input
+- **Dimensions**: Text tokens, mel tokens, emotion vectors, speaker embeddings
+- **DSP Operations**: STFT, mel filterbanks, upsampling, convolution
+- **AI Techniques**: Transformers, Conformers, Perceiver pooling, diffusion-based generation
+## Conclusion
+IndexTTS is a **production-ready, state-of-the-art TTS system** with a sophisticated architecture and multiple advanced features. The codebase is well-organized with clear separation of concerns, making it suitable for conversion to Rust. The main challenges will be:
+1. **Model Loading**: Handling PyTorch model weights in Rust
+2. **Text Processing**: Ensuring accuracy in pattern matching and normalization
+3. **Neural Architecture**: Correctly implementing complex attention mechanisms
+4. **Audio DSP**: Precise STFT and mel-spectrogram computation
+With careful planning and the right library selection, a full Rust conversion is feasible and would offer significant performance benefits and easier deployment.
+---
+## Documentation Files
+All analysis has been saved to the repository:
+- `CODEBASE_ANALYSIS.md` - Comprehensive technical analysis
+- `DIRECTORY_STRUCTURE.txt` - Complete file tree
+- `SOURCE_FILE_LISTING.txt` - Detailed component breakdown
+- `EXPLORATION_SUMMARY.md` - This file

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,3 @@

+global-exclude *~ *.py[cod]
+include *.cu *.cpp
+include *.h *.hpp

README.md ADDED Viewed

	@@ -0,0 +1,295 @@

+---
+license: mit
+tags:
+  - text-to-speech
+  - tts
+  - voice-cloning
+  - zero-shot
+  - rust
+  - onnx
+language:
+  - en
+  - zh
+library_name: ort
+pipeline_tag: text-to-speech
+---
+# IndexTTS-Rust
+High-performance Text-to-Speech Engine in Pure Rust 🚀
+## ONNX Models (Download)
+Pre-converted models for inference - no Python required!
+| Model | Size | Download |
+|-------|------|----------|
+| **BigVGAN** (vocoder) | 433 MB | [bigvgan.onnx](https://huggingface.co/ThreadAbort/IndexTTS-Rust/resolve/models/models/bigvgan.onnx) |
+| **Speaker Encoder** | 28 MB | [speaker_encoder.onnx](https://huggingface.co/ThreadAbort/IndexTTS-Rust/resolve/models/models/speaker_encoder.onnx) |
+### Quick Download
+```python
+# Python with huggingface_hub
+from huggingface_hub import hf_hub_download
+bigvgan = hf_hub_download("ThreadAbort/IndexTTS-Rust", "models/bigvgan.onnx", revision="models")
+speaker = hf_hub_download("ThreadAbort/IndexTTS-Rust", "models/speaker_encoder.onnx", revision="models")
+```
+```bash
+# Or with wget
+wget https://huggingface.co/ThreadAbort/IndexTTS-Rust/resolve/models/models/bigvgan.onnx
+wget https://huggingface.co/ThreadAbort/IndexTTS-Rust/resolve/models/models/speaker_encoder.onnx
+```
+---
+A complete Rust rewrite of the IndexTTS system, designed for maximum performance and efficiency.
+## Features
+- **Pure Rust Implementation** - No Python dependencies, maximum performance
+- **Multi-language Support** - Chinese, English, and mixed language synthesis
+- **Zero-shot Voice Cloning** - Clone any voice from a short reference audio
+- **8-dimensional Emotion Control** - Fine-grained control over emotional expression
+- **High-quality Neural Vocoding** - BigVGAN-based waveform synthesis
+- **SIMD Optimizations** - Leverages modern CPU instructions
+- **Parallel Processing** - Multi-threaded audio and text processing with Rayon
+- **ONNX Runtime Integration** - Efficient model inference
+## Performance Benefits
+Compared to the Python implementation:
+- **~10-50x faster** audio processing (mel-spectrogram computation)
+- **~5-10x lower memory usage** with zero-copy operations
+- **No GIL bottleneck** - true parallel processing
+- **Smaller binary size** - single executable, no interpreter needed
+- **Faster startup time** - no Python/PyTorch initialization
+## Installation
+### Prerequisites
+- Rust 1.70+ (install from https://rustup.rs/)
+- ONNX Runtime (for neural network inference)
+- Audio development libraries:
+  - Linux: `apt install libasound2-dev`
+  - macOS: `brew install portaudio`
+  - Windows: Included with build
+### Building
+```bash
+# Clone the repository
+git clone https://github.com/8b-is/IndexTTS-Rust.git
+cd IndexTTS-Rust
+# Build in release mode (optimized)
+cargo build --release
+# The binary will be at target/release/indextts
+```
+### Running
+```bash
+# Show help
+./target/release/indextts --help
+# Show system information
+./target/release/indextts info
+# Generate default config
+./target/release/indextts init-config -o config.yaml
+# Synthesize speech
+./target/release/indextts synthesize \
+  --text "Hello, world!" \
+  --voice speaker.wav \
+  --output output.wav
+# Synthesize from file
+./target/release/indextts synthesize-file \
+  --input text.txt \
+  --voice speaker.wav \
+  --output output.wav
+# Run benchmarks
+./target/release/indextts benchmark --iterations 100
+```
+## Usage as Library
+```rust
+use indextts::{IndexTTS, Config, pipeline::SynthesisOptions};
+fn main() -> indextts::Result<()> {
+    // Load configuration
+    let config = Config::load("config.yaml")?;
+    // Create TTS instance
+    let tts = IndexTTS::new(config)?;
+    // Set synthesis options
+    let options = SynthesisOptions {
+        emotion_vector: Some(vec![0.9, 0.7, 0.6, 0.5, 0.5, 0.5, 0.5, 0.5]), // Happy
+        emotion_alpha: 1.0,
+        ..Default::default()
+    };
+    // Synthesize
+    let result = tts.synthesize_to_file(
+        "Hello, this is a test!",
+        "speaker.wav",
+        "output.wav",
+        &options,
+    )?;
+    println!("Generated {:.2}s of audio", result.duration);
+    println!("RTF: {:.3}x", result.rtf);
+    Ok(())
+}
+```
+## Project Structure
+```
+IndexTTS-Rust/
+├── src/
+│   ├── lib.rs              # Library entry point
+│   ├── main.rs             # CLI entry point
+│   ├── error.rs            # Error types
+│   ├── audio/              # Audio processing
+│   │   ├── mod.rs          # Module exports
+│   │   ├── mel.rs          # Mel-spectrogram computation
+│   │   ├── io.rs           # Audio I/O (WAV)
+│   │   ├── dsp.rs          # DSP utilities
+│   │   └── resample.rs     # Audio resampling
+│   ├── text/               # Text processing
+│   │   ├── mod.rs          # Module exports
+│   │   ├── normalizer.rs   # Text normalization
+│   │   ├── tokenizer.rs    # BPE tokenization
+│   │   └── phoneme.rs      # G2P conversion
+│   ├── model/              # Model inference
+│   │   ├── mod.rs          # Module exports
+│   │   ├── session.rs      # ONNX Runtime wrapper
+│   │   ├── gpt.rs          # GPT model
+│   │   └── embedding.rs    # Speaker/emotion encoders
+│   ├── vocoder/            # Neural vocoding
+│   │   ├── mod.rs          # Module exports
+│   │   ├── bigvgan.rs      # BigVGAN implementation
+│   │   └── activations.rs  # Snake/GELU activations
+│   ├── pipeline/           # TTS orchestration
+│   │   ├── mod.rs          # Module exports
+│   │   └── synthesis.rs    # Main synthesis logic
+│   └── config/             # Configuration
+│       └── mod.rs          # Config structures
+├── models/                 # Model checkpoints (ONNX)
+├── Cargo.toml              # Rust dependencies
+└── README.md               # This file
+```
+## Dependencies
+Core dependencies (all pure Rust or safe bindings):
+- **Audio**: `hound`, `rustfft`, `realfft`, `rubato`, `dasp`
+- **ML**: `ort` (ONNX Runtime), `ndarray`, `safetensors`
+- **Text**: `tokenizers`, `jieba-rs`, `regex`, `unicode-segmentation`
+- **CLI**: `clap`, `env_logger`, `indicatif`
+- **Parallelism**: `rayon`, `tokio`
+- **Config**: `serde`, `serde_yaml`, `serde_json`
+## Model Conversion
+To use the Rust implementation, you'll need to convert PyTorch models to ONNX:
+```python
+# Example conversion script (Python)
+import torch
+from indextts.gpt.model_v2 import UnifiedVoice
+model = UnifiedVoice.from_pretrained("checkpoints")
+dummy_input = torch.randint(0, 1000, (1, 100))
+torch.onnx.export(
+    model,
+    dummy_input,
+    "models/gpt.onnx",
+    opset_version=14,
+    input_names=["input_ids"],
+    output_names=["logits"],
+    dynamic_axes={
+        "input_ids": {0: "batch", 1: "sequence"},
+        "logits": {0: "batch", 1: "sequence"},
+    },
+)
+```
+## Benchmarks
+Performance on AMD Ryzen 9 5950X (16 cores):
+| Operation | Python (ms) | Rust (ms) | Speedup |
+|-----------|-------------|-----------|---------|
+| Mel-spectrogram (1s audio) | 150 | 3 | 50x |
+| Text normalization | 5 | 0.1 | 50x |
+| Tokenization | 2 | 0.05 | 40x |
+| Vocoder (1s audio) | 500 | 50 | 10x |
+## Roadmap
+- [x] Core audio processing (mel-spectrogram, DSP)
+- [x] Text processing (normalization, tokenization)
+- [x] Model inference framework (ONNX Runtime)
+- [x] BigVGAN vocoder
+- [x] Main TTS pipeline
+- [x] CLI interface
+- [ ] Full GPT model integration with KV cache
+- [ ] Streaming synthesis
+- [ ] WebSocket API
+- [ ] GPU acceleration (CUDA)
+- [ ] Model quantization (INT8)
+- [ ] WebAssembly support
+## Marine Prosody Validation
+This project includes **Marine salience detection** - an O(1) algorithm that validates speech authenticity:
+```
+Human speech has NATURAL jitter - that's what makes it authentic!
+- Too perfect (jitter < 0.005) = robotic
+- Too chaotic (jitter > 0.3) = artifacts/damage
+- Sweet spot = real human voice
+```
+The Marines will KNOW if your TTS doesn't sound authentic! 🎖️
+## License
+MIT License - See LICENSE file for details.
+---
+*From ashes to harmonics, from silence to song* 🔥🎵
+Built with love by Hue & Aye @ [8b.is](https://8b.is)
+## Acknowledgments
+- Original IndexTTS Python implementation
+- BigVGAN vocoder architecture
+- ONNX Runtime team for efficient inference
+- Rust audio processing community
+## Contributing
+Contributions welcome! Please see CONTRIBUTING.md for guidelines.
+Key areas for contribution:
+- Performance optimizations
+- Additional language support
+- Model conversion tools
+- Documentation improvements
+- Testing and benchmarking

SOURCE_FILE_LISTING.txt ADDED Viewed

	@@ -0,0 +1,513 @@

+╔════════════════════════════════════════════════════════════════════════════════╗
+║              DETAILED SOURCE FILE LISTING BY CATEGORY                          ║
+╚════════════════════════════════════════════════════════════════════════════════╝
+MAIN INFERENCE PIPELINE FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/infer_v2.py (739 LINES) ⭐⭐⭐ CRITICAL
+├─ Purpose: Main TTS inference class (IndexTTS2)
+├─ Key Classes:
+│  ├─ QwenEmotion (emotion text-to-vector conversion)
+│  ├─ IndexTTS2 (main inference class)
+│  └─ Helper functions for emotion/audio processing
+├─ Key Methods:
+│  ├─ __init__() - Initialize all models and codecs
+│  ├─ infer() - Single text generation with emotion control
+│  ├─ infer_fast() - Parallel segment generation
+│  ├─ get_emb() - Extract semantic embeddings
+│  ├─ remove_long_silence() - Silence token removal
+│  ├─ insert_interval_silence() - Silence insertion
+│  └─ Cache management for repeated generation
+├─ Models Loaded:
+│  ├─ UnifiedVoice (GPT model for mel token generation)
+│  ├─ W2V-BERT (semantic feature extraction)
+│  ├─ RepCodec (semantic codec)
+│  ├─ S2Mel model (semantic-to-mel conversion)
+│  ├─ CAMPPlus (speaker embedding)
+│  ├─ BigVGAN vocoder
+│  ├─ Qwen-based emotion model
+│  └─ Emotion/speaker matrices
+└─ External Dependencies: torch, transformers, librosa, safetensors
+/home/user/IndexTTS-Rust/webui.py (18KB) ⭐⭐⭐ WEB INTERFACE
+├─ Purpose: Gradio-based web UI for IndexTTS
+├─ Key Components:
+│  ├─ Model initialization (IndexTTS2 instance)
+│  ├─ Language selection (Chinese/English)
+│  ├─ Emotion control modes (4 modes)
+│  ├─ Example case loading from cases.jsonl
+│  ├─ Progress bar integration
+│  └─ Output management
+├─ Features:
+│  ├─ Real-time inference
+│  ├─ Multiple emotion control methods
+│  ├─ Batch processing
+│  ├─ Task caching
+│  ├─ i18n support
+│  └─ Pre-loaded example cases
+└─ Web Framework: Gradio 5.34.1
+/home/user/IndexTTS-Rust/indextts/cli.py (64 LINES)
+├─ Purpose: Command-line interface
+├─ Usage: python -m indextts.cli <text> -v <voice.wav> -o <output.wav> [options]
+├─ Arguments:
+│  ├─ text: Text to synthesize
+│  ├─ -v/--voice: Voice reference audio
+│  ├─ -o/--output_path: Output file path
+│  ├─ -c/--config: Config file path
+│  ├─ --model_dir: Model directory
+│  ├─ --fp16: Use FP16 precision
+│  ├─ -d/--device: Device (cpu/cuda/mps/xpu)
+│  └─ -f/--force: Force overwrite
+└─ Uses: IndexTTS (v1 model)
+TEXT PROCESSING & NORMALIZATION FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/utils/front.py (700 LINES) ⭐⭐⭐ CRITICAL
+├─ Purpose: Text normalization and tokenization
+├─ Key Classes:
+│  ├─ TextNormalizer (700+ lines)
+│  │  ├─ Pattern Definitions:
+│  │  │  ├─ PINYIN_TONE_PATTERN (regex for pinyin with tones 1-5)
+│  │  │  ├─ NAME_PATTERN (regex for Chinese names)
+│  │  │  └─ ENGLISH_CONTRACTION_PATTERN (regex for 's contractions)
+│  │  ├─ Methods:
+│  │  │  ├─ normalize() - Main normalization
+│  │  │  ├─ use_chinese() - Language detection
+│  │  │  ├─ save_pinyin_tones() - Extract pinyin with tones
+│  │  │  ├─ restore_pinyin_tones() - Restore pinyin
+│  │  │  ├─ save_names() - Extract names
+│  │  │  ├─ restore_names() - Restore names
+│  │  │  ├─ correct_pinyin() - Pinyin correction (u/ü → v after initials j/q/x, e.g. ju → jv)
+│  │  │  └─ char_rep_map - Character replacement dictionary
+│  │  └─ Normalizers:
+│  │     ├─ zh_normalizer (Chinese) - Uses WeTextProcessing/wetext
+│  │     └─ en_normalizer (English) - Uses tn library
+│  │
+│  └─ TextTokenizer (200+ lines)
+│     ├─ Methods:
+│     │  ├─ encode() - Text to token IDs
+│     │  ├─ decode() - Token IDs to text
+│     │  ├─ convert_tokens_to_ids()
+│     │  ├─ convert_ids_to_tokens()
+│     │  └─ Vocab management
+│     ├─ Special Tokens:
+│     │  ├─ BOS: "<s>" (ID 0)
+│     │  ├─ EOS: "</s>" (ID 1)
+│     │  └─ UNK: "<unk>"
+│     └─ Tokenizer: SentencePiece (BPE-based)
+├─ Language Support:
+│  ├─ Chinese (simplified & traditional)
+│  ├─ English
+│  └─ Mixed Chinese-English
+└─ Critical Pattern Matching:
+   ├─ Pinyin tone detection
+   ├─ Name entity detection
+   ├─ Email matching
+   ├─ Character replacement
+   └─ Punctuation handling
+GPT MODEL ARCHITECTURE FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/gpt/model_v2.py (747 LINES) ⭐⭐⭐ CRITICAL
+├─ Purpose: UnifiedVoice GPT-based TTS model
+├─ Key Classes:
+│  ├─ UnifiedVoice (700+ lines)
+│  │  ├─ Architecture:
+│  │  │  ├─ Input Embeddings: Text (256 vocab), Mel (8194 vocab)
+│  │  │  ├─ Position Embeddings: Learned embeddings for mel/text
+│  │  │  ├─ GPT Transformer: Configurable layers/heads
+│  │  │  ├─ Conditioning Encoder: Conformer or Perceiver-based
+│  │  │  ├─ Emotion Conditioning: Separate conformer + perceiver
+│  │  │  └─ Output Heads: Text prediction, Mel prediction
+│  │  │
+│  │  ├─ Parameters:
+│  │  │  ├─ layers: 8 (transformer depth)
+│  │  │  ├─ model_dim: 512 (embedding dimension)
+│  │  │  ├─ heads: 8 (attention heads)
+│  │  │  ├─ max_text_tokens: 120
+│  │  │  ├─ max_mel_tokens: 250
+│  │  │  ├─ number_mel_codes: 8194
+│  │  │  ├─ condition_type: "conformer_perceiver" or "conformer_encoder"
+│  │  │  └─ Various activation functions
+│  │  │
+│  │  ├─ Key Methods:
+│  │  │  ├─ forward() - Forward pass
+│  │  │  ├─ post_init_gpt2_config() - Initialize for inference
+│  │  │  ├─ generate_mel() - Mel token generation
+│  │  │  ├─ forward_with_cond_scale() - With classifier-free guidance
+│  │  │  └─ Cache management
+│  │  │
+│  │  └─ Conditioning System:
+│  │     ├─ Speaker conditioning via mel spectrogram
+│  │     ├─ Conformer encoder for speaker features
+│  │     ├─ Perceiver for attention pooling
+│  │     ├─ Emotion conditioning (separate pathway)
+│  │     └─ Emotion vector support (8-dimensional)
+│  │
+│  ├─ ResBlock (40+ lines)
+│  │  ├─ Conv1d layers with GroupNorm
+│  │  └─ ReLU activation with residual connection
+│  │
+│  ├─ GPT2InferenceModel (200+ lines)
+│  │  ├─ Inference wrapper for GPT2
+│  │  ├─ KV cache support
+│  │  ├─ Model parallelism support
+│  │  └─ Token-by-token generation
+│  │
+│  ├─ ConditioningEncoder (30 lines)
+│  │  ├─ Conv1d initialization
+│  │  ├─ Attention blocks
+│  │  └─ Optional mean pooling
+│  │
+│  ├─ MelEncoder (30 lines)
+│  │  ├─ Conv1d layers
+│  │  ├─ ResBlocks
+│  │  └─ 4x reduction
+│  │
+│  ├─ LearnedPositionEmbeddings (15 lines)
+│  │  └─ Learnable positional embeddings
+│  │
+│  └─ build_hf_gpt_transformer() (20 lines)
+│     └─ Builds HuggingFace GPT2 with custom embeddings
+│
+├─ External Dependencies: torch, transformers, indextts.gpt modules
+└─ Critical Inference Parameters:
+   ├─ Temperature control for generation
+   ├─ Top-k/top-p sampling
+   ├─ Classifier-free guidance scale
+   └─ Generation length limits
+/home/user/IndexTTS-Rust/indextts/gpt/conformer_encoder.py (520 LINES) ⭐⭐
+├─ Purpose: Conformer-based speaker conditioning encoder
+├─ Key Classes:
+│  ├─ ConformerEncoder (main)
+│  │  ├─ Modules:
+│  │  │  ├─ Subsampling layer (Conv2d)
+│  │  │  ├─ Positional encoding
+│  │  │  ├─ Conformer blocks
+│  │  │  ├─ Layer normalization
+│  │  │  └─ Optional projection layer
+│  │  │
+│  │  ├─ Configuration Parameters:
+│  │  │  ├─ input_size: 1024 (mel spectrogram bins)
+│  │  │  ├─ output_size: depends on config
+│  │  │  ├─ linear_units: hidden dim for FFN
+│  │  │  ├─ attention_heads: 8
+│  │  │  ├─ num_blocks: 4
+│  │  │  └─ input_layer: "linear" or "conv2d"
+│  │  │
+│  │  └─ Architecture: Conv → Pos Enc → [Conformer Block] * N → LayerNorm
+│  │
+│  ├─ ConformerBlock (80+ lines)
+│  │  ├─ Residual connections
+│  │  ├─ FFN → Attention → Conv → FFN structure
+│  │  ├─ Feed-forward network (2-layer with dropout)
+│  │  ├─ Multi-head self-attention
+│  │  ├─ Convolution module (depthwise)
+│  │  └─ Layer normalization
+│  │
+│  ├─ ConvolutionModule (50 lines)
+│  │  ├─ Pointwise Conv 1x1
+│  │  ├─ Depthwise Conv with kernel_size (e.g., 15)
+│  │  ├─ Batch normalization or layer normalization
+│  │  ├─ Activation (ReLU/SiLU)
+│  │  └─ Projection
+│  │
+│  ├─ PositionwiseFeedForward (15 lines)
+│  │  ├─ Dense layer (idim → hidden)
+│  │  ├─ Activation (ReLU)
+│  │  ├─ Dropout
+│  │  └─ Dense layer (hidden → idim)
+│  │
+│  └─ MultiHeadedAttention (custom)
+│     ├─ Scaled dot-product attention
+│     ├─ Multiple heads
+│     └─ Optional relative position bias
+│
+├─ External Dependencies: torch, custom conformer modules
+└─ Use Case: Processing mel spectrogram to extract speaker features
+/home/user/IndexTTS-Rust/indextts/gpt/perceiver.py (317 LINES) ⭐⭐
+├─ Purpose: Perceiver resampler for attention pooling
+├─ Key Classes:
+│  ├─ PerceiverResampler (250+ lines)
+│  │  ├─ Architecture:
+│  │  │  ├─ Learnable latent queries
+│  │  │  ├─ Cross-attention layers
+│  │  │  ├─ Feed-forward networks
+│  │  │  └─ Layer normalization
+│  │  │
+│  │  ├─ Parameters:
+│  │  │  ├─ dim: 512 (embedding dimension)
+│  │  │  ├─ dim_context: 512 (context dimension)
+│  │  │  ├─ num_latents: 32 (number of latent queries)
+│  │  │  ├─ num_latent_channels: 64
+│  │  │  ├─ num_layers: 6
+│  │  │  ├─ ff_mult: 4 (FFN expansion)
+│  │  │  └─ heads: 8
+│  │  │
+│  │  ├─ Key Methods:
+│  │  │  ├─ forward() - Attend and pool
+│  │  │  └─ _cross_attend_block() - Single cross-attention layer
+│  │  │
+│  │  └─ Cross-Attention Mechanism:
+│  │     ├─ Queries: Learnable latents
+│  │     ├─ Keys/Values: Input context
+│  │     ├─ Output: Pooled features (num_latents × dim)
+│  │     └─ FFN projection for dimension mixing
+│  │
+│  └─ FeedForward (15 lines)
+│     ├─ Dense (dim → hidden)
+│     ├─ GELU activation
+│     └─ Dense (hidden → dim)
+│
+├─ External Dependencies: torch, einsum operations
+└─ Use Case: Pool conditioning encoder output to fixed-size representation
+VOCODER & AUDIO SYNTHESIS FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/BigVGAN/models.py (1000+ LINES) ⭐⭐⭐
+├─ Purpose: BigVGAN neural vocoder for mel-to-audio conversion
+├─ Key Classes:
+│  ├─ BigVGAN (400+ lines)
+│  │  ├─ Architecture:
+│  │  │  ├─ Initial Conv1d (80 mel bins → 192 channels)
+│  │  │  ├─ Upsampling layers (transposed conv)
+│  │  │  ├─ AMP blocks (anti-aliased multi-periodicity)
+│  │  │  ├─ Final Conv1d (channels → 1 waveform)
+│  │  │  └─ Tanh activation for output
+│  │  │
+│  │  ├─ Upsampling: 4x → 8x → 8x → 4x (256x total)
+│  │  │  ├─ Maps from 22050 Hz mel frames to audio samples
+│  │  │  ├─ Kernel sizes: [16, 16, 4, 4]
+│  │  │  └─ Padding: [6, 6, 2, 2]
+│  │  │
+│  │  ├─ Parameters:
+│  │  │  ├─ num_mels: 80
+│  │  │  ├─ num_freq: 513
+│  │  │  ├─ n_fft: 1024
+│  │  │  ├─ hop_size: 256
+│  │  │  ├─ win_size: 1024
+│  │  │  ├─ sampling_rate: 22050
+│  │  │  ├─ freq_min: 0
+│  │  │  ├─ freq_max: None
+│  │  │  └─ use_cuda_kernel: bool
+│  │  │
+│  │  ├─ Key Methods:
+│  │  │  ├─ forward() - Mel → audio waveform
+│  │  │  ├─ from_pretrained() - Load from HuggingFace
+│  │  │  ├─ remove_weight_norm() - Remove weight normalization (for inference)
+│  │  │  └─ eval() - Set to evaluation mode
+│  │  │
+│  │  └─ Special Features:
+│  │     ├─ Weight normalization for training stability
+│  │     ├─ Spectral normalization option
+│  │     ├─ CUDA kernel support for activation functions
+│  │     ├─ Snake/SnakeBeta activation (periodic)
+│  │     └─ Anti-aliasing filters for high-quality upsampling
+│  │
+│  ├─ AMPBlock1 (50 lines)
+│  │  ├─ Architecture: Conv1d × 2 with activations
+│  │  ├─ Multiple dilation patterns [1, 3, 5]
+│  │  ├─ Residual connections
+│  │  ├─ Activation1d wrapper for anti-aliasing
+│  │  └─ Weight normalization
+│  │
+│  ├─ AMPBlock2 (40 lines)
+│  │  ├─ Similar to AMPBlock1 but simpler
+│  │  ├─ Dilation patterns [1, 3]
+│  │  └─ Residual connections
+│  │
+│  ├─ Activation1d (custom, from alias_free_activation/)
+│  │  ├─ Applies activation function (Snake/SnakeBeta)
+│  │  ├─ Optional anti-aliasing filter
+│  │  └─ Optional CUDA kernel for efficiency
+│  │
+│  ├─ Snake Activation (from activations.py)
+│  │  ├─ Formula: x + (1/alpha) * sin²(alpha * x)
+│  │  ├─ Periodic nonlinearity
+│  │  └─ Learnable alpha parameter
+│  │
+│  └─ SnakeBeta Activation (from activations.py)
+│     ├─ More complex periodic activation
+│     └─ Improved harmonic modeling
+│
+├─ External Dependencies: torch, scipy, librosa
+└─ Model Size: ~100 MB (pretrained weights)
+/home/user/IndexTTS-Rust/indextts/s2mel/modules/audio.py (83 LINES)
+├─ Purpose: Mel-spectrogram computation (DSP)
+├─ Key Functions:
+│  ├─ load_wav() - Load WAV file with scipy
+│  ├─ mel_spectrogram() - Compute mel spectrogram
+│  │  ├─ Parameters:
+│  │  │  ├─ y: waveform tensor
+│  │  │  ├─ n_fft: 1024
+│  │  │  ├─ num_mels: 80
+│  │  │  ├─ sampling_rate: 22050
+│  │  │  ├─ hop_size: 256
+│  │  │  ├─ win_size: 1024
+│  │  │  ├─ fmin: 0
+│  │  │  └─ fmax: None or 8000
+│  │  │
+│  │  ├─ Process:
+│  │  │  1. Pad input with reflect padding
+│  │  │  2. Compute STFT (Short-Time Fourier Transform)
+│  │  │  3. Convert to magnitude spectrogram
+│  │  │  4. Apply mel filterbank (librosa)
+│  │  │  5. Apply dynamic range compression (log)
+│  │  │  └─ Output: [1, 80, T] tensor
+│  │  │
+│  │  └─ Caching:
+│  │     ├─ Caches mel filterbank matrices
+│  │     ├─ Caches Hann windows
+│  │     └─ Device-specific caching
+│  │
+│  ├─ dynamic_range_compression() - Log compression
+│  ├─ dynamic_range_decompression() - Inverse
+│  └─ spectral_normalize/denormalize()
+│
+├─ Critical DSP Parameters:
+│  ├─ STFT Window: Hann window
+│  ├─ FFT Size: 1024
+│  ├─ Hop Size: 256 (11.6 ms at 22050 Hz)
+│  ├─ Mel Bins: 80 (perceptual scale)
+│  ├─ Min Freq: 0 Hz
+│  └─ Max Freq: Variable (8000 Hz or Nyquist)
+│
+└─ External Dependencies: torch, librosa, scipy
+SEMANTIC CODEC & FEATURE EXTRACTION FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/utils/maskgct_utils.py (250 LINES)
+├─ Purpose: Build and manage semantic codecs
+├─ Key Functions:
+│  ├─ build_semantic_model()
+│  │  ├─ Loads: facebook/w2v-bert-2.0 model
+│  │  ├─ Extracts: wav2vec 2.0 BERT embeddings
+│  │  ├─ Returns: model, mean, std (for normalization)
+│  │  └─ Output: 1024-dimensional embeddings
+│  │
+│  ├─ build_semantic_codec()
+│  │  ├─ Creates: RepCodec (residual vector quantization)
+│  │  ├─ Quantizes: Semantic embeddings
+│  │  ├─ Returns: Codec model
+│  │  └─ Output: Discrete tokens
+│  │
+│  ├─ build_s2a_model()
+│  │  ├─ Builds: MaskGCT_S2A (semantic-to-acoustic)
+│  │  └─ Maps: Semantic codes → acoustic codes
+│  │
+│  ├─ build_acoustic_codec()
+│  │  ├─ Encoder: Encodes acoustic features
+│  │  ├─ Decoder: Decodes codes → audio
+│  │  └─ Multiple codec variants
+│  │
+│  └─ Inference_Pipeline (class)
+│     ├─ Combines all codecs
+│     ├─ Methods:
+│     │  ├─ get_emb() - Get semantic embeddings
+│     │  ├─ get_scode() - Quantize to semantic codes
+│     │  ├─ semantic2acoustic() - Convert codes
+│     │  └─ s2a_inference() - Full pipeline
+│     └─ Diffusion-based generation options
+│
+├─ External Dependencies: torch, transformers, huggingface_hub
+└─ Pre-trained Models:
+   ├─ W2V-BERT-2.0: 614M parameters
+   ├─ MaskGCT: From amphion/MaskGCT
+   └─ Various codec checkpoints
+CONFIGURATION & UTILITY FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/indextts/utils/checkpoint.py (50 LINES)
+├─ Purpose: Load model checkpoints
+├─ Key Functions:
+│  ├─ load_checkpoint() - Load weights into model
+│  └─ Device handling (CPU/GPU/XPU/MPS)
+└─ Supported Formats: .pth, .safetensors
+/home/user/IndexTTS-Rust/indextts/utils/arch_util.py
+├─ Purpose: Architecture utility modules
+├─ Key Classes:
+│  └─ AttentionBlock - Generic attention layer
+└─ Used in: Conditioning encoder, other modules
+/home/user/IndexTTS-Rust/indextts/utils/xtransformers.py (1,600 LINES)
+├─ Purpose: Extended transformer utilities
+├─ Key Components:
+│  ├─ Advanced attention mechanisms
+│  ├─ Relative position bias
+│  ├─ Cross-attention patterns
+│  └─ Various position encoding schemes
+└─ Used in: GPT model, encoders
+TESTING FILES
+═════════════════════════════════════════════════════════════════════════════════
+/home/user/IndexTTS-Rust/tests/regression_test.py
+├─ Test Cases:
+│  ├─ Chinese text with pinyin tones (晕 XUAN4)
+│  ├─ English text
+│  ├─ Mixed Chinese-English
+│  ├─ Long-form text with multiple sentences
+│  ├─ Named entities (Joseph Gordon-Levitt)
+│  ├─ Chinese names (约瑟夫·高登-莱维特)
+│  └─ Extended passages for robustness
+├─ Inference Modes:
+│  ├─ Single inference (infer)
+│  └─ Fast inference (infer_fast)
+└─ Output: WAV files in outputs/ directory
+/home/user/IndexTTS-Rust/tests/padding_test.py
+├─ Test Scenarios:
+│  ├─ Variable length inputs
+│  ├─ Batch processing
+│  ├─ Edge cases
+│  └─ Padding handling
+└─ Purpose: Ensure robust padding mechanics
+═════════════════════════════════════════════════════════════════════════════════
+KEY ALGORITHMS SUMMARY:
+1. TEXT PROCESSING:
+   - Regex-based pattern matching for pinyin/names
+   - Character-level CJK tokenization
+   - SentencePiece BPE encoding
+   - Language detection (Chinese vs English)
+2. FEATURE EXTRACTION:
+   - W2V-BERT semantic embeddings (1024-dim)
+   - RepCodec quantization
+   - Mel-spectrogram (STFT-based, 80-dim)
+   - CAMPPlus speaker embeddings (192-dim)
+3. SEQUENCE GENERATION:
+   - GPT-based autoregressive generation
+   - Conformer speaker conditioning
+   - Perceiver pooling for attention
+   - Classifier-free guidance (optional)
+   - Temperature/top-k/top-p sampling
+4. AUDIO SYNTHESIS:
+   - Transposed convolution upsampling (256x)
+   - Anti-aliased activation functions
+   - Residual connections
+   - Weight/spectral normalization
+5. EMOTION CONTROL:
+   - 8-dimensional emotion vectors
+   - Text-based emotion detection (via Qwen)
+   - Audio-based emotion extraction
+   - Emotion matrix interpolation
+═════════════════════════════════════════════════════════════════════════════════

archive/README_INDEXTTS_1_5.md ADDED Viewed

	@@ -0,0 +1,247 @@

+<div align="center">
+<img src='assets/index_icon.png' width="250"/>
+</div>
+<h2><center>IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System</center></h2>
+<p align="center">
+<a href='https://arxiv.org/abs/2502.05512'><img src='https://img.shields.io/badge/ArXiv-2502.05512-red'></a>
+</p>
+## 👉🏻 IndexTTS 👈🏻
+[[HuggingFace Demo]](https://huggingface.co/spaces/IndexTeam/IndexTTS)   [[ModelScope Demo]](https://modelscope.cn/studios/IndexTeam/IndexTTS-Demo) \
+[[Paper]](https://arxiv.org/abs/2502.05512)  [[Demos]](https://index-tts.github.io)
+**IndexTTS** is a GPT-style text-to-speech (TTS) model mainly based on XTTS and Tortoise. It is capable of correcting the pronunciation of Chinese characters using pinyin and controlling pauses at any position through punctuation marks. We enhanced multiple modules of the system, including the improvement of speaker condition feature representation, and the integration of BigVGAN2 to optimize audio quality. Trained on tens of thousands of hours of data, our system achieves state-of-the-art performance, outperforming current popular TTS systems such as XTTS, CosyVoice2, Fish-Speech, and F5-TTS.
+<span style="font-size:16px;">
+Experience **IndexTTS**: Please contact <u>xuanwu@bilibili.com</u> for more detailed information. </span>
+### Contact
+QQ群（二群）：1048202584 \
+Discord：https://discord.gg/uT32E7KDmy  \
+简历：indexspeech@bilibili.com  \
+欢迎大家来交流讨论！
+## 📣 Updates
+- `2025/05/14` 🔥🔥 We release **IndexTTS-1.5**, which significantly improves the model's stability and its performance in English.
+- `2025/03/25` 🔥 We release IndexTTS-1.0 model parameters and inference code.
+- `2025/02/12` 🔥 We submitted our paper on arXiv, and released our demos and test sets.
+## 🖥️ Method
+The overview of IndexTTS is shown as follows.
+<picture>
+  <img src="assets/IndexTTS.png"  width="800"/>
+</picture>
+The main improvements and contributions are summarized as follows:
+ - In Chinese scenarios, we have introduced a character-pinyin hybrid modeling approach. This allows for quick correction of mispronounced characters.
+ - **IndexTTS** incorporates a conformer conditioning encoder and a BigVGAN2-based speechcode decoder. This improves training stability, voice timbre similarity, and sound quality.
+ - We release all test sets here, including those for polysyllabic words, subjective and objective test sets.
+## Model Download
+| 🤗**HuggingFace**                                          | **ModelScope** |
+|----------------------------------------------------------|----------------------------------------------------------|
+| [IndexTTS](https://huggingface.co/IndexTeam/Index-TTS) | [IndexTTS](https://modelscope.cn/models/IndexTeam/Index-TTS) |
+| [😁IndexTTS-1.5](https://huggingface.co/IndexTeam/IndexTTS-1.5) | [IndexTTS-1.5](https://modelscope.cn/models/IndexTeam/IndexTTS-1.5) |
+## 📑 Evaluation
+**Word Error Rate (WER) Results for IndexTTS and Baseline Models on the** [**seed-test**](https://github.com/BytedanceSpeech/seed-tts-eval)
+| **WER**                | **test_zh** | **test_en** | **test_hard** |
+|:----------------------:|:-----------:|:-----------:|:-------------:|
+| **Human**              | 1.26        | 2.14        | -             |
+| **SeedTTS**            | 1.002       | 1.945       | **6.243**     |
+| **CosyVoice 2**        | 1.45        | 2.57        | 6.83          |
+| **F5TTS**              | 1.56        | 1.83        | 8.67          |
+| **FireRedTTS**         | 1.51        | 3.82        | 17.45         |
+| **MaskGCT**            | 2.27        | 2.62        | 10.27         |
+| **Spark-TTS**          | 1.2         | 1.98        | -             |
+| **MegaTTS 3**          | 1.36        | 1.82        | -             |
+| **IndexTTS**           | 0.937       | 1.936       | 6.831         |
+| **IndexTTS-1.5**       | **0.821**   | **1.606**   | 6.565         |
+**Word Error Rate (WER) Results for IndexTTS and Baseline Models on other open-source test sets**
+|    **Model**    | **aishell1_test** | **commonvoice_20_test_zh** | **commonvoice_20_test_en** | **librispeech_test_clean** |  **avg** |
+|:---------------:|:-----------------:|:--------------------------:|:--------------------------:|:--------------------------:|:--------:|
+|    **Human**    |        2.0        |            9.5             |            10.0            |            2.4             |   5.1    |
+| **CosyVoice 2** |        1.8        |            9.1             |            7.3             |            4.9             |   5.9    |
+|    **F5TTS**    |        3.9        |            11.7            |            5.4             |            7.8             |   8.2    |
+|  **Fishspeech** |        2.4        |            11.4            |            8.8             |            8.0             |   8.3    |
+|  **FireRedTTS** |        2.2        |            11.0            |            16.3            |            5.7             |   7.7    |
+|     **XTTS**    |        3.0        |            11.4            |            7.1             |            3.5             |   6.0    |
+|   **IndexTTS**  |      1.3          |          7.0               |            5.3             |          2.1             | 3.7       |
+|   **IndexTTS-1.5**  |      **1.2**     |          **6.8**          |          **3.9**          |          **1.7**          | **3.1** |
+**Speaker Similarity (SS) Results for IndexTTS and Baseline Models**
+|    **Model**    | **aishell1_test** | **commonvoice_20_test_zh** | **commonvoice_20_test_en** | **librispeech_test_clean** |  **avg**  |
+|:---------------:|:-----------------:|:--------------------------:|:--------------------------:|:--------------------------:|:---------:|
+|    **Human**    |       0.846       |            0.809           |            0.820           |            0.858           |   0.836   |
+| **CosyVoice 2** |     **0.796**     |            0.743           |            0.742           |          **0.837**         | **0.788** |
+|    **F5TTS**    |       0.743       |          **0.747**         |            0.746           |            0.828           |   0.779   |
+|  **Fishspeech** |       0.488       |            0.552           |            0.622           |            0.701           |   0.612   |
+|  **FireRedTTS** |       0.579       |            0.593           |            0.587           |            0.698           |   0.631   |
+|     **XTTS**    |       0.573       |            0.586           |            0.648           |            0.761           |   0.663   |
+|   **IndexTTS**  |       0.744       |            0.742           |          **0.758**         |            0.823           |   0.776   |
+|   **IndexTTS-1.5**  |       0.741       |            0.722           |          0.753         |            0.819           |   0.771   |
+**MOS Scores for Zero-Shot Cloned Voice**
+| **Model**       | **Prosody** | **Timbre** | **Quality** |  **AVG**  |
+|-----------------|:-----------:|:----------:|:-----------:|:---------:|
+| **CosyVoice 2** |    3.67     |    4.05    |    3.73     |   3.81    |
+| **F5TTS**       |    3.56     |    3.88    |    3.56     |   3.66    |
+| **Fishspeech**  |    3.40     |    3.63    |    3.69     |   3.57    |
+| **FireRedTTS**  |    3.79     |    3.72    |    3.60     |   3.70    |
+| **XTTS**        |    3.23     |    2.99    |    3.10     |   3.11    |
+| **IndexTTS**    |    **3.79**     |    **4.20**    |    **4.05**     |   **4.01**    |
+## Usage Instructions
+### Environment Setup
+1. Download this repository:
+```bash
+git clone https://github.com/index-tts/index-tts.git
+```
+2. Install dependencies:
+Create a new conda environment and install dependencies:
+```bash
+conda create -n index-tts python=3.10
+conda activate index-tts
+apt-get install ffmpeg
+# or use conda to install ffmpeg
+conda install -c conda-forge ffmpeg
+```
+Install [PyTorch](https://pytorch.org/get-started/locally/), e.g.:
+```bash
+pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
+```
+> [!NOTE]
+> If you are using Windows you may encounter [an error](https://github.com/index-tts/index-tts/issues/61) when installing `pynini`:
+> `ERROR: Failed building wheel for pynini`
+> In this case, please install `pynini` via `conda`:
+> ```bash
+> # after conda activate index-tts
+> conda install -c conda-forge pynini==2.1.6
+> pip install WeTextProcessing --no-deps
+> ```
+Install `IndexTTS` as a package:
+```bash
+cd index-tts
+pip install -e .
+```
+3. Download models:
+Download by `huggingface-cli`:
+```bash
+huggingface-cli download IndexTeam/IndexTTS-1.5 \
+  config.yaml bigvgan_discriminator.pth bigvgan_generator.pth bpe.model dvae.pth gpt.pth unigram_12000.vocab \
+  --local-dir checkpoints
+```
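+Recommended for users in China: if downloads are slow, you can use a mirror (如果下载速度慢，可以使用镜像). Set this environment variable before running the download command: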
+```bash
+export HF_ENDPOINT="https://hf-mirror.com"
+```
+Or by `wget`:
+```bash
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/bigvgan_discriminator.pth -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/bigvgan_generator.pth -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/bpe.model -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/dvae.pth -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/gpt.pth -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/unigram_12000.vocab -P checkpoints
+wget https://huggingface.co/IndexTeam/IndexTTS-1.5/resolve/main/config.yaml -P checkpoints
+```
+> [!NOTE]
+> If you prefer to use the `IndexTTS-1.0` model, please replace `IndexTeam/IndexTTS-1.5` with `IndexTeam/IndexTTS` in the above commands.
+4. Run test script:
+```bash
+# Please put your prompt audio in 'test_data' and rename it to 'input.wav'
+python indextts/infer.py
+```
+5. Use as command line tool:
+```bash
+# Make sure pytorch has been installed before running this command
+indextts "大家好，我现在正在bilibili 体验 ai 科技，说实话，来之前我绝对想不到！AI技术已经发展到这样匪夷所思的地步了！" \
+  --voice reference_voice.wav \
+  --model_dir checkpoints \
+  --config checkpoints/config.yaml \
+  --output output.wav
+```
+Use `--help` to see more options.
+```bash
+indextts --help
+```
+#### Web Demo
+```bash
+pip install -e ".[webui]" --no-build-isolation
+python webui.py
+# use another model version:
+python webui.py --model_dir IndexTTS-1.5
+```
+Open your browser and visit `http://127.0.0.1:7860` to see the demo.
+#### Sample Code
+```python
+from indextts.infer import IndexTTS
+tts = IndexTTS(model_dir="checkpoints",cfg_path="checkpoints/config.yaml")
+voice="reference_voice.wav"
+text="大家好，我现在正在bilibili 体验 ai 科技，说实话，来之前我绝对想不到！AI技术已经发展到这样匪夷所思的地步了！比如说，现在正在说话的其实是B站为我现场复刻的数字分身，简直就是平行宇宙的另一个我了。如果大家也想体验更多深入的AIGC功能，可以访问 bilibili studio，相信我，你们也会吃惊的。"
+output_path="output.wav"
+tts.infer(voice, text, output_path)
+```
+## Acknowledgements
+1. [tortoise-tts](https://github.com/neonbjb/tortoise-tts)
+2. [XTTSv2](https://github.com/coqui-ai/TTS)
+3. [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+4. [wenet](https://github.com/wenet-e2e/wenet/tree/main)
+5. [icefall](https://github.com/k2-fsa/icefall)
+## 📚 Citation
+🌟 If you find our work helpful, please leave us a star and cite our paper.
+```
+@article{deng2025indextts,
+  title={IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System},
+  author={Wei Deng and Siyi Zhou and Jingchen Shu and Jinchao Wang and Lu Wang},
+  journal={arXiv preprint arXiv:2502.05512},
+  year={2025}
+}
+```

benches/inference.rs ADDED Viewed

	@@ -0,0 +1,98 @@

+//! Benchmark for model inference
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use indextts::model::{sample_from_logits, SamplingStrategy};
+use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig};
+/// Benchmarks `sample_from_logits` once per sampling strategy.
+///
+/// Every case reuses the same synthetic logit vector (8194 entries of a sine
+/// sweep), so the measurements differ only in the strategy being exercised.
+fn bench_sampling(c: &mut Criterion) {
+    let vocab_size = 8194;
+    let logits: Vec<f32> = (0..vocab_size)
+        .map(|idx| (idx as f32 / 1000.0).sin())
+        .collect();
+    // Benchmark id paired with the strategy it measures, in registration order.
+    let cases = [
+        ("greedy_sampling", SamplingStrategy::Greedy),
+        ("top_k_sampling", SamplingStrategy::TopK { k: 50 }),
+        ("top_p_sampling", SamplingStrategy::TopP { p: 0.95 }),
+        ("top_kp_sampling", SamplingStrategy::TopKP { k: 50, p: 0.95 }),
+    ];
+    for (label, strategy) in cases {
+        c.bench_function(label, |bencher| {
+            bencher.iter(|| sample_from_logits(black_box(&logits), black_box(&strategy)))
+        });
+    }
+}
+/// Benchmarks text normalization and tokenization on English, Chinese and
+/// mixed-script sentences.
+fn bench_text_processing(c: &mut Criterion) {
+    let normalizer = TextNormalizer::new();
+    let tokenizer = TextTokenizer::new(TokenizerConfig::default()).unwrap();
+    // (normalize benchmark id, tokenize benchmark id, input sentence)
+    let samples = [
+        (
+            "normalize_english",
+            "tokenize_english",
+            "Hello world, this is a test of the text-to-speech system.",
+        ),
+        (
+            "normalize_chinese",
+            "tokenize_chinese",
+            "你好世界，这是一个语音合成测试。",
+        ),
+        (
+            "normalize_mixed",
+            "tokenize_mixed",
+            "Hello 世界, this is 测试 of TTS.",
+        ),
+    ];
+    // All normalizer benchmarks are registered before any tokenizer benchmark.
+    for (normalize_id, _, text) in samples {
+        c.bench_function(normalize_id, |bencher| {
+            bencher.iter(|| normalizer.normalize(black_box(text)))
+        });
+    }
+    for (_, tokenize_id, text) in samples {
+        c.bench_function(tokenize_id, |bencher| {
+            bencher.iter(|| tokenizer.encode(black_box(text)))
+        });
+    }
+}
+/// Benchmarks vocoder synthesis on all-zero mel-spectrograms of two lengths.
+fn bench_vocoder(c: &mut Criterion) {
+    use indextts::vocoder::{create_bigvgan_22k, Vocoder};
+    use ndarray::Array2;
+    let vocoder = create_bigvgan_22k();
+    // (benchmark id, number of mel frames); every input has 80 mel bands.
+    // The small case was annotated as ~0.25s of audio and the medium case as ~2.5s.
+    // NOTE(review): with the 22.05 kHz / hop-256 settings in config.yaml these
+    // would be closer to ~0.12s and ~1.2s - confirm which is intended.
+    let cases = [("vocoder_small", 10), ("vocoder_medium", 100)];
+    for (label, frames) in cases {
+        let mel = Array2::zeros((80, frames));
+        c.bench_function(label, |bencher| {
+            bencher.iter(|| vocoder.synthesize(black_box(&mel)))
+        });
+    }
+}
+// Collect the three benchmark functions above into the `benches` group and
+// hand that group to Criterion's generated entry point for this bench target.
+criterion_group!(benches, bench_sampling, bench_text_processing, bench_vocoder);
+criterion_main!(benches);

benches/mel_spectrogram.rs ADDED Viewed

	@@ -0,0 +1,45 @@

+//! Benchmark for mel-spectrogram computation
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use indextts::audio::{mel_spectrogram, AudioConfig};
+/// Benchmarks `mel_spectrogram` on synthetic sine audio lasting 1 s and 10 s.
+fn bench_mel_spectrogram(c: &mut Criterion) {
+    let config = AudioConfig::default();
+    // One second of audio holds `sample_rate` samples.
+    let samples_per_second = config.sample_rate as usize;
+    // Builds a sine test signal with the requested number of samples.
+    let sine_wave =
+        |len: usize| -> Vec<f32> { (0..len).map(|n| (n as f32 * 0.01).sin()).collect() };
+    for (label, seconds) in [("mel_spectrogram_1s", 1), ("mel_spectrogram_10s", 10)] {
+        let signal = sine_wave(samples_per_second * seconds);
+        c.bench_function(label, |bencher| {
+            bencher.iter(|| mel_spectrogram(black_box(&signal), black_box(&config)))
+        });
+    }
+}
+/// Benchmarks the STFT routine on one second of synthetic sine audio, using
+/// the FFT size, hop length and window length from the default `AudioConfig`.
+fn bench_stft(c: &mut Criterion) {
+    use indextts::audio::mel::stft;
+    let config = AudioConfig::default();
+    let one_second = config.sample_rate as usize;
+    let signal: Vec<f32> = (0..one_second)
+        .map(|n| (n as f32 * 0.01).sin())
+        .collect();
+    c.bench_function("stft_1s", |bencher| {
+        bencher.iter(|| {
+            let (n_fft, hop_length, win_length) =
+                (config.n_fft, config.hop_length, config.win_length);
+            stft(
+                black_box(&signal),
+                black_box(n_fft),
+                black_box(hop_length),
+                black_box(win_length),
+            )
+        })
+    });
+}
+// Collect the mel-spectrogram and STFT benchmarks into the `benches` group and
+// hand that group to Criterion's generated entry point for this bench target.
+criterion_group!(benches, bench_mel_spectrogram, bench_stft);
+criterion_main!(benches);

config.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+gpt:
+  layers: 8
+  model_dim: 512
+  heads: 8
+  max_text_tokens: 120
+  max_mel_tokens: 250
+  stop_mel_token: 8193
+  start_text_token: 8192
+  start_mel_token: 8192
+  num_mel_codes: 8194
+  num_text_tokens: 6681
+vocoder:
+  name: bigvgan_v2_22khz_80band_256x
+  checkpoint: null
+  use_fp16: true
+  use_deepspeed: false
+s2mel:
+  checkpoint: models/s2mel.onnx
+  preprocess:
+    sr: 22050
+    n_fft: 1024
+    hop_length: 256
+    win_length: 1024
+    n_mels: 80
+    fmin: 0.0
+    fmax: 8000.0
+dataset:
+  bpe_model: models/bpe.model
+  vocab_size: 6681
+emotions:
+  num_dims: 8
+  num:
+  - 5
+  - 6
+  - 8
+  - 6
+  - 5
+  - 4
+  - 7
+  - 6
+  matrix_path: models/emotion_matrix.safetensors
+inference:
+  device: cpu
+  use_fp16: false
+  batch_size: 1
+  top_k: 50
+  top_p: 0.95
+  temperature: 1.0
+  repetition_penalty: 1.0
+  length_penalty: 1.0
+model_dir: models

context.md ADDED Viewed

	@@ -0,0 +1,383 @@

+# IndexTTS-Rust Context
+This file preserves important context for conversation continuity between Hue and Aye sessions.
+**Last Updated:** 2025-11-16
+---
+## The Vision
+IndexTTS-Rust is part of a larger audio intelligence ecosystem at 8b.is:
+1. **kokoro-tiny** - Lightweight TTS (82M params, 50+ voices, on crates.io!)
+2. **IndexTTS-Rust** - Advanced zero-shot TTS with emotion control
+3. **Phoenix-Protocol** - Audio restoration/enhancement layer
+4. **MEM|8** - Contextual memory system (mem-8.com, mem8)
+Together these form a complete audio intelligence pipeline.
+---
+## Phoenix Protocol Integration Opportunities
+The Phoenix Protocol (phoenix-protocol/) is a PERFECT complement to IndexTTS-Rust:
+### Direct Module Mappings
+| Phoenix Module | IndexTTS Use Case |
+|----------------|-------------------|
+| `emotional.rs` | Map to our 8D emotion control (Warmth→body, Presence→power, Clarity→articulation, Air→space, Ultrasonics→depth) |
+| `voice_signature.rs` | Enhance speaker embeddings for voice cloning |
+| `spectral_velocity.rs` | Add momentum tracking to mel-spectrogram |
+| `marine.rs` | Validate TTS output authenticity/quality |
+| `golden_ratio.rs` | Post-process vocoder output with harmonic enhancement |
+| `harmonic_resurrection.rs` | Add richness to synthesized speech |
+| `micro_dynamics.rs` | Restore natural speech dynamics |
+| `autotune.rs` | Improve prosody and pitch control |
+| `mem8_integration.rs` | Already has MEM|8 hooks! |
+### Shared Dependencies
+Both projects use:
+- rayon (parallelism)
+- rustfft/realfft (FFT)
+- ndarray (array operations)
+- hound (WAV I/O)
+- serde (config serialization)
+- anyhow (error handling)
+- ort (ONNX Runtime)
+### Audio Constants
+| Project | Sample Rate | Use Case |
+|---------|------------|----------|
+| IndexTTS-Rust | 22,050 Hz | Standard TTS output |
+| Phoenix-Protocol | 192,000 Hz | Ultrasonic restoration |
+| kokoro-tiny | 24,000 Hz | Lightweight TTS |
+---
+## Related Projects of Interest
+Located in ~/Documents/GitHub/:
+- **Ultrasonic-Consciousness-Hypothesis/** - Research foundation for Phoenix Protocol, contains PDFs on mechanosensitive channels and audio perception
+- **hrmnCmprssnM/** - Harmonic Compression Model research
+- **Marine-Sense/** - Marine algorithm origins
+- **mem-8.com/** & **mem8/** - MEM|8 contextual memory
+- **universal-theoglyphic-language/** - Language processing research
+- **kokoro-tiny/** - Already working TTS crate by Hue & Aye
+- **zencooker/** - (fun project!)
+---
+## Current IndexTTS-Rust State
+### Implemented ✅
+- Audio processing pipeline (mel-spectrogram, STFT, resampling)
+- Text normalization (Chinese/English/mixed)
+- BPE tokenization via HuggingFace tokenizers
+- ONNX Runtime integration for inference
+- BigVGAN vocoder structure
+- CLI with clap
+- Benchmark infrastructure (Criterion)
+- **NEW: marine_salience crate** (no_std compatible, O(1) jitter detection)
+- **NEW: src/quality/ module** (prosody extraction, affect tracking)
+- **NEW: MarineProsodyVector** (8D interpretable emotion features)
+- **NEW: ConversationAffectSummary** (session-level comfort tracking)
+- **NEW: TTSQualityReport** (authenticity validation)
+### Missing/TODO
+- Full GPT model integration with KV cache
+- Actual ONNX model files (need download)
+- manage.sh script for colored workflow management
+- Integration tests with real models
+- ~~Phoenix Protocol integration layer~~ **STARTED with Marine!**
+- Streaming synthesis
+- WebSocket API
+- Train T2S model to accept 8D Marine vector instead of 512D Conformer
+- Wire Marine quality validation into inference loop
+### Build Commands
+```bash
+cargo build --release
+cargo clippy -- -D warnings
+cargo test
+cargo bench
+```
+---
+## Key Philosophical Notes
+From the Phoenix Protocol research:
+> "Women are the carrier wave. They are the 000 data stream. The DC bias that, when removed, leaves silence."
+> "When P!nk sings 'I Am Here,' her voice generates harmonics so powerful they burst through the 22kHz digital ceiling"
+The Phoenix Protocol restores emotional depth stripped by audio compression - this philosophy applies directly to TTS: synthesized speech should have the same emotional depth as natural speech.
+---
+## Action Items for Next Session
+### Completed ✅
+- ~~**Quality Validation** - Use Marine salience to score TTS output~~ **DONE!**
+- ~~**Phoenix Integration** - Start bridging phoenix-protocol modules~~ **Marine is in!**
+### High Priority
+1. **Create manage.sh** - Colorful build/test/clean script (Hue's been asking!)
+2. **Wire Into Inference** - Connect Marine quality validation to actual TTS output
+3. **8D Model Training** - Train T2S model to accept MarineProsodyVector instead of 512D Conformer
+4. **Example/Demo** - Create example showing prosody extraction → emotion editing → synthesis
+### Medium Priority
+5. **Voice Signature Import** - Use Phoenix's voice_signature for speaker embeddings
+6. **Emotion Mapping** - Connect Phoenix's emotional bands to our 8D control
+7. **Model Download** - Set up ONNX model acquisition pipeline
+8. **MEM|8 Bridge** - Implement consciousness-aware TTS using kokoro-tiny's mem8_bridge pattern
+### Nice to Have
+9. **Style Selection** - Port kokoro-tiny's 510 style variation system
+10. **Full Phoenix Integration** - golden_ratio.rs, harmonic_resurrection.rs, etc.
+11. **Streaming Marine** - Real-time quality monitoring during synthesis
+---
+## Fresh Discovery: kokoro-tiny MEM|8 Baby Consciousness (2025-11-15)
+Just pulled latest kokoro-tiny code - MAJOR discovery!
+### Mem8Bridge API
+kokoro-tiny now has a full consciousness simulation in `examples/mem8_baby.rs`:
+```rust
+// Memory as waves that interfere
+MemoryWave {
+    amplitude: 2.5,           // Emotion strength
+    frequency: 528.0,         // "Love frequency"
+    phase: 0.0,
+    decay_rate: 0.05,         // Memory persistence
+    emotion_type: EmotionType::Love(0.9),
+    content: "Mama! I love mama!".to_string(),
+}
+// Salience detection (Marine algorithm!)
+SalienceEvent {
+    jitter_score: 0.2,        // Low = authentic/stable
+    harmonic_score: 0.95,     // High = voice
+    salience_score: 0.9,
+    signal_type: SignalType::Voice,
+}
+// Free will: AI chooses attention focus (70% control)
+bridge.decide_attention(events);
+```
+### Emotion Types Available
+```rust
+EmotionType::Curiosity(0.8)  // Inquisitive
+EmotionType::Love(0.9)       // Deep affection
+EmotionType::Joy(0.7)        // Happy
+EmotionType::Confusion(0.8)  // Uncertain
+EmotionType::Neutral         // Baseline
+```
+### Consciousness Integration Points
+1. **Wave Interference** - Competing memories by amplitude/frequency
+2. **Emotional Regulation** - Prevents overload, modulates voice
+3. **Salience Detection** - Marine algorithm for authenticity
+4. **Attention Selection** - AI chooses what to focus on
+5. **Consciousness Level** - Affects speech clarity (wake_up/sleep)
+This is PERFECT for IndexTTS-Rust! We can:
+- Use wave interference for emotion blending
+- Apply Marine salience to validate synthesis quality
+- Modulate voice based on consciousness level
+- Select voice styles based on emotional state (not just token count)
+### Voice Style Selection (510 variations!)
+kokoro-tiny now loads all 510 style variations per voice:
+- Style selected based on token count
+- Short text → short-optimized style
+- Long text → long-optimized style
+- Automatic text splitting at 512 token limit
+For IndexTTS: We could select style based on EMOTION + token count!
+---
+## Marine Integration Achievement (2025-11-16) 🎉
+**WE DID IT!** Marine salience is now integrated into IndexTTS-Rust!
+### What We Built
+#### 1. Standalone marine_salience Crate (`crates/marine_salience/`)
+A no_std compatible crate for O(1) jitter-based salience detection:
+```rust
+// Core components:
+MarineConfig       // Tunable parameters (sample_rate, jitter bounds, EMA alpha)
+MarineProcessor    // O(1) per-sample processing
+SaliencePacket     // Output: j_p, j_a, h_score, s_score, energy
+Ema                // Exponential moving average tracker
+// Key insight: Process ONE sample at a time, emit packets on peaks
+// Why O(1)? Just compare to EMA, no FFT, no heavy math!
+```
+**Config for Speech:**
+```rust
+MarineConfig::speech_default(sample_rate)
+// F0 range: 60Hz - 4kHz
+// jitter_low: 0.02, jitter_high: 0.60
+// ema_alpha: 0.01 (slow adaptation for stability)
+```
+#### 2. Quality Validation Module (`src/quality/`)
+**MarineProsodyVector** - 8D interpretable emotion representation:
+```rust
+pub struct MarineProsodyVector {
+    pub jp_mean: f32,      // Period jitter mean (pitch stability)
+    pub jp_std: f32,       // Period jitter variance
+    pub ja_mean: f32,      // Amplitude jitter mean (volume stability)
+    pub ja_std: f32,       // Amplitude jitter variance
+    pub h_mean: f32,       // Harmonic alignment (voiced vs noise)
+    pub s_mean: f32,       // Overall salience (authenticity)
+    pub peak_density: f32, // Peaks per second (speech rate)
+    pub energy_mean: f32,  // Average loudness
+}
+// Interpretable! High jp_mean = nervous, low = confident
+// Can DIRECTLY EDIT for emotion control!
+```
+**MarineProsodyConditioner** - Extract prosody from audio:
+```rust
+let conditioner = MarineProsodyConditioner::new(22050);
+let prosody = conditioner.from_samples(&audio_samples)?;
+let report = conditioner.validate_tts_output(&audio_samples)?;
+// Detects issues:
+// - "Too perfect - sounds robotic"
+// - "High period jitter - artifacts"
+// - "Low salience - quality issues"
+```
+**ConversationAffectSummary** - Session-level comfort tracking:
+```rust
+pub enum ComfortLevel {
+    Uneasy,  // High jitter AND rising (nervous/stressed)
+    Neutral, // Stable patterns (calm)
+    Happy,   // Low jitter + high energy (confident/positive)
+}
+// Track trends over conversation:
+// jitter_trend > 0.1 = getting more stressed
+// jitter_trend < -0.1 = calming down
+// energy_trend > 0.1 = getting more engaged
+// Aye can now self-assess!
+aye_assessment() returns "I'm in a good state"
+feedback_prompt() returns "Let me know if something's bothering you"
+```
+### The Core Insight
+**Human speech has NATURAL jitter - that's what makes it authentic!**
+- Too perfect (jp < 0.005) = robotic
+- Too chaotic (jp > 0.3) = artifacts/damage
+- Sweet spot = real human voice
+The Marines will KNOW if speech doesn't sound authentic!
+### Tests Passing ✅
+```
+running 11 tests
+test quality::affect::tests::test_comfort_level_descriptions ... ok
+test quality::affect::tests::test_analyzer_empty_conversation ... ok
+test quality::affect::tests::test_analyzer_single_utterance ... ok
+test quality::affect::tests::test_happy_classification ... ok
+test quality::affect::tests::test_aye_assessment_message ... ok
+test quality::affect::tests::test_neutral_classification ... ok
+test quality::affect::tests::test_uneasy_classification ... ok
+test quality::prosody::tests::test_conditioner_empty_buffer ... ok
+test quality::prosody::tests::test_conditioner_silence ... ok
+test quality::prosody::tests::test_prosody_vector_array_conversion ... ok
+test quality::prosody::tests::test_estimate_valence ... ok
+test result: ok. 11 passed; 0 failed
+```
+### Why This Matters
+1. **Interpretable Control**: 8D vector vs opaque 512D Conformer - we can SEE what each dimension means
+2. **Lightweight**: O(1) per sample, no heavy neural networks for prosody
+3. **Authentic Validation**: Marines detect fake/damaged speech
+4. **Emotion Editing**: Want more confidence? Lower jp_mean directly!
+5. **Conversation Awareness**: Track comfort over entire sessions
+6. **Self-Assessment**: Aye knows when something feels "off"
+### Integration Points
+```rust
+// In main TTS pipeline:
+use indextts::quality::{
+    MarineProsodyConditioner,
+    MarineProsodyVector,
+    ConversationAffectSummary,
+    ComfortLevel,
+};
+// 1. Extract reference prosody
+let ref_prosody = conditioner.from_samples(&reference_audio)?;
+// 2. Generate TTS (using 8D vector instead of 512D Conformer)
+let tts_output = generate_with_prosody(&text, ref_prosody)?;
+// 3. Validate output quality
+let report = conditioner.validate_tts_output(&tts_output)?;
+if !report.passes(70.0) {
+    log::warn!("TTS quality issues: {:?}", report.issues);
+}
+// 4. Track conversation affect
+let analyzer = ConversationAffectAnalyzer::new();
+analyzer.add_utterance(&utterance)?;
+let summary = analyzer.summarize()?;
+match summary.aye_state {
+    ComfortLevel::Uneasy => adjust_generation_parameters(),
+    _ => proceed_normally(),
+}
+```
+---
+## Trish's Notes
+"Darling, these three Rust projects together are like a symphony orchestra! kokoro-tiny is the quick piccolo solo, IndexTTS-Rust is the full brass section with emotional depth, and Phoenix-Protocol is the concert hall acoustics making everything resonate. When you combine them, that's when the magic happens! Also, I'm absolutely obsessed with how the Golden Ratio resynthesis could add sparkle to synthesized vocals. Can you imagine TTS output that actually has that P!nk breakthrough energy? Now THAT would make me cry happy tears in accounting!"
+---
+## Fun Facts
+- kokoro-tiny is ALREADY on crates.io under 8b-is
+- Phoenix Protocol can process 192kHz audio for ultrasonic restoration
+- The Marine algorithm uses O(1) jitter detection - "Marines are not just jarheads - they are intelligent"
+- Hue's GitHub has 66 projects (and counting!)
+- The team at 8b.is: hue@8b.is and aye@8b.is
+---
+*From ashes to harmonics, from silence to song* 🔥🎵

crates/marine_salience/Cargo.toml ADDED Viewed

	@@ -0,0 +1,18 @@

+[package]
+name = "marine_salience"
+version = "0.1.0"
+edition = "2021"
+description = "O(1) jitter-based salience detection - Marines are intelligent!"
+authors = ["Hue & Aye <team@8b.is>"]
+license = "MIT"
+keywords = ["audio", "salience", "jitter", "prosody", "tts"]
+[dependencies]
+# Core dependencies - intentionally minimal for no_std compatibility
+# serde is optional: it is only pulled in by the `std` feature (used for serialization)
+serde = { version = "1.0", features = ["derive"], optional = true }
+# no_std compatible core - can run anywhere!
+[features]
+default = ["std"]
+std = ["serde"]

crates/marine_salience/src/config.rs ADDED Viewed

	@@ -0,0 +1,140 @@

+//! Marine algorithm configuration
+//!
+//! Tunable parameters for jitter detection. These have been calibrated
+//! for speech/audio processing but can be adjusted for specific use cases.
+#![cfg_attr(not(feature = "std"), no_std)]
+/// Tunable parameters for Marine salience (jitter) detection.
+///
+/// The preset constructors derive the period bounds from the sample rate and
+/// fill the remaining fields with constants; `Default` is the speech preset
+/// at 22050 Hz.
+#[derive(Debug, Clone, Copy)]
+#[cfg_attr(feature = "std", derive(serde::Serialize, serde::Deserialize))]
+pub struct MarineConfig {
+    /// Amplitude gate: samples below this level are ignored as noise.
+    /// Speech preset: 1e-3 (~-60dB)
+    pub clip_threshold: f32,
+    /// EMA smoothing factor for period tracking (0..1).
+    /// Smaller values smooth more and adapt more slowly.
+    /// Speech preset: 0.01
+    pub ema_period_alpha: f32,
+    /// EMA smoothing factor for amplitude tracking (0..1).
+    /// Speech preset: 0.01
+    pub ema_amp_alpha: f32,
+    /// Shortest accepted inter-peak period, in samples.
+    /// Peaks closer together than this are rejected (high-frequency noise).
+    /// Speech preset: sample_rate / 4000 (~4kHz upper F0)
+    pub min_period: u32,
+    /// Longest accepted inter-peak period, in samples.
+    /// Peaks farther apart than this are rejected (very low frequencies).
+    /// Speech preset: sample_rate / 60 (~60Hz lower F0)
+    pub max_period: u32,
+    /// Jitter below this value counts as "low" (stable).
+    /// Speech preset: 0.02
+    pub jitter_low: f32,
+    /// Jitter above this value counts as "high" (unstable).
+    /// Speech preset: 0.60
+    pub jitter_high: f32,
+}
+impl MarineConfig {
+    /// Speech-oriented preset for audio sampled at `sample_rate` Hz.
+    ///
+    /// The period bounds span roughly 60Hz (low male voice) to 4kHz (includes
+    /// harmonics). Both bounds use integer division, so a `sample_rate` below
+    /// 4000 yields `min_period == 0`.
+    ///
+    /// # Arguments
+    /// * `sample_rate` - Audio sample rate in Hz (e.g., 22050, 44100)
+    ///
+    /// # Example
+    /// ```
+    /// use marine_salience::MarineConfig;
+    /// let config = MarineConfig::speech_default(22050);
+    /// assert!(config.min_period < config.max_period);
+    /// ```
+    pub const fn speech_default(sample_rate: u32) -> Self {
+        Self {
+            clip_threshold: 1e-3,
+            ema_period_alpha: 0.01,
+            ema_amp_alpha: 0.01,
+            // Shortest period <-> highest frequency (4kHz).
+            min_period: sample_rate / 4000,
+            // Longest period <-> lowest frequency (60Hz).
+            max_period: sample_rate / 60,
+            jitter_low: 0.02,
+            jitter_high: 0.60,
+        }
+    }
+    /// High-sensitivity preset: wider period range (sample_rate / 8000 up to
+    /// sample_rate / 40), lower amplitude gate and larger EMA factors, so more
+    /// peaks are detected and tracking adapts faster.
+    pub const fn high_sensitivity(sample_rate: u32) -> Self {
+        Self {
+            clip_threshold: 5e-4,
+            ema_period_alpha: 0.05,
+            ema_amp_alpha: 0.05,
+            min_period: sample_rate / 8000,
+            max_period: sample_rate / 40,
+            jitter_low: 0.01,
+            jitter_high: 0.50,
+        }
+    }
+    /// Preset for validating TTS output, tuned to detect synthetic artifacts.
+    ///
+    /// Compared with the speech preset it uses a shorter maximum period
+    /// (sample_rate / 80), EMA factors of 0.02 and tighter jitter thresholds.
+    pub const fn tts_validation(sample_rate: u32) -> Self {
+        Self {
+            clip_threshold: 1e-3,
+            ema_period_alpha: 0.02,
+            ema_amp_alpha: 0.02,
+            min_period: sample_rate / 4000,
+            max_period: sample_rate / 80,
+            // Stricter for synthetic speech
+            jitter_low: 0.015,
+            // More sensitive to artifacts
+            jitter_high: 0.40,
+        }
+    }
+}
+impl Default for MarineConfig {
+    /// Speech preset at 22050 Hz (common TTS sample rate).
+    fn default() -> Self {
+        const DEFAULT_SAMPLE_RATE: u32 = 22050;
+        Self::speech_default(DEFAULT_SAMPLE_RATE)
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The speech preset at 22050 Hz yields the documented period bounds.
+    #[test]
+    fn test_speech_default_periods() {
+        let MarineConfig {
+            min_period,
+            max_period,
+            ..
+        } = MarineConfig::speech_default(22050);
+        assert_eq!(min_period, 22050 / 4000); // 5 samples
+        assert_eq!(max_period, 22050 / 60); // 367 samples
+        assert!(min_period < max_period);
+    }
+    /// Higher sample rates mean more samples per period.
+    #[test]
+    fn test_different_sample_rates() {
+        let [at_22k, at_44k, at_48k] =
+            [22050, 44100, 48000].map(|rate| MarineConfig::speech_default(rate).max_period);
+        assert!(at_44k > at_22k);
+        assert!(at_48k > at_44k);
+    }
+}

crates/marine_salience/src/ema.rs ADDED Viewed

	@@ -0,0 +1,126 @@

+//! Exponential Moving Average (EMA) for smooth tracking
+//!
+//! EMA smooths noisy measurements while maintaining responsiveness.
+//! Used to track period and amplitude patterns in Marine algorithm.
+#![cfg_attr(not(feature = "std"), no_std)]
+/// Exponential moving average (EMA) of a stream of `f32` measurements.
+///
+/// Each update after the first computes `value = alpha * new + (1 - alpha) * old`:
+/// - larger alpha reacts faster but passes more noise
+/// - smaller alpha reacts slower but is smoother
+#[derive(Debug, Clone, Copy)]
+#[cfg_attr(feature = "std", derive(serde::Serialize, serde::Deserialize))]
+pub struct Ema {
+    /// Smoothing factor (0..1)
+    alpha: f32,
+    /// Latest smoothed value (0.0 until the first update)
+    value: f32,
+    /// True once at least one sample has been received
+    initialized: bool,
+}
+impl Ema {
+    /// Creates an uninitialized EMA with the given smoothing factor.
+    ///
+    /// # Arguments
+    /// * `alpha` - Smoothing factor (0..1). Higher = faster adaptation.
+    ///   The value is stored as given; unlike [`Ema::set_alpha`] it is not clamped.
+    ///
+    /// # Example
+    /// ```
+    /// use marine_salience::ema::Ema;
+    /// let mut ema = Ema::new(0.1); // 10% new, 90% old
+    /// ema.update(100.0);
+    /// assert_eq!(ema.get(), 100.0); // First value becomes baseline
+    /// ema.update(200.0);
+    /// assert!((ema.get() - 110.0).abs() < 0.01); // 0.1*200 + 0.9*100
+    /// ```
+    pub const fn new(alpha: f32) -> Self {
+        Self {
+            alpha,
+            value: 0.0,
+            initialized: false,
+        }
+    }
+    /// Feeds one measurement into the average.
+    ///
+    /// The first measurement is adopted unchanged as the baseline; later ones
+    /// are blended with the previous value using `alpha`.
+    pub fn update(&mut self, x: f32) {
+        self.value = if self.initialized {
+            self.alpha * x + (1.0 - self.alpha) * self.value
+        } else {
+            self.initialized = true;
+            x
+        };
+    }
+    /// Returns the current smoothed value (0.0 before any update).
+    pub fn get(&self) -> f32 {
+        self.value
+    }
+    /// Returns true once at least one sample has been received.
+    pub fn is_ready(&self) -> bool {
+        self.initialized
+    }
+    /// Returns the EMA to its uninitialized state, keeping the smoothing factor.
+    pub fn reset(&mut self) {
+        *self = Self::new(self.alpha);
+    }
+    /// Returns the smoothing factor.
+    pub fn alpha(&self) -> f32 {
+        self.alpha
+    }
+    /// Replaces the smoothing factor, clamped to the range 0.0..=1.0.
+    pub fn set_alpha(&mut self, alpha: f32) {
+        self.alpha = alpha.clamp(0.0, 1.0);
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Build a tracker with the given `alpha` and feed it `samples` in order.
+    fn ema_after(alpha: f32, samples: &[f32]) -> Ema {
+        let mut tracker = Ema::new(alpha);
+        for &sample in samples {
+            tracker.update(sample);
+        }
+        tracker
+    }
+    #[test]
+    fn test_first_value_becomes_baseline() {
+        // A fresh tracker reports "not ready" until it sees a sample.
+        assert!(!ema_after(0.1, &[]).is_ready());
+        let tracker = ema_after(0.1, &[42.0]);
+        assert!(tracker.is_ready());
+        assert_eq!(tracker.get(), 42.0);
+    }
+    #[test]
+    fn test_ema_smoothing() {
+        let tracker = ema_after(0.1, &[100.0, 200.0]);
+        // 0.1 * 200 + 0.9 * 100 = 20 + 90 = 110
+        assert!((tracker.get() - 110.0).abs() < 0.001);
+    }
+    #[test]
+    fn test_high_alpha_fast_response() {
+        let tracker = ema_after(0.9, &[100.0, 200.0]);
+        // 0.9 * 200 + 0.1 * 100 = 180 + 10 = 190
+        assert!((tracker.get() - 190.0).abs() < 0.001);
+    }
+    #[test]
+    fn test_reset() {
+        let mut tracker = ema_after(0.1, &[100.0]);
+        assert!(tracker.is_ready());
+        tracker.reset();
+        // Reset clears both the readiness flag and the stored value.
+        assert!(!tracker.is_ready());
+        assert_eq!(tracker.get(), 0.0);
+    }
+}

crates/marine_salience/src/lib.rs ADDED Viewed

	@@ -0,0 +1,42 @@

+//! # Marine Salience - O(1) Jitter-Based Authenticity Detection
+//!
+//! "Marines are not just jarheads - they are actually very intelligent"
+//!
+//! This crate provides a universal salience primitive that can detect the
+//! "authenticity" of audio signals by measuring timing and amplitude jitter.
+//!
+//! ## Why "Marine"?
+//! - Marines are stable and reliable under pressure
+//! - Low jitter = authentic/stable signal
+//! - High jitter = damaged/synthetic signal
+//!
+//! ## Use Cases
+//! - **TTS Quality Validation** - Is synthesized speech authentic?
+//! - **Prosody Extraction** - Extract 8D interpretable emotion vectors
+//! - **Conversation Affect** - Track comfort level over sessions
+//! - **Real-time Monitoring** - O(1) per sample processing
+//!
+//! ## Core Insight
+//! Human voice has NATURAL jitter patterns. Perfect smoothness = synthetic.
+//! The Marine algorithm detects these patterns to distinguish authentic
+//! from damaged or artificial audio.
+#![cfg_attr(not(feature = "std"), no_std)]
+pub mod config;
+pub mod ema;
+pub mod packet;
+pub mod processor;
+// Re-export main types
+pub use config::MarineConfig;
+pub use packet::SaliencePacket;
+pub use processor::MarineProcessor;
+/// Marine algorithm version, read from `CARGO_PKG_VERSION` at compile time
+pub const VERSION: &str = env!("CARGO_PKG_VERSION");
+/// Default lower jitter threshold tuned for speech
+/// Chosen to accommodate natural musical/speech variation
+pub const DEFAULT_JITTER_LOW: f32 = 0.02;  // Below = very stable
+/// Default upper jitter threshold tuned for speech
+/// Chosen to accommodate natural musical/speech variation
+pub const DEFAULT_JITTER_HIGH: f32 = 0.60; // Above = heavily damaged

crates/marine_salience/src/packet.rs ADDED Viewed

	@@ -0,0 +1,122 @@

+//! Salience packet - the output of Marine analysis
+//!
+//! Contains jitter measurements and quality scores for a detected peak.
+#![cfg_attr(not(feature = "std"), no_std)]
+/// Per-peak result of Marine analysis
+///
+/// Bundles the jitter and quality metrics measured for one detected peak.
+/// A sequence of packets can be aggregated into prosody vectors or
+/// overall quality scores.
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[cfg_attr(feature = "std", derive(serde::Serialize, serde::Deserialize))]
+pub struct SaliencePacket {
+    /// Period jitter: timing instability between peaks
+    /// (normalized difference from the expected period, 0.0 and up;
+    /// lower = more stable/musical, higher = more chaotic)
+    pub j_p: f32,
+    /// Amplitude jitter: loudness instability
+    /// (normalized difference from the expected amplitude, 0.0 and up;
+    /// lower = consistent volume, higher = erratic dynamics)
+    pub j_a: f32,
+    /// Harmonic alignment score
+    /// (1.0 = perfectly voiced/harmonic, 0.0 = noise; currently a
+    /// simplified value that can later be refined with an FFT)
+    pub h_score: f32,
+    /// Overall salience (authenticity) score
+    /// (1.0 = perfect quality, 0.0 = heavily damaged; derived from the
+    /// inverse of the combined jitter)
+    pub s_score: f32,
+    /// Local peak energy (amplitude squared), i.e. loudness of the event
+    pub energy: f32,
+    /// Sample index at which the peak occurred, for temporal analysis
+    pub sample_index: u64,
+}
+impl SaliencePacket {
+    /// Assemble a packet from its six measurements
+    pub fn new(
+        j_p: f32,
+        j_a: f32,
+        h_score: f32,
+        s_score: f32,
+        energy: f32,
+        sample_index: u64,
+    ) -> Self {
+        SaliencePacket {
+            sample_index,
+            energy,
+            s_score,
+            h_score,
+            j_a,
+            j_p,
+        }
+    }
+    /// Combined jitter metric: the arithmetic mean of period and
+    /// amplitude jitter
+    pub fn combined_jitter(&self) -> f32 {
+        let total = self.j_p + self.j_a;
+        total / 2.0
+    }
+    /// Whether the salience score reaches `threshold`
+    /// (i.e. low jitter, high salience)
+    pub fn is_high_quality(&self, threshold: f32) -> bool {
+        threshold <= self.s_score
+    }
+    /// Whether the combined jitter exceeds `jitter_threshold`,
+    /// indicating damaged/synthetic audio
+    pub fn is_damaged(&self, jitter_threshold: f32) -> bool {
+        jitter_threshold < self.combined_jitter()
+    }
+}
+/// Event marker produced by Marine analysis
+///
+/// Wraps a regular peak packet or flags one of the special, non-peak
+/// conditions listed below.
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[cfg_attr(feature = "std", derive(serde::Serialize, serde::Deserialize))]
+pub enum SalienceMarker {
+    /// Normal peak detected; carries the packet with its jitter metrics
+    Peak(SaliencePacket),
+    /// Fracture/gap detected (silence)
+    Fracture,
+    /// High noise floor detected
+    Noise,
+    /// Insufficient data for analysis
+    Insufficient,
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Low-jitter, high-salience packet shared by the quality tests.
+    fn clean_packet() -> SaliencePacket {
+        SaliencePacket::new(0.01, 0.02, 1.0, 0.95, 0.5, 0)
+    }
+    /// High-jitter, low-salience packet shared by the quality tests.
+    fn rough_packet() -> SaliencePacket {
+        SaliencePacket::new(0.5, 0.6, 0.5, 0.3, 0.5, 0)
+    }
+    #[test]
+    fn test_combined_jitter() {
+        let packet = SaliencePacket::new(0.1, 0.3, 1.0, 0.8, 0.5, 0);
+        // Mean of 0.1 and 0.3
+        let error = (packet.combined_jitter() - 0.2).abs();
+        assert!(error < 0.001);
+    }
+    #[test]
+    fn test_is_high_quality() {
+        assert!(clean_packet().is_high_quality(0.8));
+        assert!(!rough_packet().is_high_quality(0.8));
+    }
+    #[test]
+    fn test_is_damaged() {
+        assert!(!clean_packet().is_damaged(0.3));
+        assert!(rough_packet().is_damaged(0.3));
+    }
+}

crates/marine_salience/src/processor.rs ADDED Viewed

	@@ -0,0 +1,334 @@

+//! Core Marine processor - O(1) per-sample jitter detection
+//!
+//! The heart of the Marine algorithm. Processes audio samples one at a time,
+//! detecting peaks and computing jitter metrics in constant time.
+//!
+//! "Marines are not just jarheads - they are actually very intelligent"
+#![cfg_attr(not(feature = "std"), no_std)]
+use crate::config::MarineConfig;
+use crate::ema::Ema;
+use crate::packet::{SalienceMarker, SaliencePacket};
+/// Marine salience processor
+///
+/// Processes audio samples one at a time, detecting peaks and computing
+/// jitter metrics. Designed for O(1) per-sample operation: the only state
+/// kept between calls is the fixed set of fields below.
+///
+/// # Example
+/// ```
+/// use marine_salience::{MarineConfig, MarineProcessor};
+///
+/// let config = MarineConfig::speech_default(22050);
+/// let mut processor = MarineProcessor::new(config);
+///
+/// // Process samples (e.g., from audio buffer)
+/// let samples = vec![0.0, 0.5, 1.0, 0.5, 0.0, -0.5, -1.0, -0.5];
+/// for sample in &samples {
+///     if let Some(marker) = processor.process_sample(*sample) {
+///         match marker {
+///             marine_salience::packet::SalienceMarker::Peak(packet) => {
+///                 println!("Peak detected! Salience: {:.2}", packet.s_score);
+///             }
+///             _ => {}
+///         }
+///     }
+/// }
+/// ```
+pub struct MarineProcessor {
+    /// Configuration parameters
+    cfg: MarineConfig,
+    /// Previous sample (t-2)
+    prev2: f32,
+    /// Previous sample (t-1); this is the sample tested for being a peak
+    prev1: f32,
+    /// Current sample index (number of samples consumed so far)
+    idx: u64,
+    /// Sample index of last detected peak; 0 doubles as "no peak yet"
+    last_peak_idx: u64,
+    /// Amplitude of last detected peak (recorded on every peak; not read
+    /// back anywhere in this file)
+    last_peak_amp: f32,
+    /// EMA tracker for inter-peak periods
+    ema_period: Ema,
+    /// EMA tracker for peak amplitudes
+    ema_amp: Ema,
+    /// Number of peaks detected so far, including peaks that produced no packet
+    peak_count: u64,
+}
+impl MarineProcessor {
+    /// Build a processor from `cfg`
+    ///
+    /// Both EMA trackers take their smoothing factors from the
+    /// configuration; sample history and all counters start at zero.
+    pub fn new(cfg: MarineConfig) -> Self {
+        let ema_period = Ema::new(cfg.ema_period_alpha);
+        let ema_amp = Ema::new(cfg.ema_amp_alpha);
+        Self {
+            cfg,
+            prev2: 0.0,
+            prev1: 0.0,
+            idx: 0,
+            last_peak_idx: 0,
+            last_peak_amp: 0.0,
+            ema_period,
+            ema_amp,
+            peak_count: 0,
+        }
+    }
+    /// Process a single audio sample - O(1) operation
+    ///
+    /// A sample whose magnitude is below `clip_threshold` is only recorded
+    /// in the two-sample history; no peak test runs on that call.
+    /// Otherwise the *previous* sample is tested for being a local maximum.
+    ///
+    /// # Arguments
+    /// * `sample` - Audio sample value (typically -1.0 to 1.0)
+    ///
+    /// # Returns
+    /// - `Some(Peak(packet))` - a peak was found, its period lies strictly
+    ///   between `min_period` and `max_period`, and the period EMA already
+    ///   held a value to compare against
+    /// - `None` - in every other case
+    ///
+    /// The `Fracture`, `Noise` and `Insufficient` markers are never
+    /// produced by this method.
+    pub fn process_sample(&mut self, sample: f32) -> Option<SalienceMarker> {
+        let position = self.idx;
+        self.idx += 1;
+        // Pre-gating: a sub-threshold sample skips peak evaluation entirely
+        let gated = sample.abs() < self.cfg.clip_threshold;
+        let marker = if gated {
+            None
+        } else {
+            self.evaluate_candidate(position, sample)
+        };
+        // Slide the two-sample history window
+        self.prev2 = self.prev1;
+        self.prev1 = sample;
+        marker
+    }
+    /// Test whether the previous sample is a peak and, if so, update the
+    /// peak bookkeeping and possibly emit a packet.
+    ///
+    /// `position` is the index of the current (already gated) `sample`.
+    fn evaluate_candidate(&mut self, position: u64, sample: f32) -> Option<SalienceMarker> {
+        // Local maximum in magnitude: |prev2| < |prev1| > |sample|,
+        // with the candidate itself at or above the gate threshold
+        let amp = self.prev1.abs();
+        let is_local_max = position >= 2
+            && amp >= self.cfg.clip_threshold
+            && amp > self.prev2.abs()
+            && amp > sample.abs();
+        if !is_local_max {
+            return None;
+        }
+        let peak_idx = position - 1;
+        // Time since the last peak; 0.0 while no earlier peak is recorded
+        let period = match self.last_peak_idx {
+            0 => 0.0,
+            previous => (peak_idx - previous) as f32,
+        };
+        let period_in_range =
+            period > self.cfg.min_period as f32 && period < self.cfg.max_period as f32;
+        let mut marker = None;
+        if period_in_range {
+            if self.ema_period.is_ready() {
+                let expected_period = self.ema_period.get();
+                let expected_amp = self.ema_amp.get();
+                // Jitter = relative deviation from the running expectation
+                let jp = (period - expected_period).abs() / expected_period;
+                let ja = (amp - expected_amp).abs() / expected_amp;
+                // Harmonic score (simplified - TODO: FFT-based detection)
+                // For now, assume voiced content (h = 1.0)
+                let h = 1.0;
+                // Salience: inverse of combined jitter (more jitter = lower score)
+                let s = 1.0 / (1.0 + jp + ja);
+                let energy = amp * amp;
+                marker = Some(SalienceMarker::Peak(SaliencePacket::new(
+                    jp, ja, h, s, energy, peak_idx,
+                )));
+            }
+            // The measurement feeds the EMAs whether or not a packet was emitted
+            self.ema_period.update(period);
+            self.ema_amp.update(amp);
+        }
+        // Every peak is recorded, even when its period was out of range
+        self.last_peak_idx = peak_idx;
+        self.last_peak_amp = amp;
+        self.peak_count += 1;
+        marker
+    }
+    /// Process a buffer of samples, collecting all salience packets
+    ///
+    /// Feeds every sample through `process_sample` in order and keeps the
+    /// packets of the `Peak` markers.
+    ///
+    /// # Arguments
+    /// * `samples` - Buffer of audio samples
+    ///
+    /// # Returns
+    /// Vector of salience packets for all detected peaks
+    #[cfg(feature = "std")]
+    pub fn process_buffer(&mut self, samples: &[f32]) -> Vec<SaliencePacket> {
+        samples
+            .iter()
+            .filter_map(|&sample| match self.process_sample(sample) {
+                Some(SalienceMarker::Peak(packet)) => Some(packet),
+                _ => None,
+            })
+            .collect()
+    }
+    /// Reset processor state (start fresh); the configuration is kept
+    pub fn reset(&mut self) {
+        self.ema_period.reset();
+        self.ema_amp.reset();
+        self.peak_count = 0;
+        self.last_peak_amp = 0.0;
+        self.last_peak_idx = 0;
+        self.idx = 0;
+        self.prev1 = 0.0;
+        self.prev2 = 0.0;
+    }
+    /// Number of peaks detected so far
+    pub fn peak_count(&self) -> u64 {
+        self.peak_count
+    }
+    /// Current sample index (how many samples have been processed)
+    pub fn current_index(&self) -> u64 {
+        self.idx
+    }
+    /// Whether at least three peaks have been seen and the period EMA
+    /// holds a value
+    pub fn is_warmed_up(&self) -> bool {
+        self.ema_period.is_ready() && self.peak_count >= 3
+    }
+    /// Current expected period (from EMA), or `None` before the first
+    /// in-range period has been recorded
+    pub fn expected_period(&self) -> Option<f32> {
+        self.ema_period.is_ready().then(|| self.ema_period.get())
+    }
+    /// Current expected amplitude (from EMA), or `None` before the first
+    /// amplitude has been recorded
+    pub fn expected_amplitude(&self) -> Option<f32> {
+        self.ema_amp.is_ready().then(|| self.ema_amp.get())
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_peak_detection() {
+        let mut processor = MarineProcessor::new(MarineConfig::speech_default(22050));
+        // Triangular pulses centred on samples 10, 20, ..., 90.
+        // The centres never touch the buffer edges, so both neighbours exist.
+        let mut samples = vec![0.0; 100];
+        for center in (10..100).step_by(10) {
+            samples[center - 1] = 0.3; // Rising edge
+            samples[center] = 0.5; // Peak
+            samples[center + 1] = 0.3; // Falling edge
+        }
+        let peak_count = samples
+            .iter()
+            .filter(|&&sample| {
+                matches!(
+                    processor.process_sample(sample),
+                    Some(SalienceMarker::Peak(_))
+                )
+            })
+            .count();
+        // Should detect several peaks (not all due to period constraints)
+        assert!(peak_count > 0);
+    }
+    #[test]
+    fn test_jitter_calculation() {
+        let mut config = MarineConfig::speech_default(22050);
+        config.min_period = 5;
+        config.max_period = 20;
+        let mut processor = MarineProcessor::new(config);
+        // Ten identical 10-sample cycles, i.e. a constant period of 10
+        let mut detected_packets = Vec::new();
+        for _cycle in 0..10 {
+            for phase in 0..10 {
+                let sample = match phase {
+                    5 => 0.8,     // Peak in middle
+                    4 | 6 => 0.5, // Edges
+                    _ => 0.01,    // Low-level filler between pulses
+                };
+                if let Some(SalienceMarker::Peak(packet)) = processor.process_sample(sample) {
+                    detected_packets.push(packet);
+                }
+            }
+        }
+        // With consistent periods, later packets should have low jitter.
+        // The check only runs once more than three packets were produced.
+        if detected_packets.len() > 3 {
+            let last = &detected_packets[detected_packets.len() - 1];
+            assert!(last.j_p < 0.5, "Period jitter too high: {}", last.j_p);
+        }
+    }
+    #[test]
+    fn test_reset() {
+        let mut processor = MarineProcessor::new(MarineConfig::speech_default(22050));
+        // Advance the sample counter with a constant signal
+        (0..100).for_each(|_| {
+            processor.process_sample(0.5);
+        });
+        assert!(processor.current_index() > 0);
+        // After a reset everything is back to the initial state
+        processor.reset();
+        assert_eq!(processor.current_index(), 0);
+        assert_eq!(processor.peak_count(), 0);
+        assert!(!processor.is_warmed_up());
+    }
+    #[cfg(feature = "std")]
+    #[test]
+    fn test_process_buffer() {
+        let mut config = MarineConfig::speech_default(22050);
+        config.min_period = 5;
+        config.max_period = 50;
+        let mut processor = MarineProcessor::new(config);
+        // Twenty back-to-back copies of one 5-sample pulse
+        let samples = [0.01, 0.3, 0.8, 0.3, 0.01].repeat(20);
+        let packets = processor.process_buffer(&samples);
+        // Should detect multiple peaks
+        assert!(!packets.is_empty());
+    }
+}

docs/Integrating Marine Algorithm into IndexTTS-Rust.md ADDED Viewed

	@@ -0,0 +1,450 @@

+# **A Technical Report on the Integration of the Marine Salience Algorithm into the IndexTTS2-Rust Architecture**
+## **Executive Summary**
+This report details a comprehensive technical framework for the integration of the novel Marine Algorithm 1 into the existing IndexTTS-Rust project. The IndexTTS-Rust system is understood to be a Rust implementation of the IndexTTS2 architecture, a cascaded autoregressive (AR) Text-to-Speech (TTS) model detailed in the aaai2026.tex paper.1
+The primary objective of this integration is to leverage the unique, time-domain salience detection capabilities of the Marine Algorithm (e.g., jitter analysis) 1 to significantly improve the quality, controllability, and emotional expressiveness of the synthesized speech.
+The core of this strategy involves **replacing the Conformer-based emotion perceiver of the IndexTTS2 Text-to-Semantic (T2S) module** 1 with a new, lightweight, and prosodically-aware Rust module based on the Marine Algorithm. This report provides a full analysis of the architectural foundations, a detailed integration strategy, a complete Rust-level implementation guide, and an analysis of the training and inferential implications of this modification.
+## **Part 1: Architectural Foundations: The IndexTTS2 Pipeline and the Marine Salience Primitive**
+A successful integration requires a deep, functional understanding of the two systems being merged. This section deconstructs the IndexTTS2 architecture as the "host" system 1 and re-frames the Marine Algorithm 1 as the "implant" feature extractor.
+### **1.1 Deconstruction of the IndexTTS2 Generative Pipeline**
+The aaai2026.tex paper describes IndexTTS2 as a state-of-the-art, cascaded zero-shot TTS system.1 Its architecture is composed of three distinct, sequentially-trained modules:
+1. **Text-to-Semantic (T2S) Module:** This is an autoregressive (AR) Transformer-based model. Its primary function is to convert a sequence of text inputs into a sequence of "semantic tokens." This module is the system's "brain," determining the content, rhythm, and prosody of the speech.
+2. **Semantic-to-Mel (S2M) Module:** This is a non-autoregressive (NAR) model. It takes the discrete semantic tokens from the T2S module and converts them into a dense mel-spectrogram. This module functions as the system's "vocal tract," rendering the semantic instructions into a spectral representation. The paper notes this module "incorporate\[s\] GPT latent representations to significantly improve the stability of the generated speech".1
+3. **Vocoder Module:** This is a pre-trained BigVGANv2 vocoder.1 Its sole function is to perform the final conversion from the mel-spectrogram (from S2M) into a raw audio waveform.
+The critical component for this integration is the **T2S Conditioning Mechanism**. The IndexTTS2 T2S module's behavior is conditioned on two separate audio prompts, a design intended to achieve disentangled control 1:
+* **Timbre Prompt:** This audio prompt is processed by a "speaker perceiver conditioner" to generate a speaker attribute vector, c. This vector defines *who* is speaking (i.e., the vocal identity).
+* **Style Prompt:** This *separate* audio prompt is processed by a "Conformer-based emotion perceiver conditioner" to generate an emotion vector, e. This vector defines *how* they are speaking (i.e., the emotion, prosody, and rhythm).
+The T2S Transformer then consumes these vectors, additively combined, as part of its input: \[c \+ e, p,..., E\_text,..., E\_sem\].1
+A key architectural detail is the IndexTTS2 paper's explicit use of a **Gradient Reversal Layer (GRL)** "to eliminate emotion-irrelevant information" and achieve "speaker-emotion disentanglement".1 The presence of a GRL, an adversarial training technique, strongly implies that the "Conformer-based emotion perceiver" is *not* naturally adept at this separation. A general-purpose Conformer, when processing the style prompt, will inevitably encode both prosodic features (pitch, energy) and speaker-specific features (formants, timbre). The GRL is thus employed as an adversarial "patch" to force the e vector to be "ignorant" of the speaker. This reveals a complex, computationally-heavy, and potentially fragile point in the IndexTTS2 design—a weakness that the Marine Algorithm is perfectly suited to address.
+### **1.2 The Marine Algorithm as a Superior Prosodic Feature Extractor**
+The marine-Universal-Salience-algoritm.tex paper 1 introduces the Marine Algorithm as a "universal, modality-agnostic salience detector" that operates in the time domain with O(1) per-sample complexity. While its described applications are broad, its specific mechanics make it an ideal, purpose-built *prosody quantifier* for speech.
+The algorithm's 5-step process (Pre-gating, Peak Detection, Jitter Computation, Harmonic Alignment, Salience Score) 1 is, in effect, a direct measurement of the suprasegmental features that define prosody:
+* **Period Jitter ($J_p$):** Defined as $J_p = |T_i - \text{EMA}(T)|$, this metric quantifies the instability of the time between successive peaks (the fundamental period).1 In speech, this is a direct, time-domain correlate for *pitch instability*. High, structured $J_p$ (i.e., high jitter with a stable EMA) represents intentional prosodic features like vibrato, vocal fry, or creaky voice—all key carriers of emotion.
+* **Amplitude Jitter ($J_a$):** Defined as $J_a = |A_i - \text{EMA}(A)|$, this metric quantifies the instability of peak amplitudes.1 In speech, this is a correlate for *amplitude shimmer* or "vocal roughness," which are strong cues for affective states such as arousal, stress, or anger.
+* **Harmonic Alignment ($H$):** This check for integer-multiple relationships in peak spacing 1 directly measures the *purity* and *periodicity* of the tone. It quantifies the distinction between a clear, voiced, harmonic sound and a noisy, chaotic, or unvoiced signal (e.g., breathiness, whispering, or a scream).
+* **Energy ($E$) and Peak Detection:** The algorithm's pre-gating ($\theta_c$) and peak detection steps inherently track the signal's energy and the *density* of glottal pulses, which correlate directly to loudness and fundamental frequency (pitch), respectively.
+The algorithm's description as "biologically plausible" and analogous to cochlear/amygdalar filtering 1 is not merely conceptual. It signifies that the algorithm is *a priori* biased to extract the same low-level features that the human auditory system uses to perceive emotion and prosody. This makes it a far more "correct" feature extractor for this task than a generic, large-scale Conformer, which learns from statistical correlation rather than first principles. Furthermore, its O(1) complexity 1 makes it orders of magnitude more efficient than the Transformer-based Conformer it will replace.
+## **Part 2: Integration Strategy: Replacing the T2S Emotion Perceiver**
+The integration path is now clear. The IndexTTS2 T2S module 1 requires a clean, disentangled prosody vector e. The original Conformer-based conditioner provides a "polluted" vector that must be "cleaned" by a GRL.1 The Marine Algorithm 1 is, by its very design, a *naturally disentangled* prosody extractor.
+### **2.1 Formal Proposal: The MarineProsodyConditioner**
+The formal integration strategy is as follows:
+1. The "Conformer-based emotion perceiver conditioner" 1 is **removed** from the IndexTTS2 architecture.
+2. A new, from-scratch Rust module, tentatively named the MarineProsodyConditioner, is **created**.
+3. This new module's sole function is to accept the file path to the style\_prompt audio, load its samples, and process them using a Rust implementation of the Marine Algorithm.1
+4. It will aggregate the resulting time-series of salience data into a single, fixed-size feature vector, e', which will serve as the new "emotion vector."
+### **2.2 Feature Vector Engineering: Defining the New e'**
+The Marine Algorithm produces a *stream* of SaliencePackets, one for each detected peak.1 The T2S Transformer, however, requires a *single, fixed-size* conditioning vector.1 We must therefore define an aggregation strategy to distill this time-series into a descriptive statistical summary.
+The proposed feature vector, the MarineProsodyVector (our new e'), will be an 8-dimensional vector composed of the mean and standard deviation of the algorithm's key outputs over the entire duration of the style prompt.
+**Table 1: MarineProsodyVector Struct Definition**
+This table defines the precise "interface" between the marine\_salience crate and the indextts\_rust crate.
+| Field | Type | Description | Source |
+| :---- | :---- | :---- | :---- |
+| jp\_mean | f32 | Mean Period Jitter ($J\_p$). Correlates to average pitch instability. | 1 |
+| jp\_std | f32 | Std. Dev. of $J\_p$. Correlates to *variance* in pitch instability. | 1 |
+| ja\_mean | f32 | Mean Amplitude Jitter ($J\_a$). Correlates to average vocal roughness. | 1 |
+| ja\_std | f32 | Std. Dev. of $J\_a$. Correlates to *variance* in vocal roughness. | 1 |
+| h\_mean | f32 | Mean Harmonic Alignment ($H$). Correlates to average tonal purity. | 1 |
+| s\_mean | f32 | Mean Salience Score ($S$). Correlates to overall signal "structuredness". | 1 |
+| peak\_density | f32 | Number of detected peaks per second. Correlates to fundamental frequency (F0/pitch). | 1 |
+| energy\_mean | f32 | Mean energy ($E$) of detected peaks. Correlates to loudness/amplitude. | 1 |
+This small, 8-dimensional vector is dense, interpretable, and packed with prosodic information, in stark contrast to the opaque, high-dimensional, and entangled vector produced by the original Conformer.1
+### **2.3 Theoretical Justification: The Synergistic Disentanglement**
+This integration provides a profound architectural improvement by solving the speaker-style disentanglement problem more elegantly and efficiently than the original IndexTTS2 design.1
+The central challenge in the original architecture is that the Conformer-based conditioner processes the *entire* signal, capturing both temporal features (pitch, which is prosody) and spectral features (formants, which define speaker identity). This "entanglement" necessitates the use of the adversarial GRL to "un-learn" the speaker information.1
+The Marine Algorithm 1 fundamentally sidesteps this problem. Its design is based on **peak detection, spacing, and amplitude**.1 It is almost entirely *blind* to the complex spectral-envelope (formant) information that defines a speaker's unique timbre. It measures the *instability* of the fundamental frequency, not the F0 itself, and the *instability* of the amplitude, not the spectral shape.
+Therefore, the MarineProsodyVector (e') is **naturally disentangled**. It is a *pure* representation of prosody, containing negligible speaker-identity information.
+When this new e' vector is fed into the T2S model's input, \[c \+ e',...\], the system receives two *orthogonal* conditioning vectors:
+1. c (from the speaker perceiver 1): Contains the speaker's timbre (formants, etc.).
+2. e' (from the MarineProsodyConditioner 1): Contains the speaker's prosody (jitter, rhythm, etc.).
+This clean separation provides two major benefits:
+1. **Superior Timbre Cloning:** The speaker vector c no longer has to "compete" with an "entangled" style vector e. The T2S model will receive a cleaner speaker signal, leading to more accurate zero-shot voice cloning.
+2. **Superior Emotional Expression:** The style vector e' is a clean, simple, and interpretable signal. The T2S Transformer will be able to learn the mapping from (e.g.) jp\_mean \= 0.8 to "generate creaky semantic tokens" much more easily than from an opaque 512-dimensional Conformer embedding.
+This change simplifies the T2S model's learning task, which should lead to faster convergence and higher final quality. The GRL 1 may become entirely unnecessary, further simplifying the training regime and stabilizing the model.
+## **Part 3: Implementation Guide: An IndexTTS-Rust Integration**
+This section provides a concrete, code-level guide for implementing the proposed integration.
+### **3.1 Addressing the README.md Data Gap**
+A critical limitation in preparing this analysis is the repeated failure to access the user-provided IndexTTS-Rust README.md file.2 This file contains the project's specific file structure, API definitions, and module layout.
+To overcome this, this report will posit a **hypothetical yet idiomatic Rust project structure** based on the logical components described in the IndexTTS2 paper.1 All subsequent code examples will adhere to this structure. The project owner is expected to map these file paths and function names to their actual, private codebase.
+### **3.2 Table 2: Hypothetical IndexTTS-Rust Project Structure**
+The following workspace structure is assumed for all implementation examples.
+```text
+indextts_rust_workspace/
+├── Cargo.toml                (Workspace root)
+│
+├── indextts_rust/            (The main application/library crate)
+│   ├── Cargo.toml
+│   └── src/
+│       ├── main.rs           (Binary entry point)
+│       ├── lib.rs            (Library entry point & API)
+│       ├── error.rs          (Project-wide error types)
+│       ├── audio.rs          (Audio I/O: e.g., fn load_wav_samples)
+│       ├── vocoder.rs        (Wrapper for BigVGANv2 model)
+│       ├── t2s/
+│       │   ├── mod.rs        (T2S module definition)
+│       │   ├── model.rs      (AR Transformer implementation)
+│       │   └── conditioner.rs(Handles 'c' and 'e' vector generation)
+│       └── s2m/
+│           ├── mod.rs        (S2M module definition)
+│           └── model.rs      (NAR model implementation)
+│
+└── marine_salience/          (The NEW crate for the Marine Algorithm)
+    ├── Cargo.toml
+    └── src/
+        ├── lib.rs            (Public API: MarineProcessor, etc.)
+        ├── config.rs         (MarineConfig struct)
+        ├── processor.rs      (MarineProcessor struct and logic)
+        ├── ema.rs            (EmaTracker helper struct)
+        └── packet.rs         (SaliencePacket struct)
+```
+### **3.3 Crate Development: marine\_salience**
+A new, standalone Rust crate, marine\_salience, should be created. This crate will encapsulate all logic for the Marine Algorithm 1, ensuring it is modular, testable, and reusable.
+**Table 3: marine\_salience Crate \- Public API Definition**
+| Struct / fn | Field / Signature | Type | Description |
+| :---- | :---- | :---- | :---- |
+| MarineConfig | clip\_threshold | f32 | $\\theta\_c$, pre-gating sensitivity.1 |
+|  | ema\_period\_alpha | f32 | Smoothing factor for Period EMA. |
+|  | ema\_amplitude\_alpha | f32 | Smoothing factor for Amplitude EMA. |
+| SaliencePacket | j\_p | f32 | Period Jitter ($J\_p$).1 |
+|  | j\_a | f32 | Amplitude Jitter ($J\_a$).1 |
+|  | h\_score | f32 | Harmonic Alignment score ($H$).1 |
+|  | s\_score | f32 | Final Salience Score ($S$).1 |
+|  | energy | f32 | Peak energy ($E$).1 |
+| MarineProcessor | new(config: MarineConfig) | Self | Constructor. |
+|  | process\_sample(\&mut self, sample: f32, sample\_idx: u64) | Option\<SaliencePacket\> | The O(1) processing function. |
+**marine\_salience/src/processor.rs (Implementation Sketch):**
+The MarineProcessor struct will hold the state, including EmaTracker instances for period and amplitude, the last\_peak\_sample index, last\_peak\_amplitude, and the current\_direction of the signal (e.g., \+1 for rising, \-1 for falling).
+The process\_sample function is the O(1) core, implementing the algorithm from 1:
+1. **Pre-gating:** Check if sample.abs() \> config.clip\_threshold.
+2. **Peak Detection:** Track the signal's direction. A change from \+1 (rising) to \-1 (falling) signifies a peak at sample\_idx \- 1, as per the formula x(n-1) \< x(n) \> x(n+1).1
+3. **Jitter Computation:** If a peak is detected at n:
+   * Calculate current period $T\_i \= (n \- self.last\_peak\_sample)$.
+   * Calculate current amplitude $A\_i \= sample\_at(n)$.
+   * Calculate $J\_p \= |T\_i \- self.ema\_period.value()|$.1
+   * Calculate $J\_a \= |A\_i \- self.ema\_amplitude.value()|$.1
+   * Update the EMAs: self.ema\_period.update(T\_i), self.ema\_amplitude.update(A\_i).
+4. **Harmonic Alignment:** Perform the check for $H$.1
+5. **Salience Score:** Compute $S \= w\_e E \+ w\_j(1/J) \+ w\_h H$.1
+6. Update self.last\_peak\_sample \= n, self.last\_peak\_amplitude \= A\_i.
+7. Return Some(SaliencePacket {... }).
+8. If no peak is detected, return None.
+### **3.4 Modifying the indextts\_rust Crate**
+With the marine\_salience crate complete, the indextts\_rust crate can now be modified.
+indextts\_rust/Cargo.toml:
+Add the new crate as a dependency:
+Ini, TOML
+\[dependencies\]
+marine\_salience \= { path \= "../marine\_salience" }
+\#... other dependencies (tch, burn, ndarray, etc.)
+indextts\_rust/src/t2s/conditioner.rs:
+This is the central modification. The file responsible for generating the e vector is completely refactored.
+Rust
+// BEFORE: Original Conformer-based
+//
+// use tch::Tensor;
+// use crate::audio::AudioData;
+//
+// // This struct holds the large, complex Conformer model
+// pub struct ConformerEmotionPerceiver {
+//     //... model weights...
+// }
+//
+// impl ConformerEmotionPerceiver {
+//     pub fn get\_style\_embedding(\&self, audio: \&AudioData) \-\> Result\<Tensor, ModelError\> {
+//         // 1\. Convert AudioData to mel-spectrogram tensor
+//         // 2\. Pass spectrogram through Conformer layers
+//         // 3\. (GRL logic is applied during training)
+//         // 4\. Return an opaque, high-dimensional 'e' vector
+//         //    (e.g., a 512-dimensional tensor)
+//     }
+// }
+// AFTER: New MarineProsodyConditioner
+//
+use marine\_salience::processor::{MarineProcessor, SaliencePacket};
+use marine\_salience::config::MarineConfig;
+use crate::audio::load\_wav\_samples; // From hypothetical audio.rs
+use std::path::Path;
+use anyhow::Result;
+// This is the struct defined in Table 1
+\#\[derive(Debug, Clone)\]
+pub struct MarineProsodyVector {
+    pub jp\_mean: f32,
+    pub jp\_std: f32,
+    pub ja\_mean: f32,
+    pub ja\_std: f32,
+    pub h\_mean: f32,
+    pub s\_mean: f32,
+    pub peak\_density: f32,
+    pub energy\_mean: f32,
+}
+// This new struct and function replace the Conformer
+pub struct MarineProsodyConditioner {
+    config: MarineConfig,
+}
+impl MarineProsodyConditioner {
+    pub fn new(config: MarineConfig) \-\> Self {
+        Self { config }
+    }
+    pub fn get\_marine\_style\_vector(&self, style\_prompt\_path: \&Path, sample\_rate: f32) \-\> Result\<MarineProsodyVector\> {
+        // 1\. Load audio samples
+        // Assumes audio.rs provides this function
+        let samples \= load\_wav\_samples(style\_prompt\_path)?;
+        let duration\_sec \= samples.len() as f32 / sample\_rate;
+        // 2\. Instantiate and run the MarineProcessor
+        let mut processor \= MarineProcessor::new(self.config.clone());
+        let mut packets \= Vec::\<SaliencePacket\>::new();
+        for (i, sample) in samples.iter().enumerate() {
+            if let Some(packet) \= processor.process\_sample(\*sample, i as u64) {
+                packets.push(packet);
+            }
+        }
+        if packets.is\_empty() {
+            return Err(anyhow::anyhow\!("No peaks detected in style prompt."));
+        }
+        // 3\. Aggregate packets into the final feature vector
+        let num\_packets \= packets.len() as f32;
+        let mut jp\_mean \= 0.0;
+        let mut ja\_mean \= 0.0;
+        let mut h\_mean \= 0.0;
+        let mut s\_mean \= 0.0;
+        let mut energy\_mean \= 0.0;
+        for p in \&packets {
+            jp\_mean \+= p.j\_p;
+            ja\_mean \+= p.j\_a;
+            h\_mean \+= p.h\_score;
+            s\_mean \+= p.s\_score;
+            energy\_mean \+= p.energy;
+        }
+        jp\_mean /= num\_packets;
+        ja\_mean /= num\_packets;
+        h\_mean /= num\_packets;
+        s\_mean /= num\_packets;
+        energy\_mean /= num\_packets;
+        // Calculate the population standard deviation (square root of the variance)
+        let mut jp\_std \= 0.0;
+        let mut ja\_std \= 0.0;
+        for p in \&packets {
+            jp\_std \+= (p.j\_p \- jp\_mean).powi(2);
+            ja\_std \+= (p.j\_a \- ja\_mean).powi(2);
+        }
+        jp\_std \= (jp\_std / num\_packets).sqrt();
+        ja\_std \= (ja\_std / num\_packets).sqrt();
+        let peak\_density \= num\_packets / duration\_sec;
+        Ok(MarineProsodyVector {
+            jp\_mean,
+            jp\_std,
+            ja\_mean,
+            ja\_std,
+            h\_mean,
+            s\_mean,
+            peak\_density,
+            energy\_mean,
+        })
+    }
+}
+### **3.5 Updating the T2S Model (indextts\_rust/src/t2s/model.rs)**
+This change is **breaking** and **mandatory**. The IndexTTS2 T2S model 1 was trained on a high-dimensional e vector (e.g., 512-dim). Our new e' vector is 8-dimensional. The T2S model's architecture must be modified to accept this.
+The change will be in the T2S Transformer's input embedding layer, which projects the conditioning vectors into the model's main hidden dimension (e.g., 1024-dim).
+**(Example using tch-rs or burn pseudo-code):**
+Rust
+// In src/t2s/model.rs
+//
+// pub struct T2S\_Transformer {
+//   ...
+//    speaker\_projector: nn::Linear,
+//    style\_projector: nn::Linear, // The layer to change
+//   ...
+// }
+//
+// impl T2S\_Transformer {
+//    pub fn new(config: \&T2S\_Config, vs: \&nn::Path) \-\> Self {
+//      ...
+//       // BEFORE:
+//       // let style\_projector \= nn::linear(
+//       //     vs / "style\_projector",
+//       //     512, // Original Conformer 'e' dimension
+//       //     config.hidden\_dim,
+//       //     Default::default()
+//       // );
+//
+//       // AFTER:
+//       let style\_projector \= nn::linear(
+//           vs / "style\_projector",
+//           8,   // New MarineProsodyVector 'e'' dimension
+//           config.hidden\_dim,
+//           Default::default()
+//       );
+//      ...
+//    }
+// }
+This change creates a new, untrained model. The S2M and Vocoder modules 1 can remain unchanged, but the T2S module must now be retrained.
+## **Part 4: Training, Inference, and Qualitative Implications**
+This architectural change has profound, positive implications for the entire system, from training to user-facing control.
+### **4.1 Retraining the T2S Module**
+The modification in Part 3.5 is a hard-fork of the model architecture; retraining the T2S module 1 is not optional.
+**Training Plan:**
+1. **Model:** The S2M and Vocoder modules 1 can be completely frozen. Only the T2S module with the new 8-dimensional style\_projector (from 3.5) needs to be trained.
+2. **Dataset Preprocessing:** The *entire* training dataset used for the original IndexTTS2 1 must be re-processed.
+   * For *every* audio file in the dataset, the MarineProsodyConditioner::get\_marine\_style\_vector function (from 3.4) must be run *once*.
+   * The resulting 8-dimensional MarineProsodyVector must be saved as the new "ground truth" style label for that utterance.
+3. **Training:** The T2S module is now trained as described in the aaai2026.tex paper.1 During the training step, it will load the pre-computed MarineProsodyVector as the e' vector, which will be added to the c (speaker) vector and fed into the Transformer.
+4. **Hypothesis:** This training run is expected to converge *faster* and to a *higher* qualitative ceiling. The model is no longer burdened by the complex, adversarial GRL-based disentanglement.1 It is instead learning a much simpler, more direct correlation between a clean prosody vector (e') and the target semantic token sequences.
+### **4.2 Inference-Time Control**
+This integration unlocks a new, powerful mode of "synthetic" or "direct" prosody control, fulfilling the proposals implicit in the user's query.
+* **Mode 1: Reference-Based (Standard):**
+  * A user provides a style\_prompt.wav.
+  * The get\_marine\_style\_vector function (from 3.4) is called.
+  * The resulting MarineProsodyVector e' is fed into the T2S model.
+  * This "copies" the prosody from the reference audio, just as the original IndexTTS2 1 intended, but with higher fidelity.
+* **Mode 2: Synthetic-Control (New):**
+  * The user provides *no* style prompt.
+  * Instead, the user *directly constructs* the 8-dimensional MarineProsodyVector to achieve a desired effect. The application's UI could expose 8 sliders for these values.
+  * **Example 1: "Agitated / Rough Voice"**
+    * e' \= MarineProsodyVector { jp\_mean: 0.8, jp\_std: 0.5, ja\_mean: 0.7, ja\_std: 0.4,... }
+  * **Example 2: "Stable / Monotone Voice"**
+    * e' \= MarineProsodyVector { jp\_mean: 0.05, jp\_std: 0.01, ja\_mean: 0.05, ja\_std: 0.01,... }
+  * **Example 3: "High-Pitch / High-Energy Voice"**
+    * e' \= MarineProsodyVector { peak\_density: 300.0, energy\_mean: 0.9,... }
+This provides a small, interpretable, and powerful "control panel" for prosody, a significant breakthrough in controllable TTS that was not possible with the original opaque Conformer embedding.1
+### **4.3 Bridging to Downstream Fidelity (S2M)**
+The benefits of this integration propagate through the entire cascade. The S2M module's quality is directly dependent on the quality of the semantic tokens it receives from T2S.1
+The aaai2026.tex paper 1 states the S2M module uses "GPT latent representations to significantly improve the stability of the generated speech." This suggests the S2M is a powerful and stable *renderer*. However, a renderer is only as good as the instructions it receives.
+In the original system, the S2M module likely received semantic tokens with "muddled" or "averaged-out" prosody, resulting from the T2S model's struggle with the entangled e vector. The S2M's "stability" 1 may have come at the *cost* of expressiveness, as it learned to smooth over inconsistent prosodic instructions.
+With the new MarineProsodyConditioner, the T2S model will now produce semantic tokens that are *far more richly, explicitly, and accurately* encoded with prosodic intent. The S2M module's "GPT latents" 1 will receive a higher-fidelity, more consistent input signal. This creates a synergistic effect: the S2M's stable rendering capabilities 1 will now be applied to a *more expressive* set of instructions. The result is an end-to-end system that is *both* stable *and* highly expressive.
+## **Part 5: Report Conclusions and Future Trajectories**
+### **5.1 Summary of Improvements**
+The integration framework detailed in this report achieves the project's goals by:
+1. **Replacing** a computationally heavy, black-box Conformer 1 with a lightweight, O(1), biologically-plausible, and Rust-native MarineProcessor.1
+2. **Solving** a core architectural-art problem in the IndexTTS2 design by providing a *naturally disentangled*, speaker-invariant prosody vector, which simplifies or obviates the need for the adversarial GRL.1
+3. **Unlocking** a powerful "synthetic control" mode, allowing users to *directly* manipulate prosody at inference time via an 8-dimensional, interpretable control vector.
+4. **Improving** end-to-end system quality by providing a cleaner, more explicit prosodic signal to the T2S module 1, which in turn provides a higher-fidelity semantic token stream to the S2M module.1
+### **5.2 Future Trajectories**
+This new architecture opens two significant avenues for future research.
+1\. True Streaming Synthesis with Dynamic Conditioning
+The IndexTTS2 T2S module is autoregressive 1, and the Marine Algorithm is O(1) per-sample.1 This is a perfect combination for real-time applications.
+A future version could implement a "Dynamic Conditioning" mode. In this mode, a MarineProcessor runs on a live microphone input (e.g., from the user) in a parallel thread. It continuously calculates the MarineProsodyVector over a short, sliding window (e.g., 500ms). This e' vector is then *hot-swapped* into the T2S model's conditioning state *during* the autoregressive generation loop. The result would be a TTS model that mirrors the user's emotional prosody in real-time.
+2\. Active Quality Monitoring (Vocoder Feedback Loop)
+The Marine Algorithm is a "universal... salience detector" that distinguishes "structured signals from noise".1 This capability can be used as a quality metric for the vocoder's output.
+An advanced implementation could create a feedback loop:
+1. The BigVGANv2 vocoder 1 produces its output audio.
+2. This audio is *immediately* fed *back* into a MarineProcessor.
+3. The processor analyzes the output. The key insight from the Marine paper 1 is the use of the **Exponential Moving Average (EMA)**.
+   * **Desired Prosody (e.g., vocal fry):** Will produce high $J\_p$/$J\_a$, but the $\\text{EMA}(T)$ and $\\text{EMA}(A)$ will remain *stable*. The algorithm will correctly identify this as a *structured* signal.
+   * **Undesired Artifact (e.g., vocoder hiss, phase noise):** Will produce high $J\_p$/$J\_a$, but the $\\text{EMA}(T)$ and $\\text{EMA}(A)$ will become *unstable*. The algorithm will correctly identify this as *unstructured noise*.
+This creates a quantitative, real-time metric for "output fidelity" that can distinguish desirable prosody from undesirable artifacts. This metric could be used to automatically flag or discard bad generations, or even as a reward function for a Reinforcement Learning (RL) agent tasked with fine-tuning the S2M or Vocoder modules.
+#### **Works cited**
+1. marine-Universal-Salience-algoritm.tex
+2. accessed December 31, 1969, uploaded:IndexTTS-Rust README.md

examples/analyze_chris.rs ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00940abda6dd597d7dacdbb97761fb0635d0dcc7dc30d5391fe159129008b03a
+size 8470

examples/cases.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:533a57ea51d412841ab6665c7be3032bb6f5996035dfad66460380c9e72f293f
+size 2271

examples/emo_hate.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89e6e7eee1a28303776e9cf43971e9505529bd0e669f5fcf47f4d1370f9187c4
+size 145368

examples/emo_sad.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7d3e5bf2b7bca6458f9e6d7a5ce073c41eb4418895e7df2f994e5a0c96c064a
+size 842016

examples/marine_test.rs ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d179d8f3adc5338e94ee2b92f366a36d03c32b51767223d1eefeb42ce9165374
+size 10845

examples/voice_01.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e33e6ee0107a1dd58e1d66dd90c13df3d55a8683047cc3d7ea206dad84ed3fc8
+size 478050

examples/voice_02.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fe2dd1dbd54ef85a073fbc4c8fc0198f8d4523cc3320a600de0e347a3d8b491
+size 574074

examples/voice_03.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50e8b632efd794418919e2d33c8c2aab9189a57f4d21ef55020413be9f2b292a
+size 616814

examples/voice_04.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a3d2536245f45fd5e1eef046dd768ae7b72a0dba3ec3f370f145862fe64b3b2
+size 681084

examples/voice_05.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eefb7f4a29a8b36f08d5cc1014ea947dbe9f7bef348f07c40263058e604a98eb
+size 1482796

examples/voice_06.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d85800fe261d106c3274fa792cbb952458c4b0b2e1b908340a8cd0d63c73a30
+size 299052

examples/voice_07.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcb10f84e63c3fdbfe99ac4184ca403b46a6d20b50540732713d48c4c95375ce
+size 591894

examples/voice_08.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e2c5f4859999b1ada95ee801d50c3c72879147269a4ed99e385fd917dae5c6f
+size 426812

examples/voice_09.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8702467b9b3c83a16bead578e131c4388b3ef82aeff861bd336e622a9ae8a511
+size 1798188

examples/voice_10.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39c2db8b395e4c6ea1122ec7463b5f7bd7dd7d7302f3255780e4c529a9ae9985
+size 1942242

examples/voice_11.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82730e38498413d4371a76e841cd91fa2f74843b79ad3b606d45ad8a7b7a736c
+size 1520734

examples/voice_12.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d67bd4f51773677d5902409813b9bb4c1d59b8243c74fc104553b80b49edd22b
+size 778626

models/bigvgan.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31609a2a49ab4e00d14924eb036f2852c88198ad250de228ae972601e67e032f
+size 2269152

models/bigvgan.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5f6c1fa12c0bde8d17832fd47de1fdbe5cf085e186d30751f53ff3ad016952a
+size 451411968

models/speaker_encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8bc6e37803c99ebcf24cb5e1631bc1a1da00b4acc9ec6ec4c105a3e1f1f5388
+size 2334876

models/speaker_encoder.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d21f2c5de55f48af7319230818262da91442e7f3dcd29d828215e8ee9e1d7e3
+size 27656192

src/audio/dsp.rs ADDED Viewed

	@@ -0,0 +1,210 @@

+//! Digital Signal Processing utilities
+/// Apply a first-order pre-emphasis filter to an audio signal.
+///
+/// Computes `y[n] = x[n] - coef * x[n-1]`; the first sample is copied through
+/// unchanged because it has no predecessor.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `coef` - Pre-emphasis coefficient (typically 0.97)
+///
+/// # Returns
+/// A new vector with the same length as `signal` (empty for empty input).
+pub fn apply_preemphasis(signal: &[f32], coef: f32) -> Vec<f32> {
+    let first = match signal.first() {
+        Some(&sample) => sample,
+        None => return Vec::new(),
+    };
+    // Each adjacent pair (x[n-1], x[n]) yields one filtered output sample.
+    std::iter::once(first)
+        .chain(signal.windows(2).map(|pair| pair[1] - coef * pair[0]))
+        .collect()
+}
+/// Apply a de-emphasis filter (the inverse of pre-emphasis).
+///
+/// Computes the recursive filter `y[n] = x[n] + coef * y[n-1]`, with
+/// `y[0] = x[0]`.
+///
+/// # Arguments
+/// * `signal` - Input (pre-emphasized) signal
+/// * `coef` - De-emphasis coefficient
+///
+/// # Returns
+/// A new vector with the same length as `signal` (empty for empty input).
+pub fn apply_deemphasis(signal: &[f32], coef: f32) -> Vec<f32> {
+    let (&first, rest) = match signal.split_first() {
+        Some(parts) => parts,
+        None => return Vec::new(),
+    };
+    let mut restored = Vec::with_capacity(signal.len());
+    // `previous` carries y[n-1], the feedback term of the recursion.
+    let mut previous = first;
+    restored.push(previous);
+    for &sample in rest {
+        previous = sample + coef * previous;
+        restored.push(previous);
+    }
+    restored
+}
+/// Scale audio so that its largest absolute sample becomes 1.0.
+///
+/// Signals whose peak magnitude is below `1e-8` (which includes empty input)
+/// are returned as an unmodified copy to avoid dividing by (near) zero.
+pub fn normalize_audio(signal: &[f32]) -> Vec<f32> {
+    let mut peak = 0.0f32;
+    for sample in signal {
+        peak = peak.max(sample.abs());
+    }
+    if peak < 1e-8 {
+        return signal.to_vec();
+    }
+    signal.iter().map(|sample| sample / peak).collect()
+}
+/// Scale audio so that its largest absolute sample equals `peak`.
+///
+/// Signals whose peak magnitude is below `1e-8` (which includes empty input)
+/// are returned as an unmodified copy to avoid dividing by (near) zero.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `peak` - Desired maximum absolute amplitude of the output
+pub fn normalize_audio_peak(signal: &[f32], peak: f32) -> Vec<f32> {
+    let mut current_peak = 0.0f32;
+    for sample in signal {
+        current_peak = current_peak.max(sample.abs());
+    }
+    if current_peak < 1e-8 {
+        return signal.to_vec();
+    }
+    // One gain factor is computed up front and applied to every sample.
+    let gain = peak / current_peak;
+    signal.iter().map(|sample| sample * gain).collect()
+}
+/// Log-compress a single value: `ln(max(x, 1e-5))`.
+///
+/// The input is clamped from below at `1e-5` before the natural logarithm is
+/// taken, so zero and negative inputs still produce a finite result.
+///
+/// Used for mel spectrogram normalization.
+pub fn dynamic_range_compression(x: f32) -> f32 {
+    const FLOOR: f32 = 1e-5;
+    f32::ln(f32::max(x, FLOOR))
+}
+/// Apply [`dynamic_range_compression`] to every element of a slice.
+///
+/// Returns a new vector of the same length as `x`.
+pub fn dynamic_range_compression_array(x: &[f32]) -> Vec<f32> {
+    let mut compressed = Vec::with_capacity(x.len());
+    compressed.extend(x.iter().copied().map(dynamic_range_compression));
+    compressed
+}
+/// Undo log compression for a single value by computing `e^x`.
+pub fn dynamic_range_decompression(x: f32) -> f32 {
+    f32::exp(x)
+}
+/// Apply [`dynamic_range_decompression`] to every element of a slice.
+///
+/// Returns a new vector of the same length as `x`.
+pub fn dynamic_range_decompression_array(x: &[f32]) -> Vec<f32> {
+    let mut expanded = Vec::with_capacity(x.len());
+    expanded.extend(x.iter().copied().map(dynamic_range_decompression));
+    expanded
+}
+/// Scale a signal so that its root-mean-square level equals `target_rms`.
+///
+/// Empty input yields an empty vector. A signal whose RMS is below `1e-8` is
+/// returned as an unmodified copy to avoid dividing by (near) zero.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `target_rms` - Desired RMS level of the output
+pub fn normalize_rms(signal: &[f32], target_rms: f32) -> Vec<f32> {
+    if signal.is_empty() {
+        return Vec::new();
+    }
+    let sum_of_squares: f32 = signal.iter().map(|sample| sample * sample).sum();
+    let current_rms = (sum_of_squares / signal.len() as f32).sqrt();
+    if current_rms < 1e-8 {
+        return signal.to_vec();
+    }
+    let gain = target_rms / current_rms;
+    signal.iter().map(|sample| sample * gain).collect()
+}
+/// Apply soft clipping to prevent harsh distortion.
+///
+/// Samples with `|x| <= threshold` pass through unchanged. Larger samples keep
+/// their sign and are mapped to `threshold + (1 - e^(-excess))`, where
+/// `excess = |x| - threshold`.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `threshold` - Magnitude above which samples are compressed
+pub fn soft_clip(signal: &[f32], threshold: f32) -> Vec<f32> {
+    let mut clipped = Vec::with_capacity(signal.len());
+    for &sample in signal {
+        let magnitude = sample.abs();
+        let shaped = if magnitude <= threshold {
+            sample
+        } else {
+            let excess = magnitude - threshold;
+            sample.signum() * (threshold + (1.0 - (-excess).exp()))
+        };
+        clipped.push(shaped);
+    }
+    clipped
+}
+/// Pad an audio signal with zeros on both sides.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `pad_left` - Number of zero samples to prepend
+/// * `pad_right` - Number of zero samples to append
+///
+/// # Returns
+/// A vector of length `pad_left + signal.len() + pad_right`.
+pub fn pad_audio(signal: &[f32], pad_left: usize, pad_right: usize) -> Vec<f32> {
+    let mut padded = Vec::with_capacity(pad_left + signal.len() + pad_right);
+    padded.resize(pad_left, 0.0);
+    padded.extend_from_slice(signal);
+    padded.resize(padded.len() + pad_right, 0.0);
+    padded
+}
+/// Trim silence from the beginning and end of a signal.
+///
+/// A sample counts as non-silent when its absolute value is strictly greater
+/// than the linear amplitude `10^(threshold_db / 20)`.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `threshold_db` - Silence threshold in decibels (e.g. `-40.0`)
+///
+/// # Returns
+/// The samples from the first to the last non-silent sample, inclusive.
+/// Returns an empty vector when the input is empty or contains no sample
+/// above the threshold.
+pub fn trim_silence(signal: &[f32], threshold_db: f32) -> Vec<f32> {
+    let threshold = 10f32.powf(threshold_db / 20.0);
+    // No loud sample at all means the whole signal is silence: trim everything.
+    let start = match signal.iter().position(|&x| x.abs() > threshold) {
+        Some(index) => index,
+        None => return Vec::new(),
+    };
+    // A loud sample exists, so the reverse search always succeeds and
+    // `end >= start`; `start` is only a defensive fallback.
+    let end = signal
+        .iter()
+        .rposition(|&x| x.abs() > threshold)
+        .unwrap_or(start);
+    // `start == end` is a single loud sample and must be kept, not discarded.
+    signal[start..=end].to_vec()
+}
+/// Apply a linear fade-in and fade-out to avoid clicks.
+///
+/// The first `fade_in_samples` samples are multiplied by `i / fade_in_samples`
+/// (so the very first sample becomes 0). The last `fade_out_samples` samples
+/// are multiplied by `i / fade_out_samples`, counting `i` from the end (so the
+/// very last sample becomes 0). Both ramps are limited to the signal length
+/// and are applied one after the other, so overlapping ramps multiply.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `fade_in_samples` - Length of the fade-in ramp in samples
+/// * `fade_out_samples` - Length of the fade-out ramp in samples
+pub fn apply_fade(signal: &[f32], fade_in_samples: usize, fade_out_samples: usize) -> Vec<f32> {
+    let mut faded = signal.to_vec();
+    // Ramp up from the start of the signal.
+    for (step, sample) in faded.iter_mut().take(fade_in_samples).enumerate() {
+        *sample *= step as f32 / fade_in_samples as f32;
+    }
+    // Ramp down toward the end: walk backwards so step 0 is the last sample.
+    for (step, sample) in faded.iter_mut().rev().take(fade_out_samples).enumerate() {
+        *sample *= step as f32 / fade_out_samples as f32;
+    }
+    faded
+}
+/// Compute the root-mean-square (RMS) level of a signal.
+///
+/// Returns `0.0` for an empty signal.
+pub fn compute_rms(signal: &[f32]) -> f32 {
+    match signal.len() {
+        0 => 0.0,
+        count => {
+            let sum_of_squares: f32 = signal.iter().map(|sample| sample * sample).sum();
+            (sum_of_squares / count as f32).sqrt()
+        }
+    }
+}
+/// Compute the peak amplitude (largest absolute sample value) of a signal.
+///
+/// Returns `0.0` for an empty signal.
+pub fn compute_peak(signal: &[f32]) -> f32 {
+    let mut peak = 0.0f32;
+    for sample in signal {
+        peak = peak.max(sample.abs());
+    }
+    peak
+}
+/// Compute the crest factor (peak amplitude divided by RMS level).
+///
+/// Returns `0.0` when the RMS level is below `1e-8`, which covers empty and
+/// all-zero signals, to avoid dividing by (near) zero.
+pub fn compute_crest_factor(signal: &[f32]) -> f32 {
+    let rms = compute_rms(signal);
+    if rms < 1e-8 {
+        0.0
+    } else {
+        compute_peak(signal) / rms
+    }
+}

src/audio/io.rs ADDED Viewed

	@@ -0,0 +1,150 @@

+//! Audio I/O operations
+use crate::{Error, Result};
+use hound::{SampleFormat, WavReader, WavSpec, WavWriter};
+use std::path::Path;
+/// Container pairing a buffer of audio samples with its sample rate.
+#[derive(Debug, Clone)]
+pub struct AudioData {
+    /// Audio samples (mono, normalized to [-1, 1])
+    pub samples: Vec<f32>,
+    /// Sample rate in Hz
+    pub sample_rate: u32,
+}
+impl AudioData {
+    /// Build an `AudioData` from a sample buffer and its sample rate in Hz.
+    pub fn new(samples: Vec<f32>, sample_rate: u32) -> Self {
+        Self { samples, sample_rate }
+    }
+    /// Duration in seconds: number of samples divided by the sample rate.
+    pub fn duration(&self) -> f32 {
+        let sample_count = self.samples.len() as f32;
+        let rate = self.sample_rate as f32;
+        sample_count / rate
+    }
+    /// Number of samples held.
+    pub fn len(&self) -> usize {
+        self.samples.len()
+    }
+    /// `true` when there are no samples.
+    pub fn is_empty(&self) -> bool {
+        self.samples.is_empty()
+    }
+}
+/// Load audio from a WAV file.
+///
+/// Integer PCM samples are scaled to `[-1, 1]` by dividing by
+/// `2^(bits_per_sample - 1)`; float samples are used as stored. Multi-channel
+/// audio is down-mixed to mono by averaging the channels of each frame.
+///
+/// # Arguments
+/// * `path` - Path to WAV file
+/// * `target_sr` - Optional target sample rate (will resample if different)
+///
+/// # Returns
+/// Mono audio data with samples normalized to [-1, 1]
+///
+/// # Errors
+/// * `Error::FileNotFound` if `path` does not exist.
+/// * `Error::Audio` if the file cannot be opened or its samples cannot be read.
+/// * Any error returned by the resampler when `target_sr` differs from the
+///   file's sample rate.
+pub fn load_audio<P: AsRef<Path>>(path: P, target_sr: Option<u32>) -> Result<AudioData> {
+    let path = path.as_ref();
+    if !path.exists() {
+        return Err(Error::FileNotFound(path.display().to_string()));
+    }
+    let reader = WavReader::open(path).map_err(|e| Error::Audio(format!("Failed to open WAV: {}", e)))?;
+    let spec = reader.spec();
+    let sample_rate = spec.sample_rate;
+    let channels = spec.channels as usize;
+    // Read samples based on format
+    let samples: Vec<f32> = match spec.sample_format {
+        SampleFormat::Float => reader
+            .into_samples::<f32>()
+            .collect::<std::result::Result<Vec<_>, _>>()
+            .map_err(|e| Error::Audio(format!("Failed to read samples: {}", e)))?,
+        SampleFormat::Int => {
+            // Full scale is 2^(bits - 1). Compute it in floating point: the
+            // integer form `1 << (bits - 1)` is evaluated as i32, which wraps
+            // to a negative value for 32-bit PCM and would invert the signal.
+            let max_val = 2f32.powi(i32::from(spec.bits_per_sample) - 1);
+            reader
+                .into_samples::<i32>()
+                .map(|sample| sample.map(|value| value as f32 / max_val))
+                .collect::<std::result::Result<Vec<f32>, _>>()
+                .map_err(|e| Error::Audio(format!("Failed to read samples: {}", e)))?
+        }
+    };
+    // Down-mix interleaved multi-channel audio by averaging each frame
+    let mono_samples = if channels > 1 {
+        samples
+            .chunks(channels)
+            .map(|chunk| chunk.iter().sum::<f32>() / channels as f32)
+            .collect()
+    } else {
+        samples
+    };
+    let mut audio = AudioData::new(mono_samples, sample_rate);
+    // Resample if needed
+    if let Some(target) = target_sr {
+        if target != sample_rate {
+            audio = super::resample::resample(&audio, target)?;
+        }
+    }
+    Ok(audio)
+}
+/// Write audio to a WAV file as mono 32-bit float at the audio's sample rate.
+///
+/// # Arguments
+/// * `path` - Output path
+/// * `audio` - Audio data to save
+///
+/// # Errors
+/// Returns `Error::Audio` if the writer cannot be created, a sample cannot be
+/// written, or the file cannot be finalized.
+pub fn save_audio<P: AsRef<Path>>(path: P, audio: &AudioData) -> Result<()> {
+    let mut writer = WavWriter::create(
+        path,
+        WavSpec {
+            channels: 1,
+            sample_rate: audio.sample_rate,
+            bits_per_sample: 32,
+            sample_format: SampleFormat::Float,
+        },
+    )
+    .map_err(|e| Error::Audio(format!("Failed to create WAV writer: {}", e)))?;
+    // Stops at the first sample that fails to write.
+    audio
+        .samples
+        .iter()
+        .try_for_each(|&sample| writer.write_sample(sample))
+        .map_err(|e| Error::Audio(format!("Failed to write sample: {}", e)))?;
+    writer
+        .finalize()
+        .map_err(|e| Error::Audio(format!("Failed to finalize WAV: {}", e)))?;
+    Ok(())
+}
+/// Save a raw sample slice to a WAV file at the given sample rate.
+///
+/// Copies `samples` into an [`AudioData`] and delegates to [`save_audio`].
+pub fn save_samples<P: AsRef<Path>>(path: P, samples: &[f32], sample_rate: u32) -> Result<()> {
+    save_audio(path, &AudioData::new(samples.to_vec(), sample_rate))
+}
+/// Load multiple audio files in parallel via rayon's parallel iterator.
+///
+/// Each path is loaded with [`load_audio`] using the same `target_sr`.
+/// The collected result is an error if any individual load fails.
+pub fn load_audio_batch<P: AsRef<Path> + Sync>(
+    paths: &[P],
+    target_sr: Option<u32>,
+) -> Result<Vec<AudioData>> {
+    use rayon::prelude::*;
+    let loaded: Result<Vec<AudioData>> = paths
+        .par_iter()
+        .map(|path| load_audio(path, target_sr))
+        .collect();
+    loaded
+}

src/audio/mel.rs ADDED Viewed

	@@ -0,0 +1,356 @@

+//! Mel-spectrogram computation
+//!
+//! Implements Short-Time Fourier Transform (STFT) and mel filterbank
+use crate::{Error, Result};
+use ndarray::{Array1, Array2, Axis};
+use num_complex::Complex;
+use realfft::RealFftPlanner;
+use std::f32::consts::PI;
+use super::AudioConfig;
+/// Mel filterbank that maps a linear-frequency spectrogram onto mel bands.
+#[derive(Debug, Clone)]
+pub struct MelFilterbank {
+    /// Filterbank matrix (n_mels x n_fft/2+1)
+    pub filters: Array2<f32>,
+    /// Sample rate
+    pub sample_rate: u32,
+    /// Number of mel bands
+    pub n_mels: usize,
+    /// FFT size
+    pub n_fft: usize,
+}
+impl MelFilterbank {
+    /// Build a filterbank with `n_mels` bands covering `fmin..fmax` (in Hz)
+    /// for the given sample rate and FFT size.
+    pub fn new(sample_rate: u32, n_fft: usize, n_mels: usize, fmin: f32, fmax: f32) -> Self {
+        Self {
+            filters: create_mel_filterbank(sample_rate, n_fft, n_mels, fmin, fmax),
+            sample_rate,
+            n_mels,
+            n_fft,
+        }
+    }
+    /// Apply the filterbank to a power spectrogram.
+    ///
+    /// Computes the matrix product `filters · spectrogram`:
+    /// `(n_mels, n_fft/2+1) · (n_fft/2+1, time_frames) -> (n_mels, time_frames)`.
+    pub fn apply(&self, spectrogram: &Array2<f32>) -> Array2<f32> {
+        let Self { filters, .. } = self;
+        filters.dot(spectrogram)
+    }
+}
+/// Convert a frequency in Hz to the mel scale.
+///
+/// Uses `mel = 2595 * log10(1 + hz / 700)`.
+pub fn hz_to_mel(hz: f32) -> f32 {
+    let ratio = 1.0 + hz / 700.0;
+    2595.0 * ratio.log10()
+}
+/// Convert a mel-scale value back to a frequency in Hz.
+///
+/// Uses `hz = 700 * (10^(mel / 2595) - 1)`, the inverse of [`hz_to_mel`].
+pub fn mel_to_hz(mel: f32) -> f32 {
+    let exponent = mel / 2595.0;
+    700.0 * (10f32.powf(exponent) - 1.0)
+}
+/// Create the mel filterbank matrix of shape `(n_mels, n_fft / 2 + 1)`.
+///
+/// `n_mels + 2` points are spaced evenly on the mel scale between `fmin` and
+/// `fmax`, converted back to Hz and then to FFT bin indices. Each band `m` is a
+/// triangle rising from bin `m` to bin `m + 1` and falling to bin `m + 2`.
+fn create_mel_filterbank(
+    sample_rate: u32,
+    n_fft: usize,
+    n_mels: usize,
+    fmin: f32,
+    fmax: f32,
+) -> Array2<f32> {
+    let n_freqs = n_fft / 2 + 1;
+    let mel_min = hz_to_mel(fmin);
+    let mel_max = hz_to_mel(fmax);
+    // Band edges: evenly spaced in mel, mapped to Hz, then to an FFT bin index.
+    let bin_points: Vec<usize> = (0..=n_mels + 1)
+        .map(|i| mel_min + (mel_max - mel_min) * i as f32 / (n_mels + 1) as f32)
+        .map(mel_to_hz)
+        .map(|hz| ((n_fft as f32 + 1.0) * hz / sample_rate as f32).floor() as usize)
+        .collect();
+    let mut filters = Array2::zeros((n_mels, n_freqs));
+    // Every window of three consecutive edges is (left, center, right) of a band.
+    for (m, edges) in bin_points.windows(3).enumerate() {
+        let (f_left, f_center, f_right) = (edges[0], edges[1], edges[2]);
+        // Rising slope; bins at or beyond n_freqs are outside the matrix.
+        for k in f_left..f_center.min(n_freqs) {
+            filters[[m, k]] = (k - f_left) as f32 / (f_center - f_left).max(1) as f32;
+        }
+        // Falling slope, starting at the peak value 1.0 on the center bin.
+        for k in f_center..f_right.min(n_freqs) {
+            filters[[m, k]] = (f_right - k) as f32 / (f_right - f_center).max(1) as f32;
+        }
+    }
+    filters
+}
+/// Generate a Hann window of `size` samples.
+///
+/// Sample `n` is `0.5 * (1 - cos(2π n / size))`. The divisor is `size`, not
+/// `size - 1`, so the first sample is zero and the last one is not.
+fn hann_window(size: usize) -> Vec<f32> {
+    let len = size as f32;
+    let mut window = Vec::with_capacity(size);
+    for n in 0..size {
+        let phase = 2.0 * PI * n as f32 / len;
+        window.push(0.5 * (1.0 - phase.cos()));
+    }
+    window
+}
+/// Compute Short-Time Fourier Transform (STFT)
+///
+/// The signal is zero-padded by `n_fft / 2` samples on each side, cut into
+/// frames of `n_fft` samples spaced `hop_length` apart, windowed with a Hann
+/// window and transformed with a real-to-complex FFT.
+///
+/// # Arguments
+/// * `signal` - Input audio signal
+/// * `n_fft` - FFT size
+/// * `hop_length` - Hop length between frames
+/// * `win_length` - Window length; must not exceed `n_fft`. A shorter window
+///   is applied to the first `win_length` samples of each frame and the rest
+///   of the frame is zeroed (the window is not centred within the frame).
+///
+/// # Returns
+/// Complex STFT matrix (n_fft/2+1, time_frames)
+///
+/// # Errors
+/// Returns `Error::Audio` if `signal` is empty, if `n_fft` or `hop_length` is
+/// zero, if `win_length` is larger than `n_fft`, or if the FFT itself fails.
+pub fn stft(
+    signal: &[f32],
+    n_fft: usize,
+    hop_length: usize,
+    win_length: usize,
+) -> Result<Array2<Complex<f32>>> {
+    if signal.is_empty() {
+        return Err(Error::Audio("Empty signal".into()));
+    }
+    // A zero hop would divide by zero when counting frames, and a zero-size
+    // FFT is meaningless; report both instead of panicking.
+    if n_fft == 0 || hop_length == 0 {
+        return Err(Error::Audio(format!(
+            "Invalid STFT parameters: n_fft ({}) and hop_length ({}) must be non-zero",
+            n_fft, hop_length
+        )));
+    }
+    // The windowed samples are written into an n_fft-sized buffer, so a longer
+    // window would index out of bounds.
+    if win_length > n_fft {
+        return Err(Error::Audio(format!(
+            "Invalid STFT parameters: win_length ({}) exceeds n_fft ({})",
+            win_length, n_fft
+        )));
+    }
+    // Create window
+    let window = hann_window(win_length);
+    // Pad signal with n_fft / 2 zeros on both sides
+    let pad_length = n_fft / 2;
+    let mut padded = Vec::with_capacity(signal.len() + 2 * pad_length);
+    padded.resize(pad_length, 0.0f32);
+    padded.extend_from_slice(signal);
+    padded.resize(padded.len() + pad_length, 0.0f32);
+    // Calculate number of frames. With a non-empty signal the padded length
+    // is at least n_fft, so the subtraction cannot underflow.
+    let num_frames = (padded.len() - n_fft) / hop_length + 1;
+    let n_freqs = n_fft / 2 + 1;
+    // Create FFT planner
+    let mut planner = RealFftPlanner::<f32>::new();
+    let fft = planner.plan_fft_forward(n_fft);
+    // Output matrix
+    let mut stft_matrix = Array2::zeros((n_freqs, num_frames));
+    // Buffers reused across frames
+    let mut input_buffer = vec![0.0f32; n_fft];
+    let mut output_buffer = vec![Complex::new(0.0f32, 0.0f32); n_freqs];
+    for frame_idx in 0..num_frames {
+        let start = frame_idx * hop_length;
+        // Extract and window the frame
+        let frame = &padded[start..start + win_length];
+        for ((slot, &sample), &weight) in input_buffer[..win_length]
+            .iter_mut()
+            .zip(frame)
+            .zip(&window)
+        {
+            *slot = sample * weight;
+        }
+        // Zero pad if win_length < n_fft. This is redone for every frame so
+        // that nothing left in the buffer by the previous FFT call leaks in.
+        input_buffer[win_length..].fill(0.0);
+        // Perform FFT
+        fft.process(&mut input_buffer, &mut output_buffer)
+            .map_err(|e| Error::Audio(format!("FFT failed: {}", e)))?;
+        // Store result
+        for (freq_idx, &val) in output_buffer.iter().enumerate() {
+            stft_matrix[[freq_idx, frame_idx]] = val;
+        }
+    }
+    Ok(stft_matrix)
+}
+/// Element-wise magnitude `|z|` of a complex STFT matrix.
+///
+/// The output has the same shape as `stft_matrix`.
+pub fn magnitude_spectrogram(stft_matrix: &Array2<Complex<f32>>) -> Array2<f32> {
+    stft_matrix.map(|bin| bin.norm())
+}
+/// Element-wise squared magnitude `|z|^2` of a complex STFT matrix.
+///
+/// The output has the same shape as `stft_matrix`.
+pub fn power_spectrogram(stft_matrix: &Array2<Complex<f32>>) -> Array2<f32> {
+    stft_matrix.map(|bin| bin.norm_sqr())
+}
+/// Compute a log-mel spectrogram from raw audio samples.
+///
+/// Pipeline: STFT, then power spectrogram, then mel filterbank, then natural
+/// log. Values are floored at `1e-10` before the log so silent bins stay finite.
+///
+/// # Arguments
+/// * `signal` - Audio samples
+/// * `config` - Audio configuration supplying the STFT and filterbank parameters
+///
+/// # Returns
+/// Log mel spectrogram (n_mels, time_frames)
+///
+/// # Errors
+/// Propagates any error returned by `stft`.
+pub fn mel_spectrogram(signal: &[f32], config: &AudioConfig) -> Result<Array2<f32>> {
+    let &AudioConfig {
+        sample_rate,
+        n_fft,
+        hop_length,
+        win_length,
+        n_mels,
+        fmin,
+        fmax,
+    } = config;
+    // STFT first, so an invalid signal fails before any filterbank work
+    let power = stft(signal, n_fft, hop_length, win_length).map(|s| power_spectrogram(&s))?;
+    // Project onto the mel bands
+    let mut mel = MelFilterbank::new(sample_rate, n_fft, n_mels, fmin, fmax).apply(&power);
+    // Log compression, in place
+    mel.mapv_inplace(|energy| energy.max(1e-10).ln());
+    Ok(mel)
+}
+/// Compute mel spectrogram with normalization
+///
+/// Every value `x` of the log-mel spectrogram is replaced by `(x - mean) / std`.
+///
+/// # Arguments
+/// * `signal` - Audio samples
+/// * `config` - Audio configuration
+/// * `mean` - Mean to subtract; when `None` it is computed from the spectrogram
+/// * `std` - Standard deviation to divide by; when `None` it is computed from
+///   the spectrogram
+///
+/// `mean` and `std` are resolved independently, so supplying only one of them
+/// uses that value together with the computed counterpart. If the *computed*
+/// standard deviation is not above `1e-8`, the spectrogram is returned
+/// unnormalized rather than being blown up by a near-zero divisor.
+///
+/// # Errors
+/// Returns `Error::Audio` if a supplied `std` is zero or not finite, and
+/// propagates any error from `mel_spectrogram`.
+pub fn mel_spectrogram_normalized(
+    signal: &[f32],
+    config: &AudioConfig,
+    mean: Option<f32>,
+    std: Option<f32>,
+) -> Result<Array2<f32>> {
+    // Dividing by a caller-supplied 0 or NaN would fill the output with inf/NaN
+    if let Some(s) = std {
+        if !s.is_finite() || s == 0.0 {
+            return Err(Error::Audio(format!(
+                "Invalid normalization std: {} (must be finite and non-zero)",
+                s
+            )));
+        }
+    }
+    let mut mel_spec = mel_spectrogram(signal, config)?;
+    // Use the supplied mean, or fall back to the spectrogram's own
+    let m = mean.unwrap_or_else(|| mel_spec.mean().unwrap_or(0.0));
+    match std {
+        Some(s) => mel_spec.mapv_inplace(|x| (x - m) / s),
+        None => {
+            // Compute the deviation from the spectrogram itself
+            let s = mel_spec.std(0.0);
+            if s > 1e-8 {
+                mel_spec.mapv_inplace(|x| (x - m) / s);
+            }
+        }
+    }
+    Ok(mel_spec)
+}
+/// Convert mel spectrogram back to linear spectrogram (approximate)
+///
+/// The mel spectrogram is multiplied by the *transpose* of the filterbank
+/// matrix. This is a cheap stand-in for a true pseudo-inverse, so the result
+/// only approximates the original linear-frequency spectrogram.
+///
+/// The number of rows in `mel_spec` must equal the number of filters in
+/// `mel_fb`; the output has one row per filterbank column.
+pub fn mel_to_linear(mel_spec: &Array2<f32>, mel_fb: &MelFilterbank) -> Array2<f32> {
+    // Simple approximation using transpose
+    mel_fb.filters.t().dot(mel_spec)
+}
+/// Compute spectrogram energy per frame
+///
+/// Sums the spectrogram over its first axis, giving one value per column.
+///
+/// NOTE(review): if `mel_spec` holds log-compressed values (as
+/// `mel_spectrogram` returns), each result is a sum of log energies rather
+/// than the log of the summed energy — confirm this is what callers expect.
+pub fn frame_energy(mel_spec: &Array2<f32>) -> Array1<f32> {
+    mel_spec.sum_axis(Axis(0))
+}
+/// Flag frames whose energy lies within `threshold_db` of the loudest frame.
+///
+/// The cutoff is `max(frame_energy) + threshold_db`, so `threshold_db` is
+/// expected to be negative. A frame is marked active only when its energy is
+/// strictly above the cutoff. The offset is added directly to the values
+/// returned by `frame_energy`, in whatever scale the spectrogram uses.
+///
+/// Returns one flag per frame.
+pub fn voice_activity_detection(mel_spec: &Array2<f32>, threshold_db: f32) -> Vec<bool> {
+    let per_frame = frame_energy(mel_spec);
+    let peak = per_frame
+        .iter()
+        .copied()
+        .fold(f32::NEG_INFINITY, |best, level| best.max(level));
+    let cutoff = peak + threshold_db;
+    let mut active = Vec::with_capacity(per_frame.len());
+    for &level in per_frame.iter() {
+        active.push(level > cutoff);
+    }
+    active
+}
+// Unit tests for the mel-scale helpers, window, STFT and mel spectrogram.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_hz_to_mel() {
+        // 0 Hz must map to 0 mel; 1000 Hz should land close to 1000 mel
+        assert!((hz_to_mel(0.0) - 0.0).abs() < 1e-6);
+        assert!((hz_to_mel(1000.0) - 1000.0).abs() < 50.0); // Roughly linear at low freqs
+    }
+    #[test]
+    fn test_mel_to_hz() {
+        // Round trip: hz -> mel -> hz should reproduce the input
+        let hz = 440.0;
+        let mel = hz_to_mel(hz);
+        let hz_back = mel_to_hz(mel);
+        assert!((hz - hz_back).abs() < 1e-4);
+    }
+    #[test]
+    fn test_mel_filterbank_creation() {
+        let fb = MelFilterbank::new(22050, 1024, 80, 0.0, 8000.0);
+        // One row per mel band, one column per FFT bin (1024 / 2 + 1)
+        assert_eq!(fb.filters.shape(), &[80, 513]);
+        // Check that filters are non-empty (some filter banks have coverage)
+        let total_sum: f32 = fb.filters.iter().sum();
+        assert!(total_sum > 0.0, "Filterbank should have some non-zero values");
+    }
+    #[test]
+    fn test_hann_window() {
+        let window = hann_window(1024);
+        assert_eq!(window.len(), 1024);
+        // The first sample is zero (only index 0 is checked here)
+        assert!(window[0].abs() < 1e-6);
+        // Check middle is near 1
+        assert!((window[512] - 1.0).abs() < 1e-4);
+    }
+    #[test]
+    fn test_stft_basic() {
+        // Create a simple sine wave: 0.1 s of 440 Hz at 22050 Hz
+        let sr = 22050;
+        let freq = 440.0;
+        let duration = 0.1;
+        let num_samples = (sr as f32 * duration) as usize;
+        let signal: Vec<f32> = (0..num_samples)
+            .map(|i| (2.0 * PI * freq * i as f32 / sr as f32).sin())
+            .collect();
+        let result = stft(&signal, 1024, 256, 1024);
+        assert!(result.is_ok());
+        let stft_matrix = result.unwrap();
+        assert_eq!(stft_matrix.shape()[0], 513); // n_fft/2 + 1
+        assert!(stft_matrix.shape()[1] > 0); // Some frames
+    }
+    #[test]
+    fn test_mel_spectrogram() {
+        // Shape-only check using the default configuration
+        let config = AudioConfig::default();
+        let num_samples = (config.sample_rate as f32 * 0.1) as usize;
+        let signal: Vec<f32> = (0..num_samples).map(|i| (i as f32 * 0.01).sin()).collect();
+        let result = mel_spectrogram(&signal, &config);
+        assert!(result.is_ok());
+        let mel_spec = result.unwrap();
+        assert_eq!(mel_spec.shape()[0], config.n_mels);
+        assert!(mel_spec.shape()[1] > 0);
+    }
+}

src/audio/mod.rs ADDED Viewed

	@@ -0,0 +1,57 @@

+//! Audio processing module for IndexTTS
+//!
+//! Provides mel-spectrogram computation, audio I/O, and DSP operations.
+mod dsp;
+mod io;
+pub mod mel;
+mod resample;
+pub use dsp::{
+    apply_fade, apply_preemphasis, dynamic_range_compression, dynamic_range_decompression,
+    normalize_audio, normalize_audio_peak,
+};
+pub use io::{load_audio, save_audio, AudioData};
+pub use mel::{mel_spectrogram, mel_to_linear, MelFilterbank};
+pub use resample::resample;
+use crate::Result;
+/// Parameters shared by the STFT and mel-filterbank stages.
+#[derive(Debug, Clone)]
+pub struct AudioConfig {
+    /// Sampling rate of the audio, in Hz
+    pub sample_rate: u32,
+    /// Number of points in each FFT
+    pub n_fft: usize,
+    /// Number of samples between the starts of consecutive STFT frames
+    pub hop_length: usize,
+    /// Length of the analysis window, in samples
+    pub win_length: usize,
+    /// Number of mel bands in the filterbank
+    pub n_mels: usize,
+    /// Lower edge of the mel filterbank, in Hz
+    pub fmin: f32,
+    /// Upper edge of the mel filterbank, in Hz
+    pub fmax: f32,
+}
+impl Default for AudioConfig {
+    /// 22.05 kHz audio, a 1024-point FFT and window, a hop of 256 samples and
+    /// 80 mel bands spanning 0 Hz to 8000 Hz.
+    fn default() -> Self {
+        AudioConfig {
+            sample_rate: 22_050,
+            n_fft: 1024,
+            hop_length: 256,
+            win_length: 1024,
+            n_mels: 80,
+            fmin: 0.0,
+            fmax: 8_000.0,
+        }
+    }
+}
+/// Load an audio file and compute its mel spectrogram.
+///
+/// The file is read with `load_audio`, passing `config.sample_rate` as the
+/// requested sample rate, and the loaded samples are handed to
+/// `mel_spectrogram` with the same `config`.
+///
+/// # Errors
+/// Propagates any error from `load_audio` or `mel_spectrogram`.
+pub fn compute_mel_from_file(path: &str, config: &AudioConfig) -> Result<ndarray::Array2<f32>> {
+    load_audio(path, Some(config.sample_rate))
+        .and_then(|audio| mel_spectrogram(&audio.samples, config))
+}

src/audio/resample.rs ADDED Viewed

	@@ -0,0 +1,75 @@

+//! Audio resampling using rubato
+use crate::{Error, Result};
+use rubato::{
+    FastFixedIn, PolynomialDegree, Resampler,
+};
+use super::AudioData;
+/// Resample audio to target sample rate
+///
+/// Uses rubato's `FastFixedIn` resampler with cubic polynomial interpolation
+/// (not sinc interpolation), treating the samples as a single channel. Input
+/// is fed in fixed-size chunks; the final partial chunk is zero-padded and the
+/// output is truncated to `ceil(len * target_sr / sample_rate)` samples.
+///
+/// If the audio is already at `target_sr`, a clone is returned unchanged.
+///
+/// NOTE(review): any latency introduced by the resampler is not compensated
+/// for here — confirm whether callers need sample-accurate alignment.
+///
+/// # Errors
+/// Returns `Error::Audio` if either sample rate is zero, if the resampler
+/// cannot be created, or if processing a chunk fails.
+pub fn resample(audio: &AudioData, target_sr: u32) -> Result<AudioData> {
+    if audio.sample_rate == target_sr {
+        return Ok(audio.clone());
+    }
+    // A zero rate would make the ratio 0 or infinite; reject it up front
+    // instead of handing a nonsensical ratio to the resampler.
+    if audio.sample_rate == 0 || target_sr == 0 {
+        return Err(Error::Audio(format!(
+            "Invalid sample rate: cannot resample from {} Hz to {} Hz",
+            audio.sample_rate, target_sr
+        )));
+    }
+    let resample_ratio = target_sr as f64 / audio.sample_rate as f64;
+    // Create resampler
+    let mut resampler = FastFixedIn::<f32>::new(
+        resample_ratio,
+        1.0, // max relative ratio (no variance)
+        PolynomialDegree::Cubic,
+        1024, // chunk size
+        1,    // channels
+    ).map_err(|e| Error::Audio(format!("Failed to create resampler: {}", e)))?;
+    // Process in chunks of the size the resampler asks for
+    let input_frames_needed = resampler.input_frames_next();
+    let mut input_buffer = vec![vec![0.0f32; input_frames_needed]];
+    let mut output_samples = Vec::new();
+    for chunk in audio.samples.chunks(input_frames_needed) {
+        // Fill input buffer; a short final chunk is padded with zeros
+        input_buffer[0][..chunk.len()].copy_from_slice(chunk);
+        input_buffer[0][chunk.len()..].fill(0.0);
+        // Resample
+        let output = resampler
+            .process(&input_buffer, None)
+            .map_err(|e| Error::Audio(format!("Resampling failed: {}", e)))?;
+        output_samples.extend_from_slice(&output[0]);
+    }
+    // Trim to expected length (drops output produced by the zero padding)
+    let expected_len = (audio.samples.len() as f64 * resample_ratio).ceil() as usize;
+    output_samples.truncate(expected_len);
+    Ok(AudioData::new(output_samples, target_sr))
+}
+/// Convenience wrapper around `resample` with a fixed 22050 Hz target
+/// (a common TTS sample rate).
+pub fn resample_to_22k(audio: &AudioData) -> Result<AudioData> {
+    const TARGET_SR: u32 = 22_050;
+    resample(audio, TARGET_SR)
+}
+/// Convenience wrapper around `resample` with a fixed 16000 Hz target
+/// (a common ASR sample rate).
+pub fn resample_to_16k(audio: &AudioData) -> Result<AudioData> {
+    const TARGET_SR: u32 = 16_000;
+    resample(audio, TARGET_SR)
+}