Spaces:
Sleeping
REVERT: Switch back to 1B multilingual model for T4 GPU compatibility
Browse filesRoot cause analysis completed:
- 2.6B model (5.2GB) exceeded T4 GPU memory (15GB with inference overhead)
- Solution: Use 1B multilingual model optimized for English processing
Key Changes in v1.4.0:
- Dockerfile: Download 1B model (stt-1b-en_fr-candle) instead of 2.6B
- Model config: Revert to asr_v0_1_1b() with default 48000 vocab
- Multistream: Use default Config::v0_1() (text_start_token: 32000 < 48000 vocab)
- Python: Use config-stt-en_fr-hf.toml (multilingual but English-optimized)
This should resolve the GPU memory issue while maintaining proper vocab alignment:
- text_start_token: 32000 (from default config)
- model vocab_size: 48000 (from asr_v0_1_1b)
- 32000 < 48000 ✅ (valid token range)
The 1B model provides excellent English performance within T4 memory constraints.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- Cargo.toml +1 -1
- Dockerfile +15 -15
- app.py +2 -2
- diagnostic_test.py +14 -0
- src/model.rs +6 -20
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[package]
|
| 2 |
name = "kyutai-stt-server"
|
| 3 |
-
version = "1.
|
| 4 |
edition = "2021"
|
| 5 |
|
| 6 |
[dependencies]
|
|
|
|
| 1 |
[package]
|
| 2 |
name = "kyutai-stt-server"
|
| 3 |
+
version = "1.4.0"
|
| 4 |
edition = "2021"
|
| 5 |
|
| 6 |
[dependencies]
|
|
@@ -98,38 +98,38 @@ RUN pip3 install --no-cache-dir huggingface-hub
|
|
| 98 |
# Set working directory for models first
|
| 99 |
WORKDIR /app/models
|
| 100 |
|
| 101 |
-
# Create models directory for
|
| 102 |
-
RUN mkdir -p kyutai/stt-
|
| 103 |
|
| 104 |
-
# Create download script for
|
| 105 |
RUN echo 'from huggingface_hub import hf_hub_download\n\
|
| 106 |
import os\n\
|
| 107 |
import subprocess\n\
|
| 108 |
\n\
|
| 109 |
-
os.makedirs("kyutai/stt-
|
| 110 |
-
print("📥 Downloading
|
| 111 |
\n\
|
| 112 |
print("⬇️ Downloading model.safetensors...")\n\
|
| 113 |
hf_hub_download(\n\
|
| 114 |
-
repo_id="kyutai/stt-
|
| 115 |
filename="model.safetensors",\n\
|
| 116 |
-
local_dir="kyutai/stt-
|
| 117 |
local_dir_use_symlinks=False\n\
|
| 118 |
)\n\
|
| 119 |
\n\
|
| 120 |
-
print("⬇️ Downloading tokenizer (
|
| 121 |
hf_hub_download(\n\
|
| 122 |
-
repo_id="kyutai/stt-
|
| 123 |
-
filename="
|
| 124 |
-
local_dir="kyutai/stt-
|
| 125 |
local_dir_use_symlinks=False\n\
|
| 126 |
)\n\
|
| 127 |
\n\
|
| 128 |
print("⬇️ Downloading Mimi audio tokenizer...")\n\
|
| 129 |
hf_hub_download(\n\
|
| 130 |
-
repo_id="kyutai/stt-
|
| 131 |
filename="mimi-pytorch-e351c8d8@125.safetensors",\n\
|
| 132 |
-
local_dir="kyutai/stt-
|
| 133 |
local_dir_use_symlinks=False\n\
|
| 134 |
)\n\
|
| 135 |
\n\
|
|
@@ -189,9 +189,9 @@ EXPOSE 7860
|
|
| 189 |
|
| 190 |
# Create startup script
|
| 191 |
RUN echo '#!/bin/bash\n\
|
| 192 |
-
echo "🚀 Starting Kyutai STT Server v1.
|
| 193 |
echo "📁 Pre-loaded models:"\n\
|
| 194 |
-
ls -lah models/kyutai/stt-
|
| 195 |
echo "GPU Info:"\n\
|
| 196 |
nvidia-smi || echo "No GPU detected at runtime"\n\
|
| 197 |
echo "Starting Python frontend with integrated Rust server..."\n\
|
|
|
|
| 98 |
# Set working directory for models first
|
| 99 |
WORKDIR /app/models
|
| 100 |
|
| 101 |
+
# Create models directory for 1B multilingual model (T4 GPU compatible)
|
| 102 |
+
RUN mkdir -p kyutai/stt-1b-en_fr-candle
|
| 103 |
|
| 104 |
+
# Create download script for 1B multilingual model
|
| 105 |
RUN echo 'from huggingface_hub import hf_hub_download\n\
|
| 106 |
import os\n\
|
| 107 |
import subprocess\n\
|
| 108 |
\n\
|
| 109 |
+
os.makedirs("kyutai/stt-1b-en_fr-candle", exist_ok=True)\n\
|
| 110 |
+
print("📥 Downloading 1B multilingual STT model (T4 GPU optimized)...")\n\
|
| 111 |
\n\
|
| 112 |
print("⬇️ Downloading model.safetensors...")\n\
|
| 113 |
hf_hub_download(\n\
|
| 114 |
+
repo_id="kyutai/stt-1b-en_fr-candle",\n\
|
| 115 |
filename="model.safetensors",\n\
|
| 116 |
+
local_dir="kyutai/stt-1b-en_fr-candle",\n\
|
| 117 |
local_dir_use_symlinks=False\n\
|
| 118 |
)\n\
|
| 119 |
\n\
|
| 120 |
+
print("⬇️ Downloading tokenizer (8000 vocab)...")\n\
|
| 121 |
hf_hub_download(\n\
|
| 122 |
+
repo_id="kyutai/stt-1b-en_fr-candle",\n\
|
| 123 |
+
filename="tokenizer_en_fr_audio_8000.model",\n\
|
| 124 |
+
local_dir="kyutai/stt-1b-en_fr-candle",\n\
|
| 125 |
local_dir_use_symlinks=False\n\
|
| 126 |
)\n\
|
| 127 |
\n\
|
| 128 |
print("⬇️ Downloading Mimi audio tokenizer...")\n\
|
| 129 |
hf_hub_download(\n\
|
| 130 |
+
repo_id="kyutai/stt-1b-en_fr-candle",\n\
|
| 131 |
filename="mimi-pytorch-e351c8d8@125.safetensors",\n\
|
| 132 |
+
local_dir="kyutai/stt-1b-en_fr-candle",\n\
|
| 133 |
local_dir_use_symlinks=False\n\
|
| 134 |
)\n\
|
| 135 |
\n\
|
|
|
|
| 189 |
|
| 190 |
# Create startup script
|
| 191 |
RUN echo '#!/bin/bash\n\
|
| 192 |
+
echo "🚀 Starting Kyutai STT Server v1.4.0 with pre-loaded models..."\n\
|
| 193 |
echo "📁 Pre-loaded models:"\n\
|
| 194 |
+
ls -lah models/kyutai/stt-1b-en_fr-candle/ || echo "No pre-loaded models found"\n\
|
| 195 |
echo "GPU Info:"\n\
|
| 196 |
nvidia-smi || echo "No GPU detected at runtime"\n\
|
| 197 |
echo "Starting Python frontend with integrated Rust server..."\n\
|
|
@@ -41,7 +41,7 @@ def start_rust_server():
|
|
| 41 |
"./kyutai-stt-server",
|
| 42 |
"--host", "127.0.0.1",
|
| 43 |
"--port", "8080",
|
| 44 |
-
"--config", "configs/config-stt-
|
| 45 |
]
|
| 46 |
},
|
| 47 |
{
|
|
@@ -50,7 +50,7 @@ def start_rust_server():
|
|
| 50 |
"./kyutai-stt-server",
|
| 51 |
"--host", "127.0.0.1",
|
| 52 |
"--port", "8080",
|
| 53 |
-
"--config", "configs/config-stt-
|
| 54 |
"--cpu"
|
| 55 |
]
|
| 56 |
}
|
|
|
|
| 41 |
"./kyutai-stt-server",
|
| 42 |
"--host", "127.0.0.1",
|
| 43 |
"--port", "8080",
|
| 44 |
+
"--config", "configs/config-stt-en_fr-hf.toml"
|
| 45 |
]
|
| 46 |
},
|
| 47 |
{
|
|
|
|
| 50 |
"./kyutai-stt-server",
|
| 51 |
"--host", "127.0.0.1",
|
| 52 |
"--port", "8080",
|
| 53 |
+
"--config", "configs/config-stt-en_fr-hf.toml",
|
| 54 |
"--cpu"
|
| 55 |
]
|
| 56 |
}
|
|
@@ -22,6 +22,20 @@ class STTDiagnostic:
|
|
| 22 |
print("🔍 COMPREHENSIVE STT DIAGNOSTIC TEST")
|
| 23 |
print("=" * 50)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
try:
|
| 26 |
# STEP 1: Test connection
|
| 27 |
print("\n📡 STEP 1: Testing WebSocket connection...")
|
|
|
|
| 22 |
print("🔍 COMPREHENSIVE STT DIAGNOSTIC TEST")
|
| 23 |
print("=" * 50)
|
| 24 |
|
| 25 |
+
# STEP 0: Check server health first
|
| 26 |
+
print("\n🏥 STEP 0: Checking server health...")
|
| 27 |
+
try:
|
| 28 |
+
import requests
|
| 29 |
+
health_response = requests.get("https://pgits-stt-gpu-service-v3.hf.space/health", timeout=5)
|
| 30 |
+
health_data = health_response.json()
|
| 31 |
+
print(f"📊 Server health: {health_data}")
|
| 32 |
+
|
| 33 |
+
if health_data.get("rust_server") != "ready":
|
| 34 |
+
print(f"⚠️ WARNING: Rust server status is '{health_data.get('rust_server')}', not 'ready'")
|
| 35 |
+
print("This explains why WebSocket connections might fail")
|
| 36 |
+
except Exception as e:
|
| 37 |
+
print(f"❌ Health check failed: {e}")
|
| 38 |
+
|
| 39 |
try:
|
| 40 |
# STEP 1: Test connection
|
| 41 |
print("\n📡 STEP 1: Testing WebSocket connection...")
|
|
@@ -65,15 +65,9 @@ impl MoshiAsrModel {
|
|
| 65 |
// VarBuilder not needed with load_streaming - kept for reference
|
| 66 |
// let _stt_vb = VarBuilder::from_tensors(stt_weights, dtype, device);
|
| 67 |
|
| 68 |
-
// Create LM model for
|
| 69 |
let mut lm_config = lm::Config::asr_v0_1_1b();
|
| 70 |
-
|
| 71 |
-
lm_config.text_out_vocab_size = 4000; // 4000 vocab for English model
|
| 72 |
-
|
| 73 |
-
// Update transformer config to match 2.6B model architecture
|
| 74 |
-
lm_config.transformer.d_model = 2048;
|
| 75 |
-
lm_config.transformer.num_heads = 32; // From config.json
|
| 76 |
-
lm_config.transformer.num_layers = 48; // From config.json
|
| 77 |
|
| 78 |
// Store vocab size before moving lm_config
|
| 79 |
let vocab_size = lm_config.text_out_vocab_size;
|
|
@@ -81,19 +75,11 @@ impl MoshiAsrModel {
|
|
| 81 |
let lm_model = lm::load_lm_model(lm_config, model_path, dtype, device)?;
|
| 82 |
info!("STT transformer loaded successfully");
|
| 83 |
|
| 84 |
-
//
|
| 85 |
-
//
|
| 86 |
-
let state_config = lm_generate_multistream::Config
|
| 87 |
-
generated_audio_codebooks: 8,
|
| 88 |
-
input_audio_codebooks: 8,
|
| 89 |
-
audio_vocab_size: 2049,
|
| 90 |
-
acoustic_delay: 2,
|
| 91 |
-
text_eop_token: 0, // End of phrase
|
| 92 |
-
text_pad_token: 3, // Padding token
|
| 93 |
-
text_start_token: 3999, // Use last valid token in 4000 vocab (0-3999)
|
| 94 |
-
};
|
| 95 |
|
| 96 |
-
info!("Using
|
| 97 |
state_config.text_start_token, vocab_size);
|
| 98 |
|
| 99 |
// Create logits processors (required for State::new)
|
|
|
|
| 65 |
// VarBuilder not needed with load_streaming - kept for reference
|
| 66 |
// let _stt_vb = VarBuilder::from_tensors(stt_weights, dtype, device);
|
| 67 |
|
| 68 |
+
// Create LM model for 1B multilingual STT (T4 GPU compatible)
|
| 69 |
let mut lm_config = lm::Config::asr_v0_1_1b();
|
| 70 |
+
// Keep default vocab sizes (48001/48000) as they match the actual model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
// Store vocab size before moving lm_config
|
| 73 |
let vocab_size = lm_config.text_out_vocab_size;
|
|
|
|
| 75 |
let lm_model = lm::load_lm_model(lm_config, model_path, dtype, device)?;
|
| 76 |
info!("STT transformer loaded successfully");
|
| 77 |
|
| 78 |
+
// Use default multistream config (what moshi-backend uses)
|
| 79 |
+
// This should work with 1B model's 48000 vocab since text_start_token: 32000 < 48000
|
| 80 |
+
let state_config = lm_generate_multistream::Config::v0_1();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
info!("Using default moshi config with text_start_token: {}, model vocab_size: {}",
|
| 83 |
state_config.text_start_token, vocab_size);
|
| 84 |
|
| 85 |
// Create logits processors (required for State::new)
|