# stt-gpu-service-v3 / Dockerfile (author: pgits)
# Fix STT model file paths - correct tokenizer filenames
# commit f353b48
# Multi-stage Docker build following HuggingFace Text-Embeddings-Inference pattern
# This approach solves the CUDA build-time compilation issue

# Build argument for CUDA compute capability (T4 = 75)
ARG CUDA_COMPUTE_CAP=75

FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder

# Build-time system dependencies: compiler toolchain, TLS/audio headers, and
# protobuf for codegen. --no-install-recommends keeps the layer lean, packages
# are sorted for diffability, and the apt list cleanup happens in the same
# layer so the cache never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    git \
    libasound2-dev \
    libprotobuf-dev \
    libsndfile1-dev \
    libssl-dev \
    pkg-config \
    protobuf-compiler \
    python3-dev \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install Rust via rustup.
# NOTE(review): RUST_VERSION=stable is a moving target; pin a specific
# toolchain version (e.g. 1.79.0) for reproducible builds.
ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH \
    RUST_VERSION=stable
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain $RUST_VERSION
RUN chmod -R a+w $RUSTUP_HOME $CARGO_HOME

# cargo-chef enables dependency-layer caching across source changes; --locked
# installs it from its committed lockfile for a reproducible toolchain.
RUN cargo install --locked cargo-chef

WORKDIR /app
# Dependency planning stage: cargo-chef inspects the manifest and source tree
# and emits recipe.json, a build plan the `dependencies` stage caches against.
FROM base-builder AS planner
COPY Cargo.toml /app/
COPY src/ /app/src/
RUN cargo chef prepare --recipe-path recipe.json
# Dependency building stage: cooks all crate dependencies from the recipe so
# this (slow) layer is reused as long as Cargo.toml is unchanged.
FROM base-builder AS dependencies
ARG CUDA_COMPUTE_CAP
ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP}
COPY --from=planner /app/recipe.json recipe.json

# Network/parallelism tuning that keeps cargo from hanging on flaky links.
ENV CARGO_NET_RETRY=10 \
    CARGO_HTTP_TIMEOUT=300 \
    CARGO_HTTP_LOW_SPEED_LIMIT=10 \
    CARGO_BUILD_JOBS=4

# Build dependencies with proper CUDA feature selection: Turing GPUs
# (compute cap 75, e.g. T4) need cuda-turing, everything else plain cuda.
# Cache bust: Force rebuild with Moshi dual-stream fix - v1.4.38
RUN if [ "$CUDA_COMPUTE_CAP" = "75" ]; then CUDA_FEATURES=cuda-turing; else CUDA_FEATURES=cuda; fi && \
    cargo chef cook --release --features "$CUDA_FEATURES" --recipe-path recipe.json
# Main build stage
FROM dependencies AS builder
# Re-declare the build arg for this stage. The matching ENV (and the CARGO_*
# tuning vars) are inherited from the `dependencies` parent stage, so the
# previous duplicate ENV declarations were redundant and have been dropped.
ARG CUDA_COMPUTE_CAP

# Copy source code and configs; only the layers below are invalidated when
# sources change — the cooked dependency cache above is preserved.
COPY Cargo.toml ./
COPY src/ ./src/
COPY configs/ ./configs/

# Build application with appropriate CUDA features (75 = Turing / T4).
RUN if [ "$CUDA_COMPUTE_CAP" = "75" ]; then \
    cargo build --release --features cuda-turing; \
    else \
    cargo build --release --features cuda; \
    fi
# Model download stage - pre-load models into the image at build time so the
# runtime container never downloads 8GB on startup.
FROM base-builder AS model-downloader

# Python + pip are needed only in this stage, to drive huggingface_hub.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): huggingface-hub is unpinned; pin a version for reproducibility.
RUN pip3 install --no-cache-dir huggingface-hub

# Set working directory for models first
WORKDIR /app/models

# Create models directory for Moshiko Q8 model (A10G GPU compatible)
RUN mkdir -p kyutai/moshiko-candle-q8

# Generate the download script with printf (one argument per output line).
# The previous `echo 'a\nb\n...'` form depended on the shell's builtin echo
# interpreting backslash escapes — true for dash but not portable POSIX
# behavior, and it silently breaks if SHELL is ever switched to bash.
# NOTE(review): local_dir_use_symlinks is deprecated in recent huggingface_hub
# releases; it is kept here to preserve behavior with the installed version.
RUN printf '%s\n' \
    'from huggingface_hub import hf_hub_download' \
    'import os' \
    'import subprocess' \
    '' \
    'os.makedirs("kyutai/moshiko-candle-q8", exist_ok=True)' \
    'print("πŸ“₯ Downloading Moshiko Q8 model (8.17GB A10G optimized)...")' \
    '' \
    'print("⬇️ Downloading model.q8.gguf (8.17GB)...")' \
    'hf_hub_download(' \
    'repo_id="kyutai/moshiko-candle-q8",' \
    'filename="model.q8.gguf",' \
    'local_dir="kyutai/moshiko-candle-q8",' \
    'local_dir_use_symlinks=False' \
    ')' \
    '' \
    'print("⬇️ Downloading tokenizer (32k vocab)...")' \
    'hf_hub_download(' \
    'repo_id="kyutai/moshiko-candle-q8",' \
    'filename="tokenizer_spm_32k_3.model",' \
    'local_dir="kyutai/moshiko-candle-q8",' \
    'local_dir_use_symlinks=False' \
    ')' \
    '' \
    'print("⬇️ Downloading audio tokenizer...")' \
    'hf_hub_download(' \
    'repo_id="kyutai/moshiko-candle-q8",' \
    'filename="tokenizer-e351c8d8-checkpoint125.safetensors",' \
    'local_dir="kyutai/moshiko-candle-q8",' \
    'local_dir_use_symlinks=False' \
    ')' \
    '' \
    'print("βœ… All models downloaded successfully!")' \
    'result = subprocess.run(["du", "-sh", "kyutai/"], capture_output=True, text=True)' \
    'print("πŸ“Š Model files:", result.stdout)' \
    > download_models.py

# Download models during build time
RUN python3 download_models.py
# Runtime stage
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS runtime

# Runtime dependencies plus debugging tools (vim/strace/gdb/htop/... are kept
# deliberately for interactive troubleshooting on the Space).
# NOTE(review): sudo in a runtime image widens the attack surface; drop it if
# root escalation inside the container is not actually needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    gdb \
    git \
    htop \
    iproute2 \
    lsof \
    net-tools \
    procps \
    python3 \
    python3-pip \
    strace \
    sudo \
    vim \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Python frontend dependencies (as root).
# NOTE(review): versions are unpinned; pin them for reproducible deploys.
RUN pip3 install --no-cache-dir gradio fastapi uvicorn websockets python-multipart

# Create the unprivileged HuggingFace Spaces user early so COPY --chown below
# can reference it.
RUN useradd -m -u 1000 user

# Set working directory
WORKDIR /app

# Copy artifacts already owned by `user`: the previous `RUN chown -R /app`
# duplicated the ~8GB model payload into an extra image layer.
COPY --from=builder --chown=user:user /app/target/release/kyutai-stt-server .
COPY --from=builder --chown=user:user /app/configs/ ./configs/
COPY --from=model-downloader --chown=user:user /app/models/ ./models/
COPY --chown=user:user app.py .

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    HF_HOME=/home/user/.cache/huggingface

# HuggingFace cache dir writable by the app user; the non-recursive chown of
# /app lets the user create new files in the workdir (its contents are already
# user-owned via COPY --chown).
RUN mkdir -p /home/user/.cache/huggingface && \
    chown -R user:user /home/user/.cache && \
    chown user:user /app

USER user

# Expose port (documentation only; Spaces routes to 7860)
EXPOSE 7860

# Cheap liveness probe; generous start period because model load is slow.
# NOTE(review): assumes app.py serves HTTP on 7860 at "/" — confirm endpoint.
HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
    CMD curl -fsS http://localhost:7860/ || exit 1

# Startup script written via printf (portable, unlike echo backslash escapes).
# `exec` replaces the shell so python3 becomes PID 1 and receives SIGTERM
# from `docker stop` instead of being killed after the grace period.
RUN printf '%s\n' \
    '#!/bin/bash' \
    'echo "πŸš€ Starting Kyutai STT Server v1.6.11 - Q8 GGUF MODEL (A10G)"' \
    'echo "πŸ“ Pre-loaded Q8 models:"' \
    'ls -lah models/kyutai/moshiko-candle-q8/ || echo "No pre-loaded models found"' \
    'echo "πŸ“Š Model sizes:"' \
    'du -sh models/kyutai/ 2>/dev/null || echo "Models directory not found"' \
    'echo "GPU Info:"' \
    'nvidia-smi || echo "No GPU detected at runtime"' \
    'echo "Starting Python frontend with integrated Rust server..."' \
    'exec python3 app.py' \
    > /app/start.sh && chmod +x /app/start.sh

# Run the combined server
CMD ["/app/start.sh"]