# Multi-stage Docker build following the HuggingFace Text-Embeddings-Inference pattern
# This approach solves the CUDA build-time compilation issue

# Build argument for CUDA compute capability (T4 = 75)
ARG CUDA_COMPUTE_CAP=75
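
# To target a different GPU, override the build arg (image tag below is
# hypothetical; A10G/Ampere is compute capability 86, A100 is 80):
#   docker build --build-arg CUDA_COMPUTE_CAP=86 -t kyutai-stt .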

FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    build-essential \
    pkg-config \
    libssl-dev \
    libsndfile1-dev \
    libasound2-dev \
    wget \
    ca-certificates \
    git \
    cmake \
    libprotobuf-dev \
    protobuf-compiler \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Rust
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
ENV PATH=/usr/local/cargo/bin:$PATH
ENV RUST_VERSION=stable
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain $RUST_VERSION
RUN chmod -R a+w $RUSTUP_HOME $CARGO_HOME

# Install cargo-chef for dependency caching
RUN cargo install cargo-chef
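
# Note: cargo-chef splits the build in two. `cargo chef prepare` (in the planner
# stage below) writes a recipe.json describing only the dependency graph, and
# `cargo chef cook` compiles those dependencies into their own cacheable layer,
# so later source-only changes do not recompile every dependency.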

WORKDIR /app

# Dependency planning stage
FROM base-builder AS planner
COPY Cargo.toml ./
COPY src/ ./src/
RUN cargo chef prepare --recipe-path recipe.json

# Dependency building stage
FROM base-builder AS dependencies
ARG CUDA_COMPUTE_CAP
ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP}
COPY --from=planner /app/recipe.json recipe.json

# Set build optimizations to prevent hangs
ENV CARGO_NET_RETRY=10
ENV CARGO_HTTP_TIMEOUT=300
ENV CARGO_HTTP_LOW_SPEED_LIMIT=10
ENV CARGO_BUILD_JOBS=4
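
# Note: CARGO_HTTP_TIMEOUT is in seconds, and CARGO_HTTP_LOW_SPEED_LIMIT is a
# minimum transfer rate in bytes/sec below which cargo gives up on the request,
# so stalled registry downloads fail and retry instead of hanging the build.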

# Build dependencies with proper CUDA feature selection
# Cache bust: Force rebuild with Moshi dual-stream fix - v1.4.38
RUN if [ "$CUDA_COMPUTE_CAP" = "75" ]; then \
        cargo chef cook --release --features cuda-turing --recipe-path recipe.json; \
    else \
        cargo chef cook --release --features cuda --recipe-path recipe.json; \
    fi
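
# Note: `cuda-turing` and `cuda` are assumed to be feature flags defined in this
# project's Cargo.toml (e.g. selecting sm_75 kernels vs newer architectures);
# adjust the names here if the crate defines its features differently.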

# Main build stage
FROM dependencies AS builder
ARG CUDA_COMPUTE_CAP
ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP}

# Copy source code
COPY Cargo.toml ./
COPY src/ ./src/
COPY configs/ ./configs/

# Keep same build optimizations for main build
ENV CARGO_NET_RETRY=10
ENV CARGO_HTTP_TIMEOUT=300
ENV CARGO_HTTP_LOW_SPEED_LIMIT=10
ENV CARGO_BUILD_JOBS=4

# Build application with appropriate CUDA features
RUN if [ "$CUDA_COMPUTE_CAP" = "75" ]; then \
        cargo build --release --features cuda-turing; \
    else \
        cargo build --release --features cuda; \
    fi
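
# Because this stage extends `dependencies`, the cooked dependency layer above
# is reused and only the application crate itself is compiled here.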

# Model download stage - pre-load models into the image
FROM base-builder AS model-downloader

# Install Python and pip for model downloading
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*

# Install huggingface-hub for model downloading
RUN pip3 install --no-cache-dir huggingface-hub

# Set working directory for models first
WORKDIR /app/models

# Create models directory for Moshiko Q8 model (A10G GPU compatible)
RUN mkdir -p kyutai/moshiko-candle-q8

# Create download script for the Moshiko Q8 model
RUN echo 'from huggingface_hub import hf_hub_download\n\
import os\n\
import subprocess\n\
\n\
os.makedirs("kyutai/moshiko-candle-q8", exist_ok=True)\n\
print("📥 Downloading Moshiko Q8 model (8.17GB, A10G optimized)...")\n\
\n\
print("⬇️ Downloading model.q8.gguf (8.17GB)...")\n\
hf_hub_download(\n\
    repo_id="kyutai/moshiko-candle-q8",\n\
    filename="model.q8.gguf",\n\
    local_dir="kyutai/moshiko-candle-q8",\n\
    local_dir_use_symlinks=False\n\
)\n\
\n\
print("⬇️ Downloading tokenizer (32k vocab)...")\n\
hf_hub_download(\n\
    repo_id="kyutai/moshiko-candle-q8",\n\
    filename="tokenizer_spm_32k_3.model",\n\
    local_dir="kyutai/moshiko-candle-q8",\n\
    local_dir_use_symlinks=False\n\
)\n\
\n\
print("⬇️ Downloading audio tokenizer...")\n\
hf_hub_download(\n\
    repo_id="kyutai/moshiko-candle-q8",\n\
    filename="tokenizer-e351c8d8-checkpoint125.safetensors",\n\
    local_dir="kyutai/moshiko-candle-q8",\n\
    local_dir_use_symlinks=False\n\
)\n\
\n\
print("✅ All models downloaded successfully!")\n\
result = subprocess.run(["du", "-sh", "kyutai/"], capture_output=True, text=True)\n\
print("📁 Model files:", result.stdout)\n\
' > download_models.py

# Download models during build time
RUN python3 download_models.py
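
# Note: baking ~8GB of weights into the image trades a much larger image for
# network-free cold starts. The resulting layer size can be checked with, e.g.:
#   docker history <image-tag>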

# Runtime stage
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS runtime

# Install Python and runtime dependencies, including debugging tools
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    wget \
    curl \
    git \
    sudo \
    vim \
    strace \
    lsof \
    net-tools \
    procps \
    iproute2 \
    gdb \
    htop \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (as root)
RUN pip3 install gradio fastapi uvicorn websockets python-multipart
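
# Note: these packages are unpinned, so rebuilds may pull incompatible versions;
# pinning is safer for reproducibility, e.g. (versions hypothetical):
#   pip3 install "gradio==4.44.*" "fastapi==0.111.*"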

# Set working directory
WORKDIR /app

# Copy built application
COPY --from=builder /app/target/release/kyutai-stt-server .
COPY --from=builder /app/configs/ ./configs/

# Copy pre-downloaded models from model-downloader stage
COPY --from=model-downloader /app/models/ ./models/

# Copy Python app
COPY app.py .

# Create user for HuggingFace Spaces
RUN useradd -m -u 1000 user
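
# HuggingFace Spaces runs the container as a non-root user with UID 1000, which
# is why the user above is created with `-u 1000` and must own the app files.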

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    HF_HOME=/home/user/.cache/huggingface

# Create and set permissions for HuggingFace cache directory
RUN mkdir -p /home/user/.cache/huggingface && \
    chown -R user:user /home/user/.cache

# Set proper permissions and ownership
RUN chown -R user:user /app
USER user

# Expose port
EXPOSE 7860
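
# HuggingFace Spaces routes traffic to port 7860 by default (configurable via
# `app_port` in the Space's README metadata). For a local smoke test (image tag
# hypothetical):
#   docker run --gpus all -p 7860:7860 kyutai-stt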

# Create startup script
RUN echo '#!/bin/bash\n\
echo "🚀 Starting Kyutai STT Server v1.6.11 - Q8 GGUF MODEL (A10G)"\n\
echo "📦 Pre-loaded Q8 models:"\n\
ls -lah models/kyutai/moshiko-candle-q8/ || echo "No pre-loaded models found"\n\
echo "📊 Model sizes:"\n\
du -sh models/kyutai/ 2>/dev/null || echo "Models directory not found"\n\
echo "GPU Info:"\n\
nvidia-smi || echo "No GPU detected at runtime"\n\
echo "Starting Python frontend with integrated Rust server..."\n\
python3 app.py\n\
' > /app/start.sh && chmod +x /app/start.sh

# Run the combined server
CMD ["/app/start.sh"]