# NanoMind / Dockerfile
# NOT-OMEGA — "Update Dockerfile" (revision 048b86f, verified)
# ============================================================
# Stage 1: Builder — compiles C++ AVX2 engine with batch prefill
# ============================================================
# Builder base: Ubuntu 22.04 plus the C++ toolchain needed to compile
# inference.cpp. This stage is named "builder" so a later stage can pull
# artifacts out of /build with COPY --from=builder.
FROM ubuntu:22.04 AS builder
# update + install + index cleanup happen in ONE layer so a stale package index
# is never cached on its own. --no-install-recommends keeps optional extras out
# of the toolchain install; g++ still pulls in the compiler, linker and libc
# headers as hard dependencies.
#   g++      — C++ compiler driver
#   libgomp1 — GNU OpenMP runtime (needed when building with -fopenmp)
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++ \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*
# All build work happens in /build (created automatically by WORKDIR).
WORKDIR /build
# Only the single translation unit is copied, so unrelated context changes
# do not invalidate the compile layer.
COPY inference.cpp .
# Compile the inference engine into /build/inference. Flags, grouped by purpose:
#   -std=c++17        language standard
#   -O3               aggressive optimization level
#   -ffast-math       relaxed floating-point semantics for speed
#   -fno-math-errno   skip errno checks in math calls (already implied by
#                     -ffast-math; kept explicit)
#   -funroll-loops    unroll loops (aimed at the inner matmul loops)
#   -flto             link-time optimization; per the original author's note this
#                     lets matmul_vec_serial be inlined into the OpenMP regions
#   -mavx2 -mfma      emit AVX2 + FMA instructions (dot products, matmul); the
#                     resulting binary requires a CPU that supports both
#   -fopenmp          enable OpenMP pragmas and link the OpenMP runtime
#   -lm               link the math library
# The trailing echo/ls print a confirmation and the binary size in the build log.
RUN g++ -std=c++17 \
        -O3 -ffast-math -fno-math-errno -funroll-loops -flto \
        -mavx2 -mfma \
        -fopenmp \
        inference.cpp -o inference -lm \
    && echo "✅ inference binary compiled" \
    && ls -lh inference
# ============================================================
# Stage 2: Production runtime
# ============================================================
# Runtime base: slim Python 3.11 image (final stage — this is what ships).
FROM python:3.11-slim
# PYTHONUNBUFFERED=1 makes Python write stdout/stderr without buffering, so
# log lines appear immediately in the container log.
# HF_REPO_ID is exported to the application; how main.py consumes it is not
# visible in this file.
ENV PYTHONUNBUFFERED=1 \
    HF_REPO_ID=NOT-OMEGA/NanoMind
# Concurrency sizing (author's rationale, targeting 2-vCPU HF Spaces):
#   N_ENGINES=3        three engines serve three concurrent requests with no
#                      queue wait
#   OMP_NUM_THREADS=1  one OpenMP thread per engine, so the engines do not
#                      contend with each other for threads
# 3 engines × 1 OMP thread is stated to give the best CPU utilization there.
ENV N_ENGINES=3 \
    OMP_NUM_THREADS=1
# Runtime OS packages, installed and cleaned up within a single layer
# (packages listed alphabetically for easier diffs):
#   curl       — used by the container HEALTHCHECK probe
#   libgomp1   — OpenMP runtime needed by the -fopenmp-built inference binary
#   libstdc++6 — C++ standard library runtime for the inference binary
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        curl \
        libgomp1 \
        libstdc++6 \
 && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Create the unprivileged runtime user up front and hand it the (still empty)
# /app directory. File ownership is then set at copy time with COPY --chown
# instead of a trailing `chown -R /app`, which would rewrite every file —
# including the bundled model weights — into an additional image layer.
RUN useradd -m -u 1000 appuser && \
    chown appuser:appuser /app
# Python dependencies: copied on their own so this layer is rebuilt only when
# requirements.txt changes. pip runs as root (system site-packages).
COPY --chown=appuser:appuser requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy compiled binary from builder and make sure it is executable
COPY --from=builder --chown=appuser:appuser /build/inference .
RUN chmod +x inference
# Model weights (bundled — avoids HF download delay on cold start).
# Copied before the application files so that editing main.py/index.html does
# not invalidate this large layer.
COPY --chown=appuser:appuser model.bin tokenizer.bin ./
# Application files (most frequently changing, therefore last)
COPY --chown=appuser:appuser main.py index.html ./
# Drop root for everything that follows, including the running container.
USER appuser
# Documents the port the service is probed on below. EXPOSE is metadata only;
# it does not publish the port.
EXPOSE 7860
# Health probe: after a 90s start-up grace period, request /health every 30s
# with a 10s timeout; three consecutive failures mark the container unhealthy.
# curl -f turns HTTP error statuses into a non-zero exit code.
HEALTHCHECK --start-period=90s --interval=30s --timeout=10s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1
# Exec-form CMD: python runs directly, without a wrapping shell.
CMD ["python", "main.py"]