# ============================================================
# Stage 1: Builder — compiles C++ AVX2 engine with batch prefill
# ============================================================
FROM ubuntu:22.04 AS builder

# Compiler toolchain for the engine build. --no-install-recommends keeps
# optional extras out of the layer, and the apt package lists are removed in
# the same RUN so they are never stored in it (same hygiene as the runtime
# stage's apt step).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build
COPY inference.cpp ./

# Compile the single-file engine into ./inference. Flag rationale:
#   -O3               aggressive optimization level
#   -mavx2 -mfma      emit AVX2 + FMA instructions (dot products, matmul);
#                     the resulting binary needs a CPU that supports both
#   -fopenmp          enable OpenMP pragmas and link the OpenMP runtime
#   -ffast-math       relaxed floating-point semantics for speed
#   -funroll-loops    unroll the inner matmul loops
#   -flto             link-time optimization (intended to let the serial
#                     matmul helper be inlined into the OpenMP regions)
#   -fno-math-errno   skip errno bookkeeping in math calls; already implied
#                     by -ffast-math, kept explicit for clarity
# The trailing echo/ls only print a confirmation and the binary size to the
# build log.
RUN g++ \
        -O3 \
        -mavx2 \
        -mfma \
        -fopenmp \
        -ffast-math \
        -funroll-loops \
        -flto \
        -fno-math-errno \
        -std=c++17 \
        -o inference \
        inference.cpp \
        -lm \
    && echo "✅ inference binary compiled" \
    && ls -lh inference

# ============================================================
# Stage 2: Production runtime
# ============================================================
FROM python:3.11-slim

# PYTHONUNBUFFERED=1 makes Python write stdout/stderr without buffering, so
# log lines show up immediately in the container logs.
# HF_REPO_ID names the Hugging Face repository (presumably read by main.py;
# confirm against the application code).
ENV PYTHONUNBUFFERED=1 \
    HF_REPO_ID=NOT-OMEGA/NanoMind

# Concurrency tuning for 2-vCPU HF Spaces hardware:
#   N_ENGINES=3        three engine instances, so up to three requests can be
#                      served concurrently without waiting in a queue
#   OMP_NUM_THREADS=1  one OpenMP thread per engine, so the engines do not
#                      contend with each other for CPU threads
ENV N_ENGINES=3 \
    OMP_NUM_THREADS=1

# OS-level runtime dependencies, sorted alphabetically:
#   curl        used by the HEALTHCHECK probe
#   libgomp1    OpenMP runtime for the engine built with -fopenmp
#   libstdc++6  C++ standard library runtime for the compiled engine
# The apt lists are deleted in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libgomp1 \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/*

# Create the unprivileged runtime user (UID 1000) and the application
# directory up front, and hand over only the still-empty /app directory.
# Each COPY below then assigns ownership itself via --chown. A recursive
# `chown -R /app` after the copies would rewrite every file into a new
# layer, storing the bundled model weights in the image twice.
RUN useradd -m -u 1000 appuser \
    && mkdir -p /app \
    && chown appuser:appuser /app

WORKDIR /app

# Dependency manifest first, so the pip layer stays cached until
# requirements.txt itself changes.
COPY --chown=appuser:appuser requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Compiled binary from the builder stage; the chmod only touches this one
# file, not the model weights.
COPY --from=builder --chown=appuser:appuser /build/inference .
RUN chmod +x inference

# Application files
COPY --chown=appuser:appuser main.py index.html ./

# Model weights (bundled — avoids HF download delay on cold start)
COPY --chown=appuser:appuser model.bin tokenizer.bin ./

# Drop root for everything that follows, including the container process.
USER appuser

# Port the health probe below targets. EXPOSE is documentation only; it does
# not publish the port by itself.
EXPOSE 7860

# Liveness probe: request /health every 30s, allowing 10s per attempt. The
# container is marked unhealthy after 3 consecutive failures; failures during
# the first 90s after start do not count. `curl -f` turns HTTP error statuses
# into a non-zero exit.
HEALTHCHECK \
    --interval=30s \
    --timeout=10s \
    --start-period=90s \
    --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Exec form: python runs as PID 1 and receives stop signals directly.
CMD ["python", "main.py"]