# 1. Base image: NVIDIA CUDA 12.5.1 "devel" flavour on Ubuntu 22.04.
#    Python itself is installed via apt in step 2 (Ubuntu 22.04's default
#    python3 package is Python 3.10).
# NOTE(review): the toolkit here is CUDA 12.5.1 while the PyTorch wheels
# installed in step 6 come from the cu124 index -- confirm this minor-version
# difference is intended for the extensions compiled from source in step 8.
FROM nvidia/cuda:12.5.1-devel-ubuntu22.04

# Prevent interactive prompts from apt/debconf while the image is built.
# Declared as ARG rather than ENV: the value is still exported to every RUN
# step of this stage, but it is not persisted into the runtime environment of
# the final image.
ARG DEBIAN_FRONTEND=noninteractive

# 2. System packages: Python (interpreter, venv module, headers, pip),
#    git/wget, ninja-build, and the shared libraries listed below.
#    The ubuntu-toolchain-r/test PPA is registered before the main install, so
#    packages such as libstdc++6 may be resolved from it.
#    The apt package lists are removed in the same layer that created them.
RUN apt-get update \
 && apt-get install --yes software-properties-common \
 && add-apt-repository --yes ppa:ubuntu-toolchain-r/test \
 && apt-get update \
 && apt-get install --yes \
        git \
        libgl1 \
        libglib2.0-0 \
        libstdc++6 \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
        python3-venv \
        wget \
 && rm -rf /var/lib/apt/lists/*

# 3. Add an unprivileged account named "user" (UID 1000, home directory
#    created) and run every following instruction as that account.
RUN useradd --create-home --uid 1000 user
USER user

# 4. Environment shared by the build steps below and the running container.
#    - PATH puts the virtualenv's bin directory first, then the CUDA tools.
#    - TORCH_CUDA_ARCH_LIST="8.9" targets compute capability 8.9 (L40S).
#    - MAX_JOBS=2 caps parallel compile jobs so the build server doesn't run
#      out of RAM.
ENV HOME=/home/user \
    CUDA_HOME=/usr/local/cuda \
    PATH=/home/user/venv/bin:/usr/local/cuda/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    TORCH_CUDA_ARCH_LIST="8.9" \
    MAX_JOBS=2 \
    PYTHONWARNINGS="ignore::SyntaxWarning"

# Relative paths in the instructions that follow resolve against this directory.
WORKDIR ${HOME}/app

# 5. Create the virtual environment at $HOME/venv with the system python3.
#    Its bin directory is already first on PATH, so later `pip` and `python`
#    calls resolve to it.
RUN python3 -m venv "${HOME}/venv"

# 6. Python base stack, installed into the virtualenv:
#    - build tooling (pip, wheel, setuptools, ninja, packaging);
#    - PyTorch 2.6.0 together with the matching torchvision 0.21.0 and
#      torchaudio 2.6.0 releases from the cu124 wheel index. All three are
#      pinned explicitly so a rebuild never depends on what the resolver
#      happens to pick;
#    - the Hugging Face `spaces` package;
#    - the prebuilt FlashAttention-2 v2.7.4.post1 wheel whose tags match this
#      stack (cu12, torch 2.6, cxx11abi FALSE, CPython 3.10, linux x86_64).
RUN pip install --no-cache-dir --upgrade pip wheel setuptools ninja packaging && \
    pip install --no-cache-dir \
        torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 \
        --index-url https://download.pytorch.org/whl/cu124 && \
    pip install --no-cache-dir spaces && \
    pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

# 7. Install the remaining Python requirements.
#    Lines mentioning packages that this file handles itself are stripped
#    first, so pip does not pull prebuilt wheels for them: cumesh, o-voxel,
#    flex-gemm, nvdiffrast and nvdiffrec-render are compiled from source in
#    step 8. flash_attn_3 is stripped as well (a FlashAttention-2 wheel is
#    installed in step 6).
COPY --chown=user requirements_th26_cu124.txt ./requirements.txt
RUN sed -i \
        -e '/cumesh/d' \
        -e '/o-voxel/d' \
        -e '/flex-gemm/d' \
        -e '/nvdiffrast/d' \
        -e '/nvdiffrec-render/d' \
        -e '/flash_attn_3/d' \
        requirements.txt \
 && pip install --no-cache-dir -r requirements.txt

# 8. Compile the 3D CUDA extensions from source for Compute 8.9 (L40S)
# By splitting these, Docker will cache each successful build.

# Build CuMesh from source, using the packages already installed in the
# virtualenv (--no-build-isolation).
# Only the latest commit is fetched (--depth 1) because the checkout is
# deleted again at the end of this same layer.
# NOTE(review): submodules are not fetched -- confirm the repository builds
# without them.
RUN git clone --depth 1 https://github.com/JeffreyXiang/CuMesh.git && \
    pip install --no-cache-dir --no-build-isolation ./CuMesh && \
    rm -rf CuMesh

# Build FlexGEMM from source, using the packages already installed in the
# virtualenv (--no-build-isolation).
# Only the latest commit is fetched (--depth 1) because the checkout is
# deleted again at the end of this same layer.
# NOTE(review): submodules are not fetched -- confirm the repository builds
# without them.
RUN git clone --depth 1 https://github.com/JeffreyXiang/FlexGEMM.git && \
    pip install --no-cache-dir --no-build-isolation ./FlexGEMM && \
    rm -rf FlexGEMM

# Build o-voxel, which lives in the o-voxel/ subdirectory of the TRELLIS.2
# repository. Eigen (PX4/eigen) is cloned by hand into o-voxel/third_party/eigen
# before the build.
# Both clones are shallow (--depth 1): only the current tree is needed and the
# whole checkout is deleted at the end of this same layer.
RUN git clone --depth 1 https://github.com/microsoft/TRELLIS.2.git && \
    mkdir -p TRELLIS.2/o-voxel/third_party && \
    git clone --depth 1 https://github.com/PX4/eigen.git TRELLIS.2/o-voxel/third_party/eigen && \
    pip install --no-cache-dir --no-build-isolation ./TRELLIS.2/o-voxel && \
    rm -rf TRELLIS.2

# Build nvdiffrast from source, using the packages already installed in the
# virtualenv (--no-build-isolation).
# Only the latest commit is fetched (--depth 1) because the checkout is
# deleted again at the end of this same layer.
RUN git clone --depth 1 https://github.com/NVlabs/nvdiffrast.git && \
    pip install --no-cache-dir --no-build-isolation ./nvdiffrast && \
    rm -rf nvdiffrast

# Build nvdiffrec-render from the `renderutils` ref of the nvdiffrec fork.
# The ref is selected at clone time (--branch) so that a shallow, single-ref
# clone (--depth 1) is enough; the checkout is deleted at the end of this
# same layer.
RUN git clone --depth 1 --branch renderutils https://github.com/JeffreyXiang/nvdiffrec.git && \
    pip install --no-cache-dir --no-build-isolation ./nvdiffrec && \
    rm -rf nvdiffrec

# 9. Copy application files
# Copies the whole build context into the working directory, owned by "user".
# This comes after the dependency and extension layers, so a source-only
# change does not invalidate them.
# NOTE(review): confirm a .dockerignore keeps .git, caches and secrets out of
# the build context.
COPY --chown=user . .

# 10. Runtime settings: advertise port 7860, point Gradio at that port on all
#     interfaces, and turn on expandable segments via PYTORCH_CUDA_ALLOC_CONF.
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0" \
    GRADIO_SERVER_PORT="7860" \
    PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

# Run the app
# Exec (JSON-array) form, so no shell wrapper is involved. `python` is looked
# up on PATH, where the virtualenv's bin directory comes first; app.py is
# resolved relative to the working directory.
CMD ["python", "app.py"]