# syntax=docker/dockerfile:1
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04

# Silence apt prompts. NOTE(review): baking this into ENV leaks into the
# runtime environment; kept for compatibility, but prefer per-RUN
# `DEBIAN_FRONTEND=noninteractive apt-get ...` or a build ARG.
ENV DEBIAN_FRONTEND=noninteractive

# SECURITY: a token passed via --build-arg is visible in `docker history`,
# and this ENV bakes it into every layer of the final image. Prefer a BuildKit
# secret mount (`RUN --mount=type=secret,id=hf_token ...`) or inject HF_TOKEN
# at run time. Kept as-is because the app presumably reads HF_TOKEN at
# runtime — TODO confirm before removing.
ARG HF_TOKEN
ENV HF_TOKEN=$HF_TOKEN

# Deliberate hack: collapse /usr/local into /usr so anything installed later
# under /usr/local lands on the default binary/library search paths.
RUN rm -rf /usr/local/bin /usr/local/lib* || true
RUN ln -s /usr/bin /usr/local/bin \
 && ln -s /usr/lib /usr/local/lib \
 && ln -s /usr/lib /usr/local/lib64

# Single layer: `update` + `install` combined (a lone `apt-get update` layer
# caches stale package indexes — DL3009), the empty no-op
# `apt-get install -y` layer dropped, the duplicate `python3` entry removed,
# and the apt lists cleaned in the same layer that created them.
# NOTE(review): `apt-get upgrade -y` was removed (DL3005) — bump the base
# image tag/digest instead to pick up security fixes.
# NOTE(review): installing `nvidia-driver-570` inside a container is unusual
# (the host driver is what the runtime uses); kept because the original
# installed it — TODO confirm it is actually required.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --fix-missing \
        build-essential \
        cmake \
        curl \
        ffmpeg \
        git \
        git-lfs \
        golang-go \
        grep \
        libbz2-dev \
        libffi-dev \
        liblzma-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsqlite3-dev \
        libssl-dev \
        libxml2-dev \
        libxmlsec1-dev \
        nvidia-driver-570 \
        original-awk \
        python3 \
        python3-pip \
        sed \
        tk-dev \
        unzip \
        wget \
        xz-utils \
        zlib1g-dev \
        zstd \
 && rm -rf /var/lib/apt/lists/*

# WORKDIR creates /app if missing — the original follow-up `mkdir -p /app`
# was redundant and has been dropped.
WORKDIR /app
COPY --chown=1000 . /app
# World-writable so the unprivileged Space user (uid 1000) can write here.
# NOTE(review): 777 is broader than needed; targeted ownership via
# `COPY --chown` above should suffice — TODO tighten.
RUN chmod 777 /app

# Prebuilt binaries now cover all model types, so the former from-source
# ollama / llama.cpp build recipes (previously kept here as commented-out
# code) were removed.
# NOTE(review): piping an unpinned installer script is not reproducible and
# unverifiable — consider pinning an ollama release and checking its checksum.
RUN curl -fsSL https://ollama.com/install.sh | sh
# NOTE(review): the non-root user setup and the pyenv-based Python install
# below this point were disabled upstream; the image currently builds and
# runs as root. Re-introduce a USER directive once the tooling above works
# unprivileged.

WORKDIR /app

# Python dependencies. `--break-system-packages` is required on Ubuntu 24.04
# (PEP 668) to install into the system interpreter. `--no-cache-dir` added on
# every pip layer (DL3042) so the wheel cache does not bloat the image.
# NOTE(review): most version pins are loose or absent — pin exact versions
# for reproducible builds.
RUN pip install --no-cache-dir -U pip setuptools wheel \
        --break-system-packages --ignore-installed
RUN pip install --no-cache-dir \
        "APScheduler" \
        "fastapi" \
        "gguf>=0.1.0" \
        "gradio[oauth]>=6.5.1" \
        "hf-transfer" \
        "huggingface-hub" \
        "numpy~=1.26.4" \
        "protobuf>=4.21.0,<5.0.0" \
        "sentencepiece>=0.1.98,<0.3.0" \
        --break-system-packages --ignore-installed
RUN pip install --no-cache-dir "torch>=2.8.0" \
        --break-system-packages --ignore-installed
# NOTE(review): unpinned git HEAD — pin a commit SHA for reproducibility.
RUN pip install --no-cache-dir \
        git+https://github.com/huggingface/transformers.git \
        --break-system-packages --ignore-installed

# Fetch prebuilt llama.cpp binaries plus the HF->GGUF converter script.
# NOTE(review): `cp /tmp/llama/convert*` assumes exactly one file matches the
# glob — `cp` fails if several do. TODO confirm the repo layout.
RUN mkdir -p /tmp/llama \
 && hf download lainlives/llama.cpp --local-dir /tmp/llama \
 && chmod +x /tmp/llama/* \
 && cp /tmp/llama/convert* /app/convert_hf_to_gguf.py \
 && mv /tmp/llama/* /usr/bin/

# Runtime configuration. The original wrote PYTHONPATH=${HOME}/app, but HOME
# is not a declared ENV at build time, so it silently expanded to the empty
# string and produced `/app` — now written explicitly.
ENV PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_ANALYTICS_ENABLED=False \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    PATH=/usr/local/nvidia/bin:${PATH}

# Documentation only — the Gradio port still has to be published at run time.
EXPOSE 7860

RUN cp /app/start.sh /usr/bin/start_space && chmod +x /usr/bin/start_space

# Exec form so python3 runs as PID 1 and receives SIGTERM from `docker stop`;
# the original shell form wrapped it in `/bin/sh -c`, which swallows signals.
ENTRYPOINT ["python3", "/app/app.py"]