# Image for a uvicorn-served Python app (app:app) listening on port 7860,
# with a prebuilt CPU wheel of llama-cpp-python.
FROM python:3.10-slim

# Install curl. --no-install-recommends keeps the layer small, so
# ca-certificates is listed explicitly to keep HTTPS working for curl.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Create an unprivileged user (UID 1000, the UID Hugging Face Spaces run
# containers as) and an application directory that this user owns.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /code \
    && chown user:user /code

WORKDIR /code

# Limit OpenMP threads to match the HF container's CPU quota (2 vCPUs)
# to prevent severe thread thrashing and context switching slowdowns.
ENV OMP_NUM_THREADS=2 \
    OMP_PROC_BIND=spread \
    OMP_PLACES=cores

# Upgrade pip to ensure the latest wheel tags are supported
RUN pip install --no-cache-dir --upgrade pip

# Install prebuilt CPU wheel for llama-cpp-python
# (locked to 0.3.30 to match the prebuilt wheel index)
RUN pip install --no-cache-dir llama-cpp-python==0.3.30 \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# Copy requirements first so the dependency layer stays cached until
# requirements.txt itself changes.
COPY requirements.txt /code/requirements.txt

# Install the remaining dependencies. --upgrade is intentionally not used:
# the image is built from scratch, and --upgrade could replace the pinned
# llama-cpp-python wheel above with a newer PyPI release if requirements.txt
# lists it without a version.
# NOTE(review): confirm against the contents of requirements.txt.
RUN pip install --no-cache-dir -r /code/requirements.txt

# Copy application files, owned by the unprivileged user
COPY --chown=user:user . /code

# Drop root for runtime; all steps that need root are done above.
USER user
ENV HOME=/home/user

# Expose HF Space port (documentation only; does not publish the port)
EXPOSE 7860

# Start server (exec form so uvicorn runs as PID 1 and receives SIGTERM)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]