# Matrix Agent
# v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display
# Commit: 1b50d66
FROM python:3.10-slim

WORKDIR /app

# Build dependencies for compiling llama-cpp-python against OpenBLAS.
# --no-install-recommends keeps the layer minimal; the apt list cache is
# removed in the same layer so it never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy only the dependency manifest first so the pip layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt .

# Build llama-cpp-python with OpenBLAS for faster CPU inference.
# CMAKE_ARGS is scoped to this single RUN (instead of ENV) so the
# build-only flag does not leak into the runtime environment.
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
    pip install --no-cache-dir -r requirements.txt
# Download models - 7B (quality) and 1.5B (speed).
# -f makes curl fail the build on HTTP errors — without it a 404/500
# response body would be saved as the "model" file and the build would
# still succeed. --retry guards against transient network failures.
RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
# Copy application code and static files
COPY app.py .
COPY static/ ./static/

# Performance tuning for inference (read from the environment at runtime).
# Grouped into one instruction to avoid five separate image layers.
# NOTE(review): USE_MLOCK may be silently limited by RLIMIT_MEMLOCK for a
# non-root user — confirm llama-cpp only warns (not aborts) in that case.
ENV N_CTX=4096 \
    N_THREADS=4 \
    N_BATCH=512 \
    USE_MLOCK=true \
    USE_MMAP=true

# Drop root for runtime. No recursive chown: the default umask leaves the
# app and model files world-readable, and a chown -R over the multi-GB
# models directory would duplicate it into a new layer.
# NOTE(review): assumes app.py does not write under /app — confirm.
RUN groupadd --system app && useradd --system --gid app --home /app app
USER app

# Documentation only — publish with `docker run -p 7860:7860`
EXPOSE 7860

# Exec form: python is PID 1 and receives SIGTERM from `docker stop`
CMD ["python", "app.py"]