Spaces:

cng420
/

embedding

Running

embedding / Dockerfile

Ryan Ballantyne

Initial sidecar deploy

983d8eb 23 days ago

1.86 kB

	# Embeddings sidecar — FastAPI + fastembed.
	# Builds a small image that exposes /embed/dense, /embed/colbert,
	# /embed/colbert/query, /health.
	#
	# Designed to run anywhere a Dockerfile is accepted:
	# - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
	# - Fly.io — `fly launch` then `fly deploy`
	# - Railway / Render / Koyeb — auto-detects Dockerfile
	#
	# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
	# defaults to 7860 (HF Spaces convention).

	FROM python:3.11-slim

	# Interpreter and pip behaviour: unbuffered output so logs appear
	# immediately, no pip version-check notice, no pip download cache.
	ENV PYTHONUNBUFFERED=1 \
	    PIP_DISABLE_PIP_VERSION_CHECK=1 \
	    PIP_NO_CACHE_DIR=1

	# Model caches are pointed under /tmp so they land in a writable location.
	# Note: these paths are fixed here; HF Spaces persistent storage (/data,
	# paid tiers) is not selected automatically by this file.
	ENV FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
	    HF_HOME=/tmp/huggingface

	# Default listening port; platforms that inject $PORT at runtime override it.
	ENV PORT=7860

	WORKDIR /app

	# System packages: a compiler toolchain (build-essential) and the GNU OpenMP
	# runtime (libgomp1), needed by some onnxruntime / tokenizers wheels.
	# The apt package lists are removed in the same layer to keep it small.
	RUN apt-get update && \
	    apt-get install --yes --no-install-recommends \
	        build-essential \
	        libgomp1 && \
	    rm -rf /var/lib/apt/lists/*

	# The dependency manifest is copied on its own so the install layer below
	# is rebuilt only when requirements.txt changes, not on every main.py edit.
	COPY requirements.txt ./
	RUN pip install --requirement requirements.txt

	# Application module (started as main:app by the container command).
	COPY main.py ./

	# Best-effort pre-warm of the model cache at build time so the first request
	# is fast. Both fastembed model classes are instantiated once; this is
	# expected to fetch the weights into the configured cache directory.
	#
	# The step must never fail the build: if the download cannot happen here
	# (e.g. a gated model needs HF_TOKEN, which is only supplied as a runtime
	# secret), the `||` fallback logs a notice and the first request warms the
	# cache instead. The operator has to be a real shell `||`; an escaped `\|\|`
	# would be passed to python as a literal argument and the fallback would
	# never run.
	RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
	    TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
	    LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
	    || echo "Model pre-warm skipped — will download on first request."

	# Documents the default port only; the port actually bound follows $PORT.
	EXPOSE 7860

	# Exec-form CMD performs no variable expansion, so a shell is used to
	# interpolate $PORT (falling back to 7860 when it is unset or empty).
	# `exec` replaces that shell with uvicorn, making the server the container's
	# main process so it receives SIGTERM directly and can shut down cleanly.
	CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]