document-qa-dev / document_qa /deployment /_embeddings_app.py
lfoppiano's picture
Upload folder using huggingface_hub
6f06d5d verified
Raw
History Blame Contribute Delete
4.67 kB
"""Shared building blocks for the Modal embedding endpoints.
``modal_embeddings_en.py`` and ``modal_embeddings_multilang.py`` each define a tiny
``EmbeddingModel`` class at module scope (Modal requires globally-defined classes
with stacked ``@app.cls`` / ``@modal.concurrent`` decorators) that delegates to the
helpers here. All the heavy lifting — the container image, model loading, pooling,
and the embedding request handler — lives in this module so it is written once.
The endpoint contract (consumed by ``document_qa.custom_embeddings.ModalEmbeddings``):
- **Method**: ``POST``
- **Auth**: ``x-api-key`` header, compared against the ``API_KEY`` secret.
- **Body**: form field ``text`` containing newline-separated strings.
- **Response**: JSON list of L2-normalised embedding vectors, one per non-blank input line (blank or whitespace-only lines are skipped).
"""
import hmac
import os

import modal
import torch
import torch.nn.functional as F
from fastapi import HTTPException, Request
from torch import Tensor
# Number of seconds in a minute, so durations below can be written as ``n * MINUTES``.
MINUTES = 60  # seconds
# GPU count interpolated into the ``gpu`` spec string built in ``cls_kwargs``.
N_GPU = 1

# Shared container image for every embedding model.
# Build order matters for layer caching: the pip layer comes first, the
# environment variable next, and the local source is added last.
# NOTE(review): this module imports ``torch`` at the top level, but ``torch`` is
# not listed in ``pip_install`` below; presumably it is pulled in as a
# dependency of one of the listed packages — confirm.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        "fastapi[standard]",
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
    # Modal 1.0 no longer auto-mounts imported local modules; the wrapper scripts
    # import this module by name, so it must be added explicitly. Kept last so it
    # doesn't invalidate the (expensive) pip layer above on every code edit.
    .add_local_python_source("_embeddings_app")
)

# Named Modal volumes, created on first use; ``cls_kwargs`` mounts them at the
# Hugging Face and vLLM cache directories.
# NOTE(review): vLLM is not among the packages installed in ``image``; the vLLM
# cache volume may be a leftover — confirm before removing.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
def cls_kwargs() -> dict:
    """Build the keyword arguments every embedding endpoint passes to ``@app.cls``.

    Returns:
        dict: The shared image, the GPU spec, the idle scale-down window, the
        cache volume mounts, and the secret list.
    """
    # Cache directories inside the container, each backed by a named volume.
    cache_mounts = {
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    }
    # Secret attached to the container; ``run_embed`` reads ``API_KEY`` from the environment.
    endpoint_secret = modal.Secret.from_name("document-qa-embedding-key")

    return {
        "image": image,
        "gpu": f"L40S:{N_GPU}",
        # How long a container stays up with no requests before scaling down.
        "scaledown_window": 3 * MINUTES,
        "volumes": cache_mounts,
        "secrets": [endpoint_secret],
    }
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings, ignoring padding positions.

    Args:
        last_hidden_states: Token embeddings, expected as
            ``(batch, seq_len, hidden)``; the pooling sums over dim 1.
        attention_mask: Mask with one entry per token, expected as
            ``(batch, seq_len)``; non-zero marks a token to keep.

    Returns:
        Tensor: One pooled vector per row. A row whose mask is entirely zero
        yields a zero vector rather than NaN.
    """
    keep = attention_mask[..., None].bool()
    # Zero out masked positions so they contribute nothing to the sum.
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Clamp the token count to at least 1: a fully masked row would otherwise
    # compute 0 / 0 and propagate NaN into the response.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)[..., None]
    return summed / token_counts
def load_embedding_model(model_name: str, model_revision: str):
    """Load the tokenizer and model for ``model_name`` and move the model to a device.

    CUDA is selected when ``torch.cuda.is_available()`` reports it; otherwise
    the CPU is used.

    Args:
        model_name: Model identifier handed to ``from_pretrained``.
        model_revision: Revision handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        tuple: ``(tokenizer, model, device)`` with ``model`` already in eval mode.
    """
    # Deferred import: transformers is only available inside the Modal image.
    from transformers import AutoModel, AutoTokenizer

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Loading {model_name} on {device}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=model_revision)
    model = AutoModel.from_pretrained(model_name, revision=model_revision)
    model = model.to(device)
    model.eval()

    print("Model loaded successfully.")
    return tokenizer, model, device
def run_embed(tokenizer, model, device, request: Request, text: str):
    """Authenticate, embed newline-separated ``text``, and return normalised vectors.

    Args:
        tokenizer: Tokenizer used to encode the lines, such as the one returned
            by ``load_embedding_model``.
        model: Model whose ``last_hidden_state`` output is mean-pooled.
        device: Torch device the tokenised inputs are moved to before the
            forward pass.
        request: Incoming request; only its ``x-api-key`` header is read.
        text: Newline-separated strings. Blank or whitespace-only lines are
            skipped.

    Returns:
        list: One L2-normalised embedding (a list of floats) per non-blank
        line, in input order; ``[]`` when no non-blank line is present.

    Raises:
        HTTPException: 401 when the ``x-api-key`` header is missing or does not
            match the ``API_KEY`` environment variable.
        KeyError: If the ``API_KEY`` environment variable is not set.
        RuntimeError: Re-raised after logging when embedding fails (for
            example on CUDA out-of-memory).
    """
    provided_key = request.headers.get("x-api-key")
    expected_key = os.environ["API_KEY"]
    # The header is untrusted input: compare in constant time so response
    # timing does not reveal how much of the key matched. Bytes are compared
    # because ``compare_digest`` rejects non-ASCII ``str`` arguments.
    if provided_key is None or not hmac.compare_digest(
        provided_key.encode("utf-8"), expected_key.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Unauthorized")

    # Drop blank lines; the response therefore has one vector per non-blank line.
    texts = [t for t in text.split("\n") if t.strip()]
    if not texts:
        return []

    print(f"Start embedding {len(texts)} texts")
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            # All lines are tokenised and run through the model as a single batch.
            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
            outputs = model(**batch_dict)
            embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            embeddings = F.normalize(embeddings, p=2, dim=1)
            # Convert to plain nested lists so the result is JSON-serialisable.
            embeddings = embeddings.cpu().numpy().tolist()
        print("Finished embedding texts.")
        return embeddings
    except RuntimeError as e:
        # Log for the container output, then let the error propagate to the caller.
        print(f"Error during embedding: {str(e)}")
        if "CUDA out of memory" in str(e):
            print("CUDA OOM. Try reducing batch size or using a smaller model.")
        raise