Spaces:

sciencialab
/

document-qa-dev

Build error

App Files Files Community

document-qa-dev / document_qa /deployment /_embeddings_app.py

lfoppiano

Upload folder using huggingface_hub

6f06d5d verified 21 days ago

Raw

History Blame Contribute Delete

4.67 kB

	"""Shared building blocks for the Modal embedding endpoints.

	``modal_embeddings_en.py`` and ``modal_embeddings_multilang.py`` each define a tiny
	``EmbeddingModel`` class at module scope (Modal requires globally-defined classes
	with stacked ``@app.cls`` / ``@modal.concurrent`` decorators) that delegates to the
	helpers here. All the heavy lifting — the container image, model loading, pooling,
	and the embedding request handler — lives in this module so it is written once.

	The endpoint contract (consumed by ``document_qa.custom_embeddings.ModalEmbeddings``):

	- Method: ``POST``
	- Auth: ``x-api-key`` header, compared against the ``API_KEY`` secret.
	- Body: form field ``text`` containing newline-separated strings.
	- Response: JSON list of L2-normalised embedding vectors, one per non-blank input line
	  (whitespace-only lines are skipped, so the list may be shorter than the input).
	"""

	import os

	import modal
	import torch
	import torch.nn.functional as F
	from fastapi import HTTPException, Request
	from torch import Tensor

	MINUTES = 60  # seconds in a minute, so durations read as "3 * MINUTES"
	N_GPU = 1  # number of GPUs requested per container

	# Python packages baked into the shared image.
	# NOTE(review): torch is imported at the top of this module but is not listed
	# here — presumably it arrives as a transitive dependency; confirm.
	_PIP_PACKAGES = (
	    "transformers",
	    "huggingface_hub[hf_transfer]==0.26.2",
	    "flashinfer-python==0.2.0.post2",  # pinning, very unstable
	    "fastapi[standard]",
	)
	# Extra wheel index passed to pip alongside the package list above.
	_EXTRA_INDEX_URL = "https://flashinfer.ai/whl/cu124/torch2.5"

	# Shared container image for every embedding model.
	image = (
	    modal.Image.debian_slim(python_version="3.11")
	    .pip_install(*_PIP_PACKAGES, extra_index_url=_EXTRA_INDEX_URL)
	    # Enables hf_transfer for faster model transfers.
	    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
	    # Modal 1.0 no longer auto-mounts imported local modules; the wrapper scripts
	    # import this module by name, so it must be added explicitly. This step stays
	    # last so that code edits do not invalidate the (expensive) pip layer above.
	    .add_local_python_source("_embeddings_app")
	)

	# Named volumes, created on first use, that are mounted as cache directories.
	hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
	vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


	def cls_kwargs() -> dict:
	    """Build the keyword arguments every embedding endpoint passes to ``@app.cls``.

	    Returns:
	        dict: A fresh dict with the ``image``, ``gpu``, ``scaledown_window``,
	        ``volumes`` and ``secrets`` settings shared by all endpoints.
	    """
	    # Cache directories inside the container, each backed by a named volume.
	    cache_mounts = {
	        "/root/.cache/huggingface": hf_cache_vol,
	        "/root/.cache/vllm": vllm_cache_vol,
	    }
	    endpoint_secret = modal.Secret.from_name("document-qa-embedding-key")

	    return {
	        "image": image,
	        "gpu": f"L40S:{N_GPU}",
	        # How long a container stays up with no requests.
	        "scaledown_window": 3 * MINUTES,
	        "volumes": cache_mounts,
	        "secrets": [endpoint_secret],
	    }


	def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
	    """Mean-pool token embeddings, ignoring padding positions.

	    Args:
	        last_hidden_states: Token embeddings; pooling reduces over dim 1
	            (presumably shaped ``(batch, seq, hidden)`` — confirm at caller).
	        attention_mask: Mask aligned with the leading dims of
	            ``last_hidden_states``; non-zero marks a real token, zero marks padding.

	    Returns:
	        Tensor: ``last_hidden_states`` with dim 1 averaged over the unmasked
	        positions. A row whose mask is entirely zero yields a zero vector
	        instead of NaN.
	    """
	    keep = attention_mask[..., None].bool()
	    # Zero out padded positions so they contribute nothing to the sum.
	    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
	    # Clamp the token count to at least 1: a fully masked row would otherwise
	    # divide 0 by 0 and produce NaN, which cannot be JSON-encoded downstream.
	    token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
	    return summed / token_counts


	def load_embedding_model(model_name: str, model_revision: str):
	    """Load the tokenizer and model for ``model_name`` and place the model on a device.

	    CUDA is used when ``torch.cuda.is_available()`` reports it, otherwise CPU.

	    Args:
	        model_name: Model identifier passed to ``from_pretrained``.
	        model_revision: Revision passed to ``from_pretrained`` for both the
	            tokenizer and the model.

	    Returns:
	        tuple: ``(tokenizer, model, device)`` with ``model`` already in eval mode.
	    """
	    # transformers is only available inside the Modal image, so import lazily.
	    from transformers import AutoModel, AutoTokenizer

	    if torch.cuda.is_available():
	        device = torch.device("cuda")
	    else:
	        device = torch.device("cpu")

	    print(f"Loading {model_name} on {device}...")
	    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=model_revision)
	    model = AutoModel.from_pretrained(model_name, revision=model_revision)
	    model = model.to(device)
	    model.eval()
	    print("Model loaded successfully.")

	    return tokenizer, model, device


	def run_embed(tokenizer, model, device, request: Request, text: str):
	    """Authenticate, embed newline-separated ``text``, and return normalised vectors.

	    Args:
	        tokenizer: Called as ``tokenizer(texts, padding=True, truncation=True,
	            return_tensors="pt")``; must return a mapping of tensors that
	            includes ``"attention_mask"``.
	        model: Called with the tokenised batch as keyword arguments; its output
	            must expose ``last_hidden_state``.
	        device: Torch device the tokenised batch is moved to before the forward pass.
	        request: Incoming request; only its ``x-api-key`` header is read.
	        text: Input strings separated by newline characters. Whitespace-only
	            lines are dropped.

	    Returns:
	        list: One L2-normalised embedding (a list of floats) per non-blank line,
	        in input order. An empty list when there is no non-blank line.

	    Raises:
	        HTTPException: 401 when the ``x-api-key`` header is missing or wrong.
	        KeyError: When the ``API_KEY`` environment variable is not set.
	        RuntimeError: Re-raised unchanged, after logging, if embedding fails.
	    """
	    # Function-scope import keeps this block self-contained.
	    import hmac

	    expected_key = os.environ["API_KEY"]
	    api_key = request.headers.get("x-api-key")
	    # Constant-time comparison so response timing does not reveal how much of
	    # the key matched. Bytes are compared because compare_digest rejects
	    # non-ASCII str, and header values are attacker-controlled.
	    if api_key is None or not hmac.compare_digest(
	        api_key.encode("utf-8"), expected_key.encode("utf-8")
	    ):
	        raise HTTPException(status_code=401, detail="Unauthorized")

	    texts = [line for line in text.split("\n") if line.strip()]
	    if not texts:
	        return []

	    print(f"Start embedding {len(texts)} texts")
	    try:
	        # Inference only: skip autograd bookkeeping.
	        with torch.no_grad():
	            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
	            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

	            outputs = model(**batch_dict)
	            embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
	            embeddings = F.normalize(embeddings, p=2, dim=1)
	            # Tensor.tolist() yields nested Python floats directly, so no
	            # NumPy round-trip is needed.
	            embeddings = embeddings.cpu().tolist()
	    except RuntimeError as e:
	        print(f"Error during embedding: {e}")
	        if "CUDA out of memory" in str(e):
	            print("CUDA OOM. Try reducing batch size or using a smaller model.")
	        raise
	    else:
	        print("Finished embedding texts.")
	        return embeddings