File size: 4,670 Bytes
6f06d5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""Shared building blocks for the Modal embedding endpoints.

``modal_embeddings_en.py`` and ``modal_embeddings_multilang.py`` each define a tiny
``EmbeddingModel`` class at module scope (Modal requires globally-defined classes
with stacked ``@app.cls`` / ``@modal.concurrent`` decorators) that delegates to the
helpers here. All the heavy lifting — the container image, model loading, pooling,
and the embedding request handler — lives in this module so it is written once.

The endpoint contract (consumed by ``document_qa.custom_embeddings.ModalEmbeddings``):

- **Method**: ``POST``
- **Auth**: ``x-api-key`` header, compared against the ``API_KEY`` secret.
- **Body**: form field ``text`` containing newline-separated strings.
- **Response**: JSON list of L2-normalised embedding vectors, one per non-blank
  input line (blank lines are skipped; an input with no non-blank lines yields ``[]``).
"""

import hmac
import os

import modal
import torch
import torch.nn.functional as F
from fastapi import HTTPException, Request
from torch import Tensor

# Multiplier for writing durations in seconds, e.g. ``3 * MINUTES``.
MINUTES = 60  # seconds
# GPU count interpolated into the ``gpu`` spec built by ``cls_kwargs``.
N_GPU = 1

# Shared container image for every embedding model.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        "fastapi[standard]",
        # Extra package index; the URL path names a CUDA 12.4 / torch 2.5 wheel set.
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
    # Modal 1.0 no longer auto-mounts imported local modules; the wrapper scripts
    # import this module by name, so it must be added explicitly. Kept last so it
    # doesn't invalidate the (expensive) pip layer above on every code edit.
    .add_local_python_source("_embeddings_app")
)

# Named Modal volumes, created on first use if they do not exist yet.
# ``cls_kwargs`` mounts them at the Hugging Face and vLLM cache directories.
# NOTE(review): presumably so downloaded weights persist across containers — confirm.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


def cls_kwargs() -> dict:
    """Build the keyword arguments every embedding endpoint passes to ``@app.cls``.

    Returns:
        dict: ``image``, ``gpu``, ``scaledown_window``, ``volumes`` and ``secrets``
        entries assembled from this module's shared image, volumes and constants.
    """
    # Container path -> volume mounted at that path.
    cache_mounts = {
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    }
    api_key_secret = modal.Secret.from_name("document-qa-embedding-key")

    return {
        "image": image,
        "gpu": f"L40S:{N_GPU}",
        # How long to stay up with no requests before scaling down.
        "scaledown_window": 3 * MINUTES,
        "volumes": cache_mounts,
        "secrets": [api_key_secret],
    }


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings, ignoring padding positions.

    Args:
        last_hidden_states: Per-token embeddings, reduced over dimension 1
            (i.e. laid out as ``(batch, tokens, hidden)``).
        attention_mask: ``(batch, tokens)`` mask that is non-zero for real
            tokens and zero for padding.

    Returns:
        One pooled vector per batch row. A row whose mask is entirely zero
        pools to the zero vector instead of NaN.
    """
    # Zero the padded positions so they contribute nothing to the sum.
    keep = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Clamp the token count to at least 1: an all-padding row would otherwise
    # compute 0 / 0 and return NaN, which survives any later normalisation.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)
    return summed / token_counts[..., None]


def load_embedding_model(model_name: str, model_revision: str):
    """Fetch the tokenizer and model for ``model_name`` and place the model on a device.

    CUDA is chosen when ``torch.cuda.is_available()`` reports it, otherwise the CPU.

    Args:
        model_name: Identifier handed to ``from_pretrained``.
        model_revision: Revision handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        tuple: ``(tokenizer, model, device)``; the model has been moved to
        ``device`` and switched to eval mode.
    """
    # Deferred import: transformers is only installed inside the Modal image.
    from transformers import AutoModel, AutoTokenizer

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Loading {model_name} on {device}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=model_revision)
    model = AutoModel.from_pretrained(model_name, revision=model_revision)
    model = model.to(device)
    model.eval()

    print("Model loaded successfully.")
    return tokenizer, model, device


def run_embed(tokenizer, model, device, request: Request, text: str):
    """Authenticate, embed newline-separated ``text``, and return normalised vectors.

    Args:
        tokenizer: Tokenizer callable, as returned by ``load_embedding_model``.
        model: Model, as returned by ``load_embedding_model``.
        device: Device the tokenised batch is moved to before the forward pass.
        request: Incoming request; only its ``x-api-key`` header is read.
        text: Newline-separated inputs. Blank or whitespace-only lines are skipped.

    Returns:
        A list holding one L2-normalised embedding (a list of floats) per
        non-blank line, or ``[]`` when there is nothing to embed.

    Raises:
        HTTPException: 401 when the ``x-api-key`` header is missing or wrong.
        KeyError: If the ``API_KEY`` environment variable is not set.
        RuntimeError: Re-raised after logging when tokenisation or the forward
            pass fails.
    """
    expected_key = os.environ["API_KEY"]
    api_key = request.headers.get("x-api-key")
    # Constant-time comparison so response timing does not leak how much of a
    # guessed key matched. Compare bytes: compare_digest rejects non-ASCII str.
    if api_key is None or not hmac.compare_digest(
        api_key.encode("utf-8"), expected_key.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Unauthorized")

    texts = [t for t in text.split("\n") if t.strip()]
    if not texts:
        return []

    print(f"Start embedding {len(texts)} texts")
    try:
        with torch.no_grad():
            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

            outputs = model(**batch_dict)
            embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            embeddings = F.normalize(embeddings, p=2, dim=1)
            # Tensor.tolist() already yields nested Python floats; no NumPy hop needed.
            embeddings = embeddings.cpu().tolist()

        print("Finished embedding texts.")
        return embeddings

    except RuntimeError as e:
        print(f"Error during embedding: {e}")
        if "CUDA out of memory" in str(e):
            print("CUDA OOM. Try reducing batch size or using a smaller model.")
        raise