Spaces:

emilbm
/

text2vector

Sleeping

File size: 1,421 Bytes

5a5e912

from pydantic import BaseModel, Field, field_validator, StringConstraints
from typing import Annotated

# Prefixes a text is allowed to begin with. The request validator rejects any
# text that starts with none of these, and prints this list in its error.
PREFIX_ACCEPTED = [
    "query: ",
    "passage: ",
]

# A plain string capped at 2000 characters through pydantic's StringConstraints.
ShortText = Annotated[
    str,
    StringConstraints(max_length=2000),
]


class EmbedRequest(BaseModel):
    """
    Request model for texts to be embedded.
    Each text must start with an accepted prefix and be ≤ 2000 characters.
    The texts need to start with either "query: " or "passage: ".
    """

    # NOTE: the class docstring above and the description below are kept
    # verbatim on purpose: pydantic surfaces both in the generated JSON schema.

    # Required list of texts. The per-item length cap comes from ShortText;
    # the prefix rule is enforced by check_prefixes below.
    texts: list[ShortText] = Field(
        ...,
        description="List of texts to be embedded (≤ 2000 characters each) and must start with 'query: ' or 'passage: '.",
        json_schema_extra={
            "example": [
                "query: what is the capital of France?",
                "passage: Paris is the capital of France.",
            ]
        },
    )

    @field_validator("texts")
    @classmethod
    def check_prefixes(cls, texts: list[str]) -> list[str]:
        """Ensure every text begins with one of the accepted prefixes.

        Args:
            texts: The already type-validated list of input strings.

        Returns:
            The same list, unchanged, when every item passes.

        Raises:
            ValueError: If at least one text starts with none of the
                prefixes in PREFIX_ACCEPTED.
        """
        # str.startswith accepts a tuple and matches if any member matches.
        allowed = tuple(PREFIX_ACCEPTED)
        if not all(text.startswith(allowed) for text in texts):
            raise ValueError(f"Each text must start with one of {PREFIX_ACCEPTED}")
        return texts


class EmbedResponse(BaseModel):
    """Response model containing embeddings."""

    # Required field: a list of vectors, each vector being a list of floats.
    # NOTE(review): the description promises vectors of length 1024, but the
    # annotation places no constraint on the inner list length — confirm that
    # whatever produces the embeddings guarantees this dimension.
    embeddings: list[list[float]] = Field(
        ...,
        description="List of embedding vectors corresponding to the input texts. Each embedding is a list of floats with length 1024.",
    )