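"""
Export static token-embedding models to flat .npy / .npy.zst artifacts.

For each configured model, the script extracts the token-embedding table and
tokenizer, truncates the table to each Matryoshka dimension, saves fp32, fp16,
and fp8 variants, and writes a README.md with basic stats and quantization-loss
metrics.
"""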
from dataclasses import dataclass
import shutil
from textwrap import dedent, indent
import numpy as np
from zstandard import ZstdCompressor
from pathlib import Path
import io
from sentence_transformers import SentenceTransformer
from torch.nn import EmbeddingBag
import torch
from model2vec import StaticModel
from tokenizers import Encoding, Tokenizer

models_path = Path("models")


@dataclass
class ModelCard:
    owner: str
    repo: str
    # The dimensions that were applied with Matryoshka loss.
    matroyshka_dims: list[int]
    description: str
    license: str

    def name(self):
        return f"{self.owner}/{self.repo}"

    def path(self):
        return models_path / self.owner / self.repo

    def get_description(self):
        return dedent(self.description).strip()


def zst_compress_file(input_path: Path):
    """Compress the file to a sibling `.zst` file, leaving the original in place."""
    cctx = ZstdCompressor()
    output = input_path.parent / f"{input_path.name}.zst"
    print(f"Compressing {output}")
    with open(input_path, "rb") as fin, open(output, "wb") as fout:
        cctx.copy_stream(fin, fout)


def save_data(path: Path, tensor: torch.Tensor):
    """Writes out the static embeddings to a .npy and .npy.zst file"""
    buffer = io.BytesIO()

    if tensor.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
        # Store as the raw bytes.
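        # NumPy has no float8 dtype, so the fp8 bits are reinterpreted as uint8;
        # consumers must view the bytes back as the matching fp8 format.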
        np.save(buffer, tensor.detach().view(torch.uint8).numpy())
    else:
        np.save(buffer, tensor.detach().numpy())

    print(f"Saving {path}")
    with open(path, "wb") as outfile:
        outfile.write(buffer.getvalue())

    zst_compress_file(path)


def quantization_loss_mse(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    the mean squared error, which punishes big errors more than small ones.
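
    Illustrative usage:

        loss = quantization_loss_mse(torch.randn(1000, 64), torch.float16)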
    """

    # Original → quantize → dequantize
    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

    # Mean squared error
    return torch.mean((tensor - roundtrip) ** 2).item()


def quantization_loss_mae(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    the mean absolute error, which is less sensitive to outliers than MSE.
    """

    # Original → quantize → dequantize
    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

    # Mean absolute error
    return torch.mean(torch.abs(tensor - roundtrip)).item()


def quantization_loss_cosine(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    cosine similarity. This measures whether the embedding directions are preserved
    after quantization, independent of their magnitudes.
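
    Returns the mean cosine similarity in [-1, 1], where 1.0 means the vector
    directions are perfectly preserved.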
    """

    # Original → quantize → dequantize
    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

    # Flatten both to 2D (num_vectors, dimensions) in case tensor is 1D or higher-D
    if tensor.ndim == 1:
        orig = tensor.unsqueeze(0)
        recon = roundtrip.unsqueeze(0)
    else:
        orig = tensor.view(tensor.shape[0], -1)
        recon = roundtrip.view(roundtrip.shape[0], -1)

    # Cosine similarity per vector, then average
    cos = torch.nn.functional.cosine_similarity(orig, recon, dim=1)
    return cos.mean().item()


def export_embeddings(model_card: ModelCard, embeddings: torch.Tensor) -> None:
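    """Truncate the embedding table to each Matryoshka dimension and save fp32, fp16, and fp8 variants."""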
    vocab_size, dimensions = embeddings.shape

    # This logic can always be adjusted for models with different shapes.
    assert (
        embeddings.dtype == torch.float32
    ), f"Expected float32 embeddings, got {embeddings.dtype}."

    for dim in model_card.matroyshka_dims:
        assert (
            dim <= dimensions
        ), f"The Matryoshka dimension {dim} is larger than the model's dimension of {dimensions}"

        truncated = embeddings[:, :dim]
        assert truncated.shape == torch.Size([vocab_size, dim])

        save_data(model_card.path() / f"fp32.d{dim}.npy", truncated)
        save_data(
            model_card.path() / f"fp16.d{dim}.npy",
            truncated.to(dtype=torch.float16),
        )
        save_data(
            model_card.path() / f"fp8_e5m2.d{dim}.npy",
            truncated.to(dtype=torch.float8_e5m2),
        )
        save_data(
            model_card.path() / f"fp8_e4m3.d{dim}.npy",
            truncated.to(dtype=torch.float8_e4m3fn),
        )


def normalized_mean_pooling(x: torch.Tensor) -> torch.Tensor:
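    """Mean-pool the token vectors along dim 0 and L2-normalize the result."""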
    pooled = x.mean(dim=0)
    normalized = torch.nn.functional.normalize(pooled, dim=0)
    return normalized


def export_readme(
    model_card: ModelCard,
    embeddings: torch.Tensor,
    tokenizer: Tokenizer,
):
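    """Write a README.md with model stats, quantization-loss measurements, and tokenizer examples."""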
    vocab_size, dimensions = embeddings.shape
    norms = torch.norm(embeddings, dim=1)  # shape: [vocab_size]

    phrases = [
        "The committee approved the proposal after hours of heated discussion and several last-minute amendments.",
        "When training large neural networks, careful tuning of hyperparameters can significantly affect performance and stability.",
        "Despite the heavy rain, the concert continued as planned and the crowd stayed enthusiastic until the final encore.",
        "In ancient mythology, heroes often embarked on perilous journeys to discover hidden truths about themselves and their world.",
        "The new smartphone model features an improved camera system, faster processing, and extended battery life compared to its predecessor.",
        "He tried to explain the concept using simple analogies, but the underlying mathematics remained difficult to grasp for most listeners.",
        "After weeks of negotiations, the two countries signed a historic trade agreement aimed at reducing tariffs and boosting cooperation.",
        "She paused for a moment before answering, choosing her words carefully to avoid misunderstanding in such a delicate situation.",
        "The detective pieced together the timeline of events, realizing that the key witness had provided a contradictory statement.",
        "Remote work has changed the way teams collaborate, with online tools replacing traditional office routines and in-person meetings.",
    ]

    cosine_similarity = {
        torch.float16: [],
        torch.float8_e4m3fn: [],
        torch.float8_e5m2: [],
    }

    for phrase in phrases:
        encoding: Encoding = tokenizer.encode(phrase)
        embedded_phrase = embeddings[torch.tensor(encoding.ids, dtype=torch.long)]

        for dtype in cosine_similarity.keys():
            pooling_unquantized = normalized_mean_pooling(embedded_phrase)
            pooling_roundtrip = normalized_mean_pooling(
                embedded_phrase.to(dtype).to(torch.float32)
            )
            cosine = torch.dot(pooling_unquantized, pooling_roundtrip).item()
            cosine_similarity[dtype].append(cosine)

    avg_cosine_similarity = {
        dtype: sum(values) / len(values) for dtype, values in cosine_similarity.items()
    }

    tokenizer_examples = []
    for text in [
        "This is an example of encoding",
        "The quick brown fox jumps over the lazy dog.",
        "Curaçao, naïve fiancé, jalapeño, déjà vu.",
        "Привет, как дела?",
        "Бързата кафява лисица прескача мързеливото куче.",
        "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
        "اللغة العربية جميلة وغنية بالتاريخ.",
        "مرحبا بالعالم!",
        "Simplified: 快速的棕色狐狸跳过懒狗。",
        "Traditional: 快速的棕色狐狸跳過懶狗。",
        "素早い茶色の狐が怠け者の犬を飛び越える。",
        "コンピュータープログラミング",
        "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
        "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
        "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
        "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
        "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
        "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
        "Hello 世界 مرحبا 🌍",
        "123, αβγ, абв, العربية, 中文, हिन्दी.",
    ]:
        encoding = tokenizer.encode(text)
        tokens = [f"`{token}`" for token in encoding.tokens]

        tokenizer_examples.append(f"**Input:** {text}<br/>")
        tokenizer_examples.append(f"**Tokens**: {' '.join(tokens)}")
        tokenizer_examples.append("")

    tokenizer_output = "\n".join(tokenizer_examples)

    with (model_card.path() / "README.md").open("wt") as file:
        prefix = "                "
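        # 16 spaces: matches the indentation of the f-string body below, so text
        # re-indented with `indent()` lines up before `dedent()` strips the margin.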

        file.write(
            dedent(
                f"""
                # [{model_card.name()}](https://huggingface.co/{model_card.name()})
                
                License: [{model_card.license}](https://choosealicense.com/licenses/{model_card.license}/)
                
                {indent(model_card.get_description(), prefix).strip()}
                
                ## Model Stats
                
                Stats that describe the embedding tensor's shape and value distribution.

                | item          | metric                  | value |
                | --------------| ----------------------- | ----- |
                | vocab         | size                    | {vocab_size:,.0f} |
                | embedding     | dimensions              | {dimensions:,.0f} |
                | vector length | mean                    | {norms.mean().item():.2f} |
                | vector length | median                  | {norms.median().item():.2f} |
                | vector length | stddev                  | {norms.std().item():.2f} |
                | values        | mean                    | {embeddings.mean().item():.2f} |
                | values        | median                  | {embeddings.median().item():.2f} |
                | values        | stddev                  | {embeddings.std().item():.2f} |
                
                ## Mean Pooled Quantization Loss
                
                This test roundtrips the vectors through quantization, but performs the
                mean pooling arithmetic in float32 space. The quantized and unquantized
                mean pooled vectors are compared to each other to determine their cosine
                similarity, to show how much the meaning of the vector has changed due
                to quantization.
                
                | Precision     | Cosine Similarity |
                | ------------- | ----------------- |
                | fp16          | {avg_cosine_similarity[torch.float16]:.5f} |
                | fp8 e4m3      | {avg_cosine_similarity[torch.float8_e4m3fn]:.5f} |
                | fp8 e5m2      | {avg_cosine_similarity[torch.float8_e5m2]:.5f} |
                
                ## Quantization Loss Per Vector
                
                While the embedding vectors will ultimately be mean pooled together, it's
                still useful to look at the per-vector loss in the embedding table to see
                which quantization strategies retain the most meaning.
                
                - **Cosine Similarity** — measures how well the *direction* of embedding vectors
                is preserved after quantization, independent of scale. This is especially
                relevant when embeddings are used for similarity search or retrieval.
                - **MSE (Mean Squared Error)** — emphasizes large errors by squaring the
                differences. Useful for detecting whether any values are badly distorted.
                - **MAE (Mean Absolute Error)** — the average absolute difference between
                original and quantized values. Easier to interpret, less sensitive to outliers.

                | Precision     | Metric | Value |
                | ------------- | ------ | ----- |
                | fp16          | cosine similarity | {quantization_loss_cosine(embeddings, torch.float16):.5f} |
                | fp8 e4m3      | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2      | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e5m2):.5f} |
                | fp16          | MSE    | {quantization_loss_mse(embeddings, torch.float16):.5f} |
                | fp8 e4m3      | MSE    | {quantization_loss_mse(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2      | MSE    | {quantization_loss_mse(embeddings, torch.float8_e5m2):.5f} |
                | fp16          | MAE    | {quantization_loss_mae(embeddings, torch.float16):.5f} |
                | fp8 e4m3      | MAE    | {quantization_loss_mae(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2      | MAE    | {quantization_loss_mae(embeddings, torch.float8_e5m2):.5f} |
                
                ## Tokenizer Examples
                
                {indent(tokenizer_output, prefix).strip()}
                """
            ).strip()
        )


def export_tokenizer(model_card: ModelCard, tokenizer: Tokenizer) -> None:
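    """Save tokenizer.json alongside the embeddings and compress it with zstd."""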
    tokenizer_path = model_card.path() / "tokenizer.json"
    print(f"Exporting tokenizer: {tokenizer_path}")
    tokenizer.save(str(tokenizer_path))
    zst_compress_file(tokenizer_path)


def export_sentence_transformers(model_card: ModelCard) -> None:
    """Extract the embeddings and tokenizer from SentenceTransformers"""

    print("Processing", model_card.name())

    model = SentenceTransformer(model_card.name(), device="cpu")
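    # For static models, the first module wraps an EmbeddingBag whose weight is
    # the token-embedding table (assumed layout; see the type annotation below).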
    embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
    model_card.path().mkdir(exist_ok=True, parents=True)
    embeddings = torch.Tensor(embedding_bag.weight)

    export_embeddings(model_card, embeddings)
    export_tokenizer(model_card, model.tokenizer)
    export_readme(model_card, embeddings, model.tokenizer)


def export_model2vec(model_card: ModelCard) -> None:
    """Extract the embeddings and tokenizer from model2vec"""

    print("Processing", model_card.name())

    model = StaticModel.from_pretrained(model_card.name())
    model_card.path().mkdir(exist_ok=True, parents=True)
    embeddings = torch.from_numpy(model.embedding)
    export_embeddings(model_card, embeddings)
    export_tokenizer(model_card, model.tokenizer)
    export_readme(model_card, embeddings, model.tokenizer)


def main() -> None:
    # Static embedders that use sentence_transformers models.
    sentence_transformers_models = [
        ModelCard(
            owner="sentence-transformers",
            repo="static-similarity-mrl-multilingual-v1",
            description="""
            Multilingual similarity embeddings that were trained with Matryoshka loss,
            which allows the embedding vectors to be truncated more effectively. The
            model was trained on multilingual datasets spanning a variety of domains.

            It's a general-purpose model that can be used for semantic textual similarity,
            paraphrase mining, text classification, clustering, and more.
            """,
            matroyshka_dims=[32, 64, 128, 256, 512, 1024],
            license="apache-2.0",
        ),
        ModelCard(
            owner="sentence-transformers",
            repo="static-retrieval-mrl-en-v1",
            description="""
            English-only uncased embeddings that were trained with Matryoshka loss,
            which allows the embedding vectors to be truncated more effectively. The
            model was trained on monolingual datasets spanning a variety of domains
            and was designed specifically for similarity retrieval.
            """,
            matroyshka_dims=[32, 64, 128, 256, 512, 1024],
            license="apache-2.0",
        ),
    ]
    # Static embedders that use model2vec.
    model2vec_models = [
        ModelCard(
            owner="minishlab",
            repo="potion-multilingual-128M",
            # These are assumed, as there is no Python reference implementation:
            matroyshka_dims=[32, 64, 128, 256],
            description="""
            A multilingual embedder. The training details are scant since no source
            code is available, but the architecture is likely close to the
            potion-retrieval-32M model, trained on Common Crawl data.

            The 128M refers to the number of parameters in the embeddings:

            256 dimensions * 500,353 vocab = 128,090,368 ≈ 128M parameters.
            """,
            license="mit",
        ),
        ModelCard(
            owner="minishlab",
            repo="potion-retrieval-32M",
            matroyshka_dims=[32, 64, 128, 256, 512],
            description="""
            The token embeddings from a monolingual English 32M-parameter model that was
            distilled from embeddings initialized from the multi-domain
            [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5).

            The 32M refers to the number of parameters in the embeddings:

            512 dimensions * 63,091 vocab = 32,302,592 ≈ 32M parameters.
            """,
            license="mit",
        ),
    ]

    if models_path.exists():
        print(f"Removing the old models folder: {models_path}")
        shutil.rmtree(models_path)
    models_path.mkdir(parents=True, exist_ok=True)

    for model_card in sentence_transformers_models:
        export_sentence_transformers(model_card)

    for model_card in model2vec_models:
        export_model2vec(model_card)


if __name__ == "__main__":
    main()