| from __future__ import annotations |
|
|
| import numpy as np |
| import pytest |
| import torch |
| from datasets import load_dataset |
|
|
| from sentence_transformers import SentenceTransformer |
|
|
|
|
@pytest.mark.custom
def test_cde_small_v2():
    """Smoke-test jxm/cde-small-v2 (a contextual/transductive model).

    Encodes a small "dataset embedding" context from natural-questions answers,
    then checks that query/document similarities match known-good values.
    Requires network access to download the model and dataset.
    """
    model = SentenceTransformer("jxm/cde-small-v2", trust_remote_code=True)
    # The first module's remote config declares how many context documents
    # the transductive encoder expects.
    context_docs_size = model[0].config.transductive_corpus_size

    dataset = load_dataset("sentence-transformers/natural-questions", split="train")
    # NOTE(review): the original called `dataset.shuffle(seed=42)` without
    # assigning the result. `Dataset.shuffle` returns a NEW dataset, so the
    # call was a no-op. The expected similarities below were produced from the
    # unshuffled order, so the dead call is removed rather than "fixed" —
    # assigning it would select different rows and invalidate the expectations.
    queries = dataset["query"][:2]
    docs = dataset["answer"][:5]
    context_docs = dataset["answer"][-context_docs_size:]

    # First-stage embeddings of the context corpus, consumed by both of the
    # second-stage encode calls below.
    dataset_embeddings = model.encode(
        context_docs,
        prompt_name="document",
        convert_to_tensor=True,
    )

    doc_embeddings = model.encode(
        docs,
        prompt_name="document",
        dataset_embeddings=dataset_embeddings,
        convert_to_tensor=True,
    )
    query_embeddings = model.encode(
        queries,
        prompt_name="query",
        dataset_embeddings=dataset_embeddings,
        convert_to_tensor=True,
    )

    similarities = model.similarity(query_embeddings, doc_embeddings)
    assert similarities.shape == (2, 5), f"Expected shape (2, 5), but got {similarities.shape}"
    # Known-good scores for this model/dataset combination (2 queries x 5 docs).
    expected = torch.tensor(
        [[0.8778, 0.7851, 0.7810, 0.7781, 0.7966], [0.7916, 0.8648, 0.7845, 0.7865, 0.8136]],
        device=similarities.device,
    )
    assert torch.isclose(similarities, expected, atol=1e-3).all()
|
|
|
|
@pytest.mark.custom
def test_jina_embeddings_v3():
    """Smoke-test jinaai/jina-embeddings-v3: shape and a known-good value.

    Requires network access to download the model.
    """
    model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
    # This model routes through task-specific LoRA adapters; the task name is
    # also used as the prompt name.
    task = "retrieval.query"
    embeddings = model.encode(
        ["What is the weather like in Berlin today?"],
        task=task,
        prompt_name=task,
    )
    assert embeddings.shape == (1, 1024), f"Expected shape (1, 1024), but got {embeddings.shape}"
    # Fix: the failure message previously said "0.08203125" (wrong sign)
    # while asserting against -0.08203125.
    assert embeddings[0][0] == pytest.approx(
        -0.08203125, abs=0.01
    ), f"Expected value close to -0.08203125, but got {embeddings[0][0]}"
|
|
|
|
@pytest.mark.custom
def test_jina_clip():
    """Smoke-test jinaai/jina-clip-v2 cross-modal retrieval.

    Embeds multilingual captions and two beach images, then checks that an
    English query's similarities against all of them match known-good values.
    Requires network access for the model and the image URLs.
    """
    truncate_dim = 512
    model = SentenceTransformer(
        "jinaai/jina-clip-v2",
        trust_remote_code=True,
        truncate_dim=truncate_dim,
        config_kwargs={"use_vision_xformers": False},
    )

    # The same caption ("a beautiful sunset over the beach") in nine languages.
    sentences = [
        "غروب جميل على الشاطئ",
        "海滩上美丽的日落",
        "Un beau coucher de soleil sur la plage",
        "Ein wunderschöner Sonnenuntergang am Strand",
        "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία",
        "समुद्र तट पर एक खूबसूरत सूर्यास्त",
        "Un bellissimo tramonto sulla spiaggia",
        "浜辺に沈む美しい夕日",
        "해변 위로 아름다운 일몰",
    ]
    image_urls = ["https://i.ibb.co/nQNGqL0/beach1.jpg", "https://i.ibb.co/r5w8hG8/beach2.jpg"]
    corpus_size = len(sentences) + len(image_urls)

    # Text and image embeddings share one space; stack them into one corpus
    # (texts first, then images) so a single similarity call covers both.
    corpus_embeddings = np.concatenate(
        (
            model.encode(sentences, normalize_embeddings=True),
            model.encode(image_urls, normalize_embeddings=True),
        ),
        axis=0,
    )

    query = "beautiful sunset over the beach"
    query_embeddings = model.encode(query, prompt_name="retrieval.query", normalize_embeddings=True)

    similarities = model.similarity(query_embeddings, corpus_embeddings)
    expected_shape = (1, corpus_size)
    assert (
        similarities.shape == expected_shape
    ), f"Expected shape (1, {corpus_size}), but got {similarities.shape}"
    # Known-good scores: nine caption similarities followed by two image
    # similarities. The 1-D tensor broadcasts against the (1, 11) result.
    expected = torch.tensor([0.5342, 0.6753, 0.6130, 0.6234, 0.5823, 0.6351, 0.5950, 0.5691, 0.6070, 0.3101, 0.3291])
    assert torch.isclose(similarities, expected, atol=1e-3).all()
|
|