metadata
language:
- en
license: apache-2.0
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- dense
- generated_from_trainer
- dataset_size:900
- loss:MatryoshkaLoss
- loss:MultipleNegativesRankingLoss
base_model: jinaai/jina-code-embeddings-0.5b
widget:
- source_sentence: >-
Best practices for async
test_similarity_search_with_relevance_score_with_threshold_and_filter
sentences:
- |-
def test_tool_retry_custom_failure_formatter() -> None:
"""Test ToolRetryMiddlewarewith custom failure message formatter."""
def custom_formatter(exc: Exception) -> str:
return f"Custom error: {type(exc).__name__}"
model = FakeToolCallingModel(
tool_calls=[
[ToolCall(name="failing_tool", args={"value": "test"}, id="1")],
[],
]
)
retry = ToolRetryMiddleware(
max_retries=1,
initial_delay=0.01,
jitter=False,
on_failure=custom_formatter,
)
agent = create_agent(
model=model,
tools=[failing_tool],
middleware=[retry],
checkpointer=InMemorySaver(),
)
result = agent.invoke(
{"messages": [HumanMessage("Use failing tool")]},
{"configurable": {"thread_id": "test"}},
)
tool_messages = [m for m in result["messages"] if isinstance(m, ToolMessage)]
assert len(tool_messages) == 1
assert "Custom error: ValueError" in tool_messages[0].content
- |-
def test_parse_scores(answer: str) -> None:
result = output_parser.parse(answer)
assert result["answer"] == "foo bar answer."
score = int(result["score"])
assert score == 80
- >-
async def
test_similarity_search_with_relevance_score_with_threshold_and_filter(
vector_name: str | None,
qdrant_location: str,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
vector_name=vector_name,
location=qdrant_location,
)
score_threshold = 0.99 # for almost exact match
# test negative filter condition
negative_filter = {"page": 1, "metadata": {"page": 2, "pages": [3]}}
kwargs = {"filter": negative_filter, "score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 0
# test positive filter condition
positive_filter = {"page": 0, "metadata": {"page": 1, "pages": [2]}}
kwargs = {"filter": positive_filter, "score_threshold": score_threshold}
output = await docsearch.asimilarity_search_with_relevance_scores(
"foo", k=3, **kwargs
)
assert len(output) == 1
assert all(score >= score_threshold for _, score in output)
- source_sentence: Explain the test_empty_token logic
sentences:
- |-
def test_empty_token(self) -> None:
assert len(_get_token_ids_default_method("")) == 0
- |-
def convert_to_openai_tool(
tool: Mapping[str, Any] | type[BaseModel] | Callable | BaseTool,
*,
strict: bool | None = None,
) -> dict[str, Any]:
"""Convert a tool-like object to an OpenAI tool schema.
[OpenAI tool schema reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tools)
Args:
tool:
Either a dictionary, a `pydantic.BaseModel` class, Python function, or
`BaseTool`. If a dictionary is passed in, it is assumed to already be a
valid OpenAI function, a JSON schema with top-level `title` key specified,
an Anthropic format tool, or an Amazon Bedrock Converse format tool.
strict:
If `True`, model output is guaranteed to exactly match the JSON Schema
provided in the function definition. If `None`, `strict` argument will not
be included in tool definition.
Returns:
A dict version of the passed in tool which is compatible with the
OpenAI tool-calling API.
!!! warning "Behavior changed in `langchain-core` 0.3.16"
`description` and `parameters` keys are now optional. Only `name` is
required and guaranteed to be part of the output.
!!! warning "Behavior changed in `langchain-core` 0.3.44"
Return OpenAI Responses API-style tools unchanged. This includes
any dict with `"type"` in `"file_search"`, `"function"`,
`"computer_use_preview"`, `"web_search_preview"`.
!!! warning "Behavior changed in `langchain-core` 0.3.63"
Added support for OpenAI's image generation built-in tool.
"""
# Import locally to prevent circular import
from langchain_core.tools import Tool # noqa: PLC0415
if isinstance(tool, dict):
if tool.get("type") in _WellKnownOpenAITools:
return tool
# As of 03.12.25 can be "web_search_preview" or "web_search_preview_2025_03_11"
if (tool.get("type") or "").startswith("web_search_preview"):
return tool
if isinstance(tool, Tool) and (tool.metadata or {}).get("type") == "custom_tool":
oai_tool = {
"type": "custom",
"name": tool.name,
"description": tool.description,
}
if tool.metadata is not None and "format" in tool.metadata:
oai_tool["format"] = tool.metadata["format"]
return oai_tool
oai_function = convert_to_openai_function(tool, strict=strict)
return {"type": "function", "function": oai_function}
- |-
def strip_think_tags(text: str) -> str:
"""Removes all <think>...</think> tags and their content from text.
This function removes all occurrences of think tags, preserving text
before, between, and after the tags. It also handles markdown code fences.
Args:
text: The input text that may contain think tags.
Returns:
The text with all `<think>...</think>` blocks removed.
"""
# Remove all <think>...</think> blocks using regex
# The pattern matches <think> followed by any content (non-greedy) until </think>
result = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
# Remove markdown code fence markers if present
result = result.strip()
if result.startswith("```json"):
result = result[len("```json") :].strip()
elif result.startswith("```"):
result = result[3:].strip()
if result.endswith("```"):
result = result[:-3].strip()
return result
- source_sentence: Explain the test_dict_int_op_invalid_types logic
sentences:
- |-
def test_parse_empty_after_stripping_think_tags(self) -> None:
"""Test handling when only think tags remain."""
parser: ReasoningStructuredOutputParser[MockPerson] = (
ReasoningStructuredOutputParser(pydantic_object=MockPerson)
)
text = "<think>Only reasoning here</think>"
generation = Generation(text=text)
with pytest.raises(OutputParserException):
parser.parse_result([generation])
- |-
def test_dict_int_op_invalid_types() -> None:
left = {"a": 1, "b": "string"}
right = {"a": 2, "b": 3}
with pytest.raises(
ValueError,
match="Only dict and int values are supported",
):
_dict_int_op(left, right, operator.add)
- |-
def _invocation_params(self) -> dict[str, Any]:
params: dict = {"model": self.model, **self.model_kwargs}
if self.dimensions is not None:
params["dimensions"] = self.dimensions
return params
- source_sentence: Explain the get_allowed_tools logic
sentences:
- |-
def test_text_accessor() -> None:
"""Test that `message.text` property and `.text()` method return the same value."""
human_msg = HumanMessage(content="Hello world")
assert human_msg.text == "Hello world"
assert human_msg.text == "Hello world"
assert str(human_msg.text) == str(human_msg.text)
system_msg = SystemMessage(content="You are a helpful assistant")
assert system_msg.text == "You are a helpful assistant"
assert system_msg.text == "You are a helpful assistant"
assert str(system_msg.text) == str(system_msg.text)
ai_msg = AIMessage(content="I can help you with that")
assert ai_msg.text == "I can help you with that"
assert ai_msg.text == "I can help you with that"
assert str(ai_msg.text) == str(ai_msg.text)
tool_msg = ToolMessage(content="Task completed", tool_call_id="tool_1")
assert tool_msg.text == "Task completed"
assert tool_msg.text == "Task completed"
assert str(tool_msg.text) == str(tool_msg.text)
complex_msg = HumanMessage(
content=[{"type": "text", "text": "Hello "}, {"type": "text", "text": "world"}]
)
assert complex_msg.text == "Hello world"
assert complex_msg.text == "Hello world"
assert str(complex_msg.text) == str(complex_msg.text)
mixed_msg = AIMessage(
content=[
{"type": "text", "text": "The answer is "},
{"type": "tool_use", "name": "calculate", "input": {"x": 2}, "id": "1"},
{"type": "text", "text": "42"},
]
)
assert mixed_msg.text == "The answer is 42"
assert mixed_msg.text == "The answer is 42"
assert str(mixed_msg.text) == str(mixed_msg.text)
empty_msg = HumanMessage(content=[])
assert empty_msg.text == ""
assert empty_msg.text == ""
assert str(empty_msg.text) == str(empty_msg.text)
- |-
def __getattr__(name: str) -> Any:
"""Look up attributes dynamically."""
return _import_attribute(name)
- |-
def get_allowed_tools(self) -> list[str] | None:
"""Get allowed tools.
Returns:
Allowed tools.
"""
return None
- source_sentence: Explain the test_hashing_custom_key_encoder logic
sentences:
- |-
def __getattr__(name: str) -> Any:
"""Look up attributes dynamically."""
return _import_attribute(name)
- |-
def test_hashing_custom_key_encoder() -> None:
"""Test hashing with a custom key encoder."""
def custom_key_encoder(doc: Document) -> str:
return f"quack-{doc.metadata['key']}"
document = Document(
page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
)
hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
assert hashed_document.id == "quack-like a duck"
assert isinstance(hashed_document.id, str)
- |-
def __getattr__(name: str) -> Any:
"""Look up attributes dynamically."""
return _import_attribute(name)
pipeline_tag: sentence-similarity
library_name: sentence-transformers
jina2 Base
This is a sentence-transformers model finetuned from jinaai/jina-code-embeddings-0.5b. It maps sentences & paragraphs to a 896-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
Model Details
Model Description
- Model Type: Sentence Transformer
- Base model: jinaai/jina-code-embeddings-0.5b
- Maximum Sequence Length: 512 tokens
- Output Dimensionality: 896 dimensions
- Similarity Function: Cosine Similarity
- Language: en
- License: apache-2.0
Model Sources
- Documentation: Sentence Transformers Documentation
- Repository: Sentence Transformers on GitHub
- Hugging Face: Sentence Transformers on Hugging Face
Full Model Architecture
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'Qwen2Model'})
(1): Pooling({'word_embedding_dimension': 896, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
(2): Normalize()
)
Usage
Direct Usage (Sentence Transformers)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("ondayex/jina-embed-base-dense-retriever")
# Run inference
sentences = [
'Explain the test_hashing_custom_key_encoder logic',
'def test_hashing_custom_key_encoder() -> None:\n """Test hashing with a custom key encoder."""\n\n def custom_key_encoder(doc: Document) -> str:\n return f"quack-{doc.metadata[\'key\']}"\n\n document = Document(\n page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}\n )\n hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)\n assert hashed_document.id == "quack-like a duck"\n assert isinstance(hashed_document.id, str)',
'def __getattr__(name: str) -> Any:\n """Look up attributes dynamically."""\n return _import_attribute(name)',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 896]
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.8949, 0.1085],
# [0.8949, 1.0000, 0.1456],
# [0.1085, 0.1456, 1.0000]])
Training Details
Training Dataset
Unnamed Dataset
- Size: 900 training samples
- Columns:
`anchor` and `positive`
anchor positive type string string details - min: 4 tokens
- mean: 8.5 tokens
- max: 27 tokens
- min: 12 tokens
- mean: 153.14 tokens
- max: 512 tokens
- Samples:
anchor: How does test_multiple_msg work in Python?
positive:
def test_multiple_msg() -> None:
human_msg = HumanMessage(content="human", additional_kwargs={"key": "value"})
ai_msg = AIMessage(content="ai")
sys_msg = SystemMessage(content="sys")
msgs = [
human_msg,
ai_msg,
sys_msg,
]
assert messages_from_dict(messages_to_dict(msgs)) == msgs
# Test with tool calls
msgs = [
AIMessage(
content="",
tool_calls=[create_tool_call(name="a", args={"b": 1}, id=None)],
),
AIMessage(
content="",
tool_calls=[create_tool_call(name="c", args={"c": 2}, id=None)],
),
]
    assert messages_from_dict(messages_to_dict(msgs)) == msgs
How to implement __getattr__?
def __getattr__(name: str) -> Any:
"""Look up attributes dynamically."""
    return _import_attribute(name)
Example usage of test_multiple_msg
def test_multiple_msg() -> None:
human_msg = HumanMessage(content="human", additional_kwargs={"key": "value"})
ai_msg = AIMessage(content="ai")
sys_msg = SystemMessage(content="sys")
msgs = [
human_msg,
ai_msg,
sys_msg,
]
assert messages_from_dict(messages_to_dict(msgs)) == msgs
# Test with tool calls
msgs = [
AIMessage(
content="",
tool_calls=[create_tool_call(name="a", args={"b": 1}, id=None)],
),
AIMessage(
content="",
tool_calls=[create_tool_call(name="c", args={"c": 2}, id=None)],
),
]
    assert messages_from_dict(messages_to_dict(msgs)) == msgs
- Loss: `MatryoshkaLoss` with these parameters:
  { "loss": "MultipleNegativesRankingLoss", "matryoshka_dims": [ 768, 512, 256, 128, 64 ], "matryoshka_weights": [ 1, 1, 1, 1, 1 ], "n_dims_per_step": -1 }
Training Hyperparameters
Non-Default Hyperparameters
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 4
- gradient_accumulation_steps: 16
- learning_rate: 2e-05
- num_train_epochs: 4
- lr_scheduler_type: cosine
- warmup_ratio: 0.1
- bf16: True
- load_best_model_at_end: True
- optim: adamw_torch
- batch_sampler: no_duplicates
All Hyperparameters
Click to expand
- overwrite_output_dir: False
- do_predict: False
- eval_strategy: no
- prediction_loss_only: True
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 4
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 16
- eval_accumulation_steps: None
- torch_empty_cache_steps: None
- learning_rate: 2e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 4
- max_steps: -1
- lr_scheduler_type: cosine
- lr_scheduler_kwargs: None
- warmup_ratio: 0.1
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- bf16: True
- fp16: False
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- dataloader_num_workers: 0
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: True
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- parallelism_config: None
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- project: huggingface
- trackio_space_id: trackio
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: None
- hub_always_push: False
- hub_revision: None
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- include_for_metrics: []
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: no
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- use_liger_kernel: False
- liger_kernel_config: None
- eval_use_gather_object: False
- average_tokens_across_devices: True
- prompts: None
- batch_sampler: no_duplicates
- multi_dataset_batch_sampler: proportional
- router_mapping: {}
- learning_rate_mapping: {}
Training Logs
| Epoch | Step | Training Loss |
|---|---|---|
| 0.7111 | 10 | 0.0808 |
| 1.3556 | 20 | 0.022 |
| 2.0 | 30 | 0.0104 |
| 2.7111 | 40 | 0.0136 |
| 3.3556 | 50 | 0.0001 |
| 4.0 | 60 | 0.0442 |
Framework Versions
- Python: 3.12.12
- Sentence Transformers: 5.2.0
- Transformers: 4.57.6
- PyTorch: 2.9.0+cu126
- Accelerate: 1.12.0
- Datasets: 4.0.0
- Tokenizers: 0.22.2
Citation
BibTeX
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
MatryoshkaLoss
@misc{kusupati2024matryoshka,
title={Matryoshka Representation Learning},
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
year={2024},
eprint={2205.13147},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
MultipleNegativesRankingLoss
@misc{henderson2017efficient,
title={Efficient Natural Language Response Suggestion for Smart Reply},
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
year={2017},
eprint={1705.00652},
archivePrefix={arXiv},
primaryClass={cs.CL}
}