SentenceTransformer based on benjamintli/modernbert-cosqa
This is a sentence-transformers model finetuned from benjamintli/modernbert-cosqa. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
Model Details
Model Description
- Model Type: Sentence Transformer
- Base model: benjamintli/modernbert-cosqa
- Maximum Sequence Length: 512 tokens
- Output Dimensionality: 768 dimensions
- Similarity Function: Cosine Similarity
Model Sources
Full Model Architecture
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'OptimizedModule'})
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
Usage
Direct Usage (Sentence Transformers)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("modernbert-codesearchnet")
queries = [
"Split the data object along a given expression, in units.\n\n Parameters\n ----------\n expression : int or str\n The expression to split along. If given as an integer, the axis at that index\n is used.\n positions : number-type or 1D array-type\n The position(s) to split at, in units.\n units : str (optional)\n The units of the given positions. Default is same, which assumes\n input units are identical to first variable units.\n parent : WrightTools.Collection (optional)\n The parent collection in which to place the \u0027split\u0027 collection.\n Default is a new Collection.\n verbose : bool (optional)\n Toggle talkback. Default is True.\n\n Returns\n -------\n WrightTools.collection.Collection\n A Collection of data objects.\n The order of the objects is such that the axis points retain their original order.\n\n See Also\n --------\n chop\n Divide the dataset into its lower-dimensionality components.\n collapse\n Collapse the dataset along one axis.",
]
documents = [
'def split(\n self, expression, positions, *, units=None, parent=None, verbose=True\n ) -> wt_collection.Collection:\n """\n Split the data object along a given expression, in units.\n\n Parameters\n ----------\n expression : int or str\n The expression to split along. If given as an integer, the axis at that index\n is used.\n positions : number-type or 1D array-type\n The position(s) to split at, in units.\n units : str (optional)\n The units of the given positions. Default is same, which assumes\n input units are identical to first variable units.\n parent : WrightTools.Collection (optional)\n The parent collection in which to place the \'split\' collection.\n Default is a new Collection.\n verbose : bool (optional)\n Toggle talkback. Default is True.\n\n Returns\n -------\n WrightTools.collection.Collection\n A Collection of data objects.\n The order of the objects is such that the axis points retain their original order.\n\n See Also\n --------\n chop\n Divide the dataset into its lower-dimensionality components.\n collapse\n Collapse the dataset along one axis.\n """\n # axis ------------------------------------------------------------------------------------\n old_expr = self.axis_expressions\n old_units = self.units\n out = wt_collection.Collection(name="split", parent=parent)\n if isinstance(expression, int):\n if units is None:\n units = self._axes[expression].units\n expression = self._axes[expression].expression\n elif isinstance(expression, str):\n pass\n else:\n raise TypeError("expression: expected {int, str}, got %s" % type(expression))\n\n self.transform(expression)\n if units:\n self.convert(units)\n\n try:\n positions = [-np.inf] + sorted(list(positions)) + [np.inf]\n except TypeError:\n positions = [-np.inf, positions, np.inf]\n\n values = self._axes[0].full\n masks = [(values >= lo) & (values < hi) for lo, hi in wt_kit.pairwise(positions)]\n omasks = []\n cuts = []\n for mask in masks:\n try:\n omasks.append(wt_kit.mask_reduce(mask))\n 
cuts.append([i == 1 for i in omasks[-1].shape])\n # Ensure at least one axis is kept\n if np.all(cuts[-1]):\n cuts[-1][0] = False\n except ValueError:\n omasks.append(None)\n cuts.append(None)\n for i in range(len(positions) - 1):\n out.create_data("split%03i" % i)\n\n for var in self.variables:\n for i, (imask, omask, cut) in enumerate(zip(masks, omasks, cuts)):\n if omask is None:\n # Zero length split\n continue\n omask = wt_kit.enforce_mask_shape(omask, var.shape)\n omask.shape = tuple([s for s, c in zip(omask.shape, cut) if not c])\n out_arr = np.full(omask.shape, np.nan)\n imask = wt_kit.enforce_mask_shape(imask, var.shape)\n out_arr[omask] = var[:][imask]\n out[i].create_variable(values=out_arr, **var.attrs)\n\n for ch in self.channels:\n for i, (imask, omask, cut) in enumerate(zip(masks, omasks, cuts)):\n if omask is None:\n # Zero length split\n continue\n omask = wt_kit.enforce_mask_shape(omask, ch.shape)\n omask.shape = tuple([s for s, c in zip(omask.shape, cut) if not c])\n out_arr = np.full(omask.shape, np.nan)\n imask = wt_kit.enforce_mask_shape(imask, ch.shape)\n out_arr[omask] = ch[:][imask]\n out[i].create_channel(values=out_arr, **ch.attrs)\n\n if verbose:\n for d in out.values():\n try:\n d.transform(expression)\n except IndexError:\n continue\n\n print("split data into {0} pieces along <{1}>:".format(len(positions) - 1, expression))\n for i, (lo, hi) in enumerate(wt_kit.pairwise(positions)):\n new_data = out[i]\n if new_data.shape == ():\n print(" {0} : None".format(i))\n else:\n new_axis = new_data.axes[0]\n print(\n " {0} : {1:0.2f} to {2:0.2f} {3} {4}".format(\n i, lo, hi, new_axis.units, new_axis.shape\n )\n )\n\n for d in out.values():\n try:\n d.transform(*old_expr)\n keep = []\n keep_units = []\n for ax in d.axes:\n if ax.size > 1:\n keep.append(ax.expression)\n keep_units.append(ax.units)\n else:\n d.create_constant(ax.expression, verbose=False)\n d.transform(*keep)\n for ax, u in zip(d.axes, keep_units):\n ax.convert(u)\n except 
IndexError:\n continue\n tempax = Axis(d, expression)\n if all(\n np.all(\n np.sum(~np.isnan(tempax.masked), axis=tuple(set(range(tempax.ndim)) - {j}))\n <= 1\n )\n for j in range(tempax.ndim)\n ):\n d.create_constant(expression, verbose=False)\n self.transform(*old_expr)\n for ax, u in zip(self.axes, old_units):\n ax.convert(u)\n\n return out',
'def add_item(self, title, key, synonyms=None, description=None, img_url=None):\n """Adds item to a list or carousel card.\n\n A list must contain at least 2 items, each requiring a title and object key.\n\n Arguments:\n title {str} -- Name of the item object\n key {str} -- Key refering to the item.\n This string will be used to send a query to your app if selected\n\n Keyword Arguments:\n synonyms {list} -- Words and phrases the user may send to select the item\n (default: {None})\n description {str} -- A description of the item (default: {None})\n img_url {str} -- URL of the image to represent the item (default: {None})\n """\n item = build_item(title, key, synonyms, description, img_url)\n self._items.append(item)\n return self',
'def compare(a, b):\n """Compares two timestamps.\n\n ``a`` and ``b`` must be the same type, in addition to normal\n representations of timestamps that order naturally, they can be rfc3339\n formatted strings.\n\n Args:\n a (string|object): a timestamp\n b (string|object): another timestamp\n\n Returns:\n int: -1 if a < b, 0 if a == b or 1 if a > b\n\n Raises:\n ValueError: if a or b are not the same type\n ValueError: if a or b strings but not in valid rfc3339 format\n\n """\n a_is_text = isinstance(a, basestring)\n b_is_text = isinstance(b, basestring)\n if type(a) != type(b) and not (a_is_text and b_is_text):\n _logger.error(u\'Cannot compare %s to %s, types differ %s!=%s\',\n a, b, type(a), type(b))\n raise ValueError(u\'cannot compare inputs of differing types\')\n\n if a_is_text:\n a = from_rfc3339(a, with_nanos=True)\n b = from_rfc3339(b, with_nanos=True)\n\n if a < b:\n return -1\n elif a > b:\n return 1\n else:\n return 0',
]
query_embeddings = model.encode_query(queries)
document_embeddings = model.encode_document(documents)
print(query_embeddings.shape, document_embeddings.shape)
similarities = model.similarity(query_embeddings, document_embeddings)
print(similarities)
Evaluation
Metrics
Information Retrieval
| Metric              | Value  |
|:--------------------|:-------|
| cosine_accuracy@1   | 0.9481 |
| cosine_accuracy@3   | 0.9703 |
| cosine_accuracy@5   | 0.9752 |
| cosine_accuracy@10  | 0.9807 |
| cosine_precision@1  | 0.9481 |
| cosine_precision@3  | 0.3234 |
| cosine_precision@5  | 0.195  |
| cosine_precision@10 | 0.0981 |
| cosine_recall@1     | 0.9481 |
| cosine_recall@3     | 0.9703 |
| cosine_recall@5     | 0.9752 |
| cosine_recall@10    | 0.9807 |
| cosine_ndcg@10      | 0.9652 |
| cosine_mrr@10       | 0.9602 |
| cosine_map@100      | 0.9606 |
Training Details
Training Dataset
Unnamed Dataset
- Size: 369,762 training samples
- Columns: `query` and `positive`
- Approximate statistics based on the first 1000 samples:
|         | query | positive |
|:--------|:------|:---------|
| type    | string | string |
| details | <ul><li>min: 3 tokens</li><li>mean: 71.9 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 37 tokens</li><li>mean: 236.1 tokens</li><li>max: 512 tokens</li></ul> |
- Samples:
| query |
positive |
Returns group object for datacenter root group.
>>> clc.v2.Datacenter().RootGroup() >>> print _ WA1 Hardware |
def RootGroup(self): """Returns group object for datacenter root group.
>>> clc.v2.Datacenter().RootGroup() >>> print _ WA1 Hardware
"""
return(clc.v2.Group(id=self.root_group_id,alias=self.alias,session=self.session)) |
Calculate the euclidean distance of all array positions in "matchArr".
:param matchArr: a dictionary of numpy.arrays containing at least two entries that are treated as cartesian coordinates. :param tKey: #TODO: docstring :param mKey: #TODO: docstring
:returns: #TODO: docstring
{'eucDist': numpy.array([eucDistance, eucDistance, ...]), 'posPairs': numpy.array([[pos1, pos2], [pos1, pos2], ...]) } |
def calcDistMatchArr(matchArr, tKey, mKey): """Calculate the euclidean distance of all array positions in "matchArr".
:param matchArr: a dictionary of numpy.arrays containing at least two entries that are treated as cartesian coordinates. :param tKey: #TODO: docstring :param mKey: #TODO: docstring
:returns: #TODO: docstring
{'eucDist': numpy.array([eucDistance, eucDistance, ...]), 'posPairs': numpy.array([[pos1, pos2], [pos1, pos2], ...]) } """ #Calculate all sorted list of all eucledian feature distances matchArrSize = listvalues(matchArr)[0].size
distInfo = {'posPairs': list(), 'eucDist': list()} _matrix = numpy.swapaxes(numpy.array([matchArr[tKey], matchArr[mKey]]), 0, 1)
for pos1 in range(matchArrSize-1): for pos2 in range(pos1+1, matchArrSize): distInfo['posPairs'].append((pos1, pos2)) distInfo['posPairs'] = numpy.array(distInfo['posPairs']) distInfo['eucD... |
Format this verifier
Returns: string: A formatted string |
def format(self, indent_level, indent_size=4): """Format this verifier
Returns: string: A formatted string """
name = self.format_name('Literal', indent_size)
if self.long_desc is not None: name += '\n'
name += self.wrap_lines('value: %s\n' % str(self._literal), 1, indent_size)
return self.wrap_lines(name, indent_level, indent_size) |
- Loss: CachedMultipleNegativesRankingLoss with these parameters:
{
"scale": 20.0,
"similarity_fct": "cos_sim",
"mini_batch_size": 64,
"gather_across_devices": false,
"directions": [
"query_to_doc"
],
"partition_mode": "joint",
"hardness_mode": null,
"hardness_strength": 0.0
}
Evaluation Dataset
Unnamed Dataset
- Size: 19,462 evaluation samples
- Columns: `query` and `positive`
- Approximate statistics based on the first 1000 samples:
|         | query | positive |
|:--------|:------|:---------|
| type    | string | string |
| details | <ul><li>min: 3 tokens</li><li>mean: 71.05 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 40 tokens</li><li>mean: 236.22 tokens</li><li>max: 512 tokens</li></ul> |
- Samples:
| query |
positive |
Create a new ParticipantInstance
:param unicode attributes: An optional string metadata field you can use to store any data you wish. :param unicode twilio_address: The address of the Twilio phone number that the participant is in contact with. :param datetime date_created: The date that this resource was created. :param datetime date_updated: The date that this resource was last updated. :param unicode identity: A unique string identifier for the session participant as Chat User. :param unicode user_address: The address of the participant's device.
:returns: Newly created ParticipantInstance :rtype: twilio.rest.messaging.v1.session.participant.ParticipantInstance |
def create(self, attributes=values.unset, twilio_address=values.unset, date_created=values.unset, date_updated=values.unset, identity=values.unset, user_address=values.unset): """ Create a new ParticipantInstance
:param unicode attributes: An optional string metadata field you can use to store any data you wish. :param unicode twilio_address: The address of the Twilio phone number that the participant is in contact with. :param datetime date_created: The date that this resource was created. :param datetime date_updated: The date that this resource was last updated. :param unicode identity: A unique string identifier for the session participant as Chat User. :param unicode user_address: The address of the participant's device.
:returns: Newly created ParticipantInstance :rtype: twilio.rest.messaging.v1.session.participant.ParticipantInstance """ data = values.o... |
It returns absolute url defined by node related to this page |
def get_absolute_url(self): """ It returns absolute url defined by node related to this page """ try: node = Node.objects.select_related().filter(page=self)[0] return node.get_absolute_url() except Exception, e: raise ValueError(u"Error in {0}.{1}: {2}".format(self.__module__, self.__class__.__name__, e)) return u"" |
Return the current scaled font.
:return: A new :class:`ScaledFont` object, wrapping an existing cairo object. |
def get_scaled_font(self): """Return the current scaled font.
:return: A new :class:`ScaledFont` object, wrapping an existing cairo object.
""" return ScaledFont._from_pointer( cairo.cairo_get_scaled_font(self._pointer), incref=True) |
- Loss: CachedMultipleNegativesRankingLoss with these parameters:
{
"scale": 20.0,
"similarity_fct": "cos_sim",
"mini_batch_size": 64,
"gather_across_devices": false,
"directions": [
"query_to_doc"
],
"partition_mode": "joint",
"hardness_mode": null,
"hardness_strength": 0.0
}
Training Hyperparameters
Non-Default Hyperparameters
per_device_train_batch_size: 8192
num_train_epochs: 1
learning_rate: 2e-06
warmup_steps: 0.1
bf16: True
eval_strategy: epoch
per_device_eval_batch_size: 8192
push_to_hub: True
hub_model_id: modernbert-codesearchnet
load_best_model_at_end: True
dataloader_num_workers: 4
batch_sampler: no_duplicates
All Hyperparameters
Click to expand
per_device_train_batch_size: 8192
num_train_epochs: 1
max_steps: -1
learning_rate: 2e-06
lr_scheduler_type: linear
lr_scheduler_kwargs: None
warmup_steps: 0.1
optim: adamw_torch_fused
optim_args: None
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
optim_target_modules: None
gradient_accumulation_steps: 1
average_tokens_across_devices: True
max_grad_norm: 1.0
label_smoothing_factor: 0.0
bf16: True
fp16: False
bf16_full_eval: False
fp16_full_eval: False
tf32: None
gradient_checkpointing: False
gradient_checkpointing_kwargs: None
torch_compile: False
torch_compile_backend: None
torch_compile_mode: None
use_liger_kernel: False
liger_kernel_config: None
use_cache: False
neftune_noise_alpha: None
torch_empty_cache_steps: None
auto_find_batch_size: False
log_on_each_node: True
logging_nan_inf_filter: True
include_num_input_tokens_seen: no
log_level: passive
log_level_replica: warning
disable_tqdm: False
project: huggingface
trackio_space_id: trackio
eval_strategy: epoch
per_device_eval_batch_size: 8192
prediction_loss_only: True
eval_on_start: False
eval_do_concat_batches: True
eval_use_gather_object: False
eval_accumulation_steps: None
include_for_metrics: []
batch_eval_metrics: False
save_only_model: False
save_on_each_node: False
enable_jit_checkpoint: False
push_to_hub: True
hub_private_repo: None
hub_model_id: modernbert-codesearchnet
hub_strategy: every_save
hub_always_push: False
hub_revision: None
load_best_model_at_end: True
ignore_data_skip: False
restore_callback_states_from_checkpoint: False
full_determinism: False
seed: 42
data_seed: None
use_cpu: False
accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
parallelism_config: None
dataloader_drop_last: False
dataloader_num_workers: 4
dataloader_pin_memory: True
dataloader_persistent_workers: False
dataloader_prefetch_factor: None
remove_unused_columns: True
label_names: None
train_sampling_strategy: random
length_column_name: length
ddp_find_unused_parameters: None
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: False
ddp_backend: None
ddp_timeout: 1800
fsdp: []
fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
deepspeed: None
debug: []
skip_memory_metrics: True
do_predict: False
resume_from_checkpoint: None
warmup_ratio: None
local_rank: -1
prompts: None
batch_sampler: no_duplicates
multi_dataset_batch_sampler: proportional
router_mapping: {}
learning_rate_mapping: {}
Training Logs
| Epoch   | Step   | Training Loss | Validation Loss | eval_cosine_ndcg@10 |
|:-------:|:------:|:-------------:|:---------------:|:-------------------:|
| 0.2174  | 10     | 0.9210        | -               | -                   |
| 0.4348  | 20     | 0.6679        | -               | -                   |
| 0.6522  | 30     | 0.5007        | -               | -                   |
| 0.8696  | 40     | 0.4181        | -               | -                   |
| **1.0** | **46** | **-**         | **0.0328**      | **0.9652**          |
- The bold row denotes the saved checkpoint.
Framework Versions
- Python: 3.12.12
- Sentence Transformers: 5.3.0
- Transformers: 5.3.0
- PyTorch: 2.10.0+cu128
- Accelerate: 1.13.0
- Datasets: 4.8.2
- Tokenizers: 0.22.2
Citation
BibTeX
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
CachedMultipleNegativesRankingLoss
@misc{gao2021scaling,
title={Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
author={Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
year={2021},
eprint={2101.06983},
archivePrefix={arXiv},
primaryClass={cs.LG}
}