metadata
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- dense
- generated_from_trainer
- dataset_size:8118
- loss:CachedMultipleNegativesRankingLoss
base_model: benjamintli/modernbert-cosqa
widget:
- source_sentence: python create path if doesnt exist
sentences:
- |-
def clean_whitespace(string, compact=False):
"""Return string with compressed whitespace."""
for a, b in (('\r\n', '\n'), ('\r', '\n'), ('\n\n', '\n'),
('\t', ' '), (' ', ' ')):
string = string.replace(a, b)
if compact:
for a, b in (('\n', ' '), ('[ ', '['),
(' ', ' '), (' ', ' '), (' ', ' ')):
string = string.replace(a, b)
return string.strip()
- |-
def rotateImage(img, angle):
"""
querries scipy.ndimage.rotate routine
:param img: image to be rotated
:param angle: angle to be rotated (radian)
:return: rotated image
"""
imgR = scipy.ndimage.rotate(img, angle, reshape=False)
return imgR
- |-
def check_create_folder(filename):
"""Check if the folder exisits. If not, create the folder"""
os.makedirs(os.path.dirname(filename), exist_ok=True)
- source_sentence: how decompiled python code looks like
sentences:
- |-
def xeval(source, optimize=True):
"""Compiles to native Python bytecode and runs program, returning the
topmost value on the stack.
Args:
optimize: Whether to optimize the code after parsing it.
Returns:
None: If the stack is empty
obj: If the stack contains a single value
[obj, obj, ...]: If the stack contains many values
"""
native = xcompile(source, optimize=optimize)
return native()
- |-
def html(header_rows):
"""
Convert a list of tuples describing a table into a HTML string
"""
name = 'table%d' % next(tablecounter)
return HtmlTable([map(str, row) for row in header_rows], name).render()
- |-
def cint8_array_to_numpy(cptr, length):
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int8)):
return np.fromiter(cptr, dtype=np.int8, count=length)
else:
raise RuntimeError('Expected int pointer')
- source_sentence: python calling pytest from a python script
sentences:
- |-
def draw_image(self, ax, image):
"""Process a matplotlib image object and call renderer.draw_image"""
self.renderer.draw_image(imdata=utils.image_to_base64(image),
extent=image.get_extent(),
coordinates="data",
style={"alpha": image.get_alpha(),
"zorder": image.get_zorder()},
mplobj=image)
- |-
def test(): # pragma: no cover
"""Execute the unit tests on an installed copy of unyt.
Note that this function requires pytest to run. If pytest is not
installed this function will raise ImportError.
"""
import pytest
import os
pytest.main([os.path.dirname(os.path.abspath(__file__))])
- |-
def is_int(string):
"""
Checks if a string is an integer. If the string value is an integer
return True, otherwise return False.
Args:
string: a string to test.
Returns:
boolean
"""
try:
a = float(string)
b = int(a)
except ValueError:
return False
else:
return a == b
- source_sentence: python datetime get last day in a month
sentences:
- |-
def upgrade(directory, sql, tag, x_arg, revision):
"""Upgrade to a later version"""
_upgrade(directory, revision, sql, tag, x_arg)
- |-
def flat_list(lst):
"""This function flatten given nested list.
Argument:
nested list
Returns:
flat list
"""
if isinstance(lst, list):
for item in lst:
for i in flat_list(item):
yield i
else:
yield lst
- |-
def get_last_weekday_in_month(year, month, weekday):
"""Get the last weekday in a given month. e.g:
>>> # the last monday in Jan 2013
>>> Calendar.get_last_weekday_in_month(2013, 1, MON)
datetime.date(2013, 1, 28)
"""
day = date(year, month, monthrange(year, month)[1])
while True:
if day.weekday() == weekday:
break
day = day - timedelta(days=1)
return day
- source_sentence: first duplicate element in list in python
sentences:
- |-
def python_mime(fn):
"""
Decorator, which adds correct MIME type for python source to the decorated
bottle API function.
"""
@wraps(fn)
def python_mime_decorator(*args, **kwargs):
response.content_type = "text/x-python"
return fn(*args, **kwargs)
return python_mime_decorator
- |-
def purge_duplicates(list_in):
"""Remove duplicates from list while preserving order.
Parameters
----------
list_in: Iterable
Returns
-------
list
List of first occurences in order
"""
_list = []
for item in list_in:
if item not in _list:
_list.append(item)
return _list
- "def getRect(self):\n\t\t\"\"\"\n\t\tReturns the window bounds as a tuple of (x,y,w,h)\n\t\t\"\"\"\n\t\treturn (self.x, self.y, self.w, self.h)"
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
- cosine_accuracy@1
- cosine_accuracy@3
- cosine_accuracy@5
- cosine_accuracy@10
- cosine_precision@1
- cosine_precision@3
- cosine_precision@5
- cosine_precision@10
- cosine_recall@1
- cosine_recall@3
- cosine_recall@5
- cosine_recall@10
- cosine_ndcg@10
- cosine_mrr@10
- cosine_map@100
model-index:
- name: SentenceTransformer based on benjamintli/modernbert-cosqa
results:
- task:
type: information-retrieval
name: Information Retrieval
dataset:
name: eval
type: eval
metrics:
- type: cosine_accuracy@1
value: 0.6197339246119734
name: Cosine Accuracy@1
- type: cosine_accuracy@3
value: 0.88470066518847
name: Cosine Accuracy@3
- type: cosine_accuracy@5
value: 0.9390243902439024
name: Cosine Accuracy@5
- type: cosine_accuracy@10
value: 0.9778270509977827
name: Cosine Accuracy@10
- type: cosine_precision@1
value: 0.6197339246119734
name: Cosine Precision@1
- type: cosine_precision@3
value: 0.29490022172949004
name: Cosine Precision@3
- type: cosine_precision@5
value: 0.18780487804878046
name: Cosine Precision@5
- type: cosine_precision@10
value: 0.0977827050997783
name: Cosine Precision@10
- type: cosine_recall@1
value: 0.6197339246119734
name: Cosine Recall@1
- type: cosine_recall@3
value: 0.88470066518847
name: Cosine Recall@3
- type: cosine_recall@5
value: 0.9390243902439024
name: Cosine Recall@5
- type: cosine_recall@10
value: 0.9778270509977827
name: Cosine Recall@10
- type: cosine_ndcg@10
value: 0.8124675617500997
name: Cosine Ndcg@10
- type: cosine_mrr@10
value: 0.7577473339668463
name: Cosine Mrr@10
- type: cosine_map@100
value: 0.7588050805217604
name: Cosine Map@100
SentenceTransformer based on benjamintli/modernbert-cosqa
This is a sentence-transformers model finetuned from benjamintli/modernbert-cosqa. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
Model Details
Model Description
- Model Type: Sentence Transformer
- Base model: benjamintli/modernbert-cosqa
- Maximum Sequence Length: 512 tokens
- Output Dimensionality: 768 dimensions
- Similarity Function: Cosine Similarity
Model Sources
- Documentation: Sentence Transformers Documentation
- Repository: Sentence Transformers on GitHub
- Hugging Face: Sentence Transformers on Hugging Face
Full Model Architecture
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'OptimizedModule'})
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
Usage
Direct Usage (Sentence Transformers)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("modernbert-cosqa")
# Run inference
queries = [
"first duplicate element in list in python",
]
documents = [
'def purge_duplicates(list_in):\n """Remove duplicates from list while preserving order.\n\n Parameters\n ----------\n list_in: Iterable\n\n Returns\n -------\n list\n List of first occurences in order\n """\n _list = []\n for item in list_in:\n if item not in _list:\n _list.append(item)\n return _list',
'def getRect(self):\n\t\t"""\n\t\tReturns the window bounds as a tuple of (x,y,w,h)\n\t\t"""\n\t\treturn (self.x, self.y, self.w, self.h)',
'def python_mime(fn):\n """\n Decorator, which adds correct MIME type for python source to the decorated\n bottle API function.\n """\n @wraps(fn)\n def python_mime_decorator(*args, **kwargs):\n response.content_type = "text/x-python"\n\n return fn(*args, **kwargs)\n\n return python_mime_decorator',
]
query_embeddings = model.encode_query(queries)
document_embeddings = model.encode_document(documents)
print(query_embeddings.shape, document_embeddings.shape)
# [1, 768] [3, 768]
# Get the similarity scores for the embeddings
similarities = model.similarity(query_embeddings, document_embeddings)
print(similarities)
# tensor([[ 0.5986, -0.0006, -0.0122]])
Evaluation
Metrics
Information Retrieval
- Dataset: `eval`
- Evaluated with `InformationRetrievalEvaluator`
| Metric | Value |
|---|---|
| cosine_accuracy@1 | 0.6197 |
| cosine_accuracy@3 | 0.8847 |
| cosine_accuracy@5 | 0.939 |
| cosine_accuracy@10 | 0.9778 |
| cosine_precision@1 | 0.6197 |
| cosine_precision@3 | 0.2949 |
| cosine_precision@5 | 0.1878 |
| cosine_precision@10 | 0.0978 |
| cosine_recall@1 | 0.6197 |
| cosine_recall@3 | 0.8847 |
| cosine_recall@5 | 0.939 |
| cosine_recall@10 | 0.9778 |
| cosine_ndcg@10 | 0.8125 |
| cosine_mrr@10 | 0.7577 |
| cosine_map@100 | 0.7588 |
Training Details
Training Dataset
Unnamed Dataset
- Size: 8,118 training samples
- Columns: `query` and `positive`
- Approximate statistics based on the first 1000 samples:
  |  | query | positive |
  |---|---|---|
  | type | string | string |
  | details | min: 6 tokens<br>mean: 9.3 tokens<br>max: 23 tokens | min: 35 tokens<br>mean: 85.05 tokens<br>max: 512 tokens |
- Samples:
  - query: `python code for opening geojson file`
    positive:
    ```python
    def _loadfilepath(self, filepath, **kwargs):
        """This loads a geojson file into a geojson python
        dictionary using the json module.
        Note: to load with a different text encoding use the encoding argument.
        """
        with open(filepath, "r") as f:
            data = json.load(f, **kwargs)
        return data
    ```
  - query: `python 3 none compare with int`
    positive:
    ```python
    def is_natural(x):
        """A non-negative integer."""
        try:
            is_integer = int(x) == x
        except (TypeError, ValueError):
            return False
        return is_integer and x >= 0
    ```
  - query: `design db memory cache python`
    positive:
    ```python
    def refresh(self, document):
        """ Load a new copy of a document from the database. does not
        replace the old one """
        try:
            old_cache_size = self.cache_size
            self.cache_size = 0
            obj = self.query(type(document)).filter_by(mongo_id=document.mongo_id).one()
        finally:
            self.cache_size = old_cache_size
        self.cache_write(obj)
        return obj
    ```
- Loss: `CachedMultipleNegativesRankingLoss` with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim",
      "mini_batch_size": 64,
      "gather_across_devices": false,
      "directions": [
          "query_to_doc"
      ],
      "partition_mode": "joint",
      "hardness_mode": null,
      "hardness_strength": 0.0
  }
  ```
Evaluation Dataset
Unnamed Dataset
- Size: 902 evaluation samples
- Columns: `query` and `positive`
- Approximate statistics based on the first 902 samples:
  |  | query | positive |
  |---|---|---|
  | type | string | string |
  | details | min: 6 tokens<br>mean: 9.24 tokens<br>max: 22 tokens | min: 38 tokens<br>mean: 86.55 tokens<br>max: 332 tokens |
- Samples:
  - query: `how to remove masked items in python array`
    positive:
    ```python
    def ma(self):
        """Represent data as a masked array.
        The array is returned with column-first indexing, i.e. for a data file with
        columns X Y1 Y2 Y3 ... the array a will be a[0] = X, a[1] = Y1, ... .
        inf and nan are filtered via :func:`numpy.isfinite`.
        """
        a = self.array
        return numpy.ma.MaskedArray(a, mask=numpy.logical_not(numpy.isfinite(a)))
    ```
  - query: `python deepcopy basic type`
    positive:
    ```python
    def deepcopy(self, memo):
        """Improve deepcopy speed."""
        return type(self)(value=self._value, enum_ref=self.enum_ref)
    ```
  - query: `python number of non nan rows in a row`
    positive:
    ```python
    def count_rows_with_nans(X):
        """Count the number of rows in 2D arrays that contain any nan values."""
        if X.ndim == 2:
            return np.where(np.isnan(X).sum(axis=1) != 0, 1, 0).sum()
    ```
- Loss: `CachedMultipleNegativesRankingLoss` with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim",
      "mini_batch_size": 64,
      "gather_across_devices": false,
      "directions": [
          "query_to_doc"
      ],
      "partition_mode": "joint",
      "hardness_mode": null,
      "hardness_strength": 0.0
  }
  ```
Training Hyperparameters
Non-Default Hyperparameters
- `per_device_train_batch_size`: 1024
- `num_train_epochs`: 10
- `learning_rate`: 2e-06
- `warmup_steps`: 0.1
- `bf16`: True
- `eval_strategy`: epoch
- `per_device_eval_batch_size`: 1024
- `push_to_hub`: True
- `hub_model_id`: modernbert-cosqa
- `load_best_model_at_end`: True
- `dataloader_num_workers`: 4
- `batch_sampler`: no_duplicates
All Hyperparameters
Click to expand
- `per_device_train_batch_size`: 1024
- `num_train_epochs`: 10
- `max_steps`: -1
- `learning_rate`: 2e-06
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: None
- `warmup_steps`: 0.1
- `optim`: adamw_torch_fused
- `optim_args`: None
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `optim_target_modules`: None
- `gradient_accumulation_steps`: 1
- `average_tokens_across_devices`: True
- `max_grad_norm`: 1.0
- `label_smoothing_factor`: 0.0
- `bf16`: True
- `fp16`: False
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `use_liger_kernel`: False
- `liger_kernel_config`: None
- `use_cache`: False
- `neftune_noise_alpha`: None
- `torch_empty_cache_steps`: None
- `auto_find_batch_size`: False
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `include_num_input_tokens_seen`: no
- `log_level`: passive
- `log_level_replica`: warning
- `disable_tqdm`: False
- `project`: huggingface
- `trackio_space_id`: trackio
- `eval_strategy`: epoch
- `per_device_eval_batch_size`: 1024
- `prediction_loss_only`: True
- `eval_on_start`: False
- `eval_do_concat_batches`: True
- `eval_use_gather_object`: False
- `eval_accumulation_steps`: None
- `include_for_metrics`: []
- `batch_eval_metrics`: False
- `save_only_model`: False
- `save_on_each_node`: False
- `enable_jit_checkpoint`: False
- `push_to_hub`: True
- `hub_private_repo`: None
- `hub_model_id`: modernbert-cosqa
- `hub_strategy`: every_save
- `hub_always_push`: False
- `hub_revision`: None
- `load_best_model_at_end`: True
- `ignore_data_skip`: False
- `restore_callback_states_from_checkpoint`: False
- `full_determinism`: False
- `seed`: 42
- `data_seed`: None
- `use_cpu`: False
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `parallelism_config`: None
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 4
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `dataloader_prefetch_factor`: None
- `remove_unused_columns`: True
- `label_names`: None
- `train_sampling_strategy`: random
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `ddp_backend`: None
- `ddp_timeout`: 1800
- `fsdp`: []
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `deepspeed`: None
- `debug`: []
- `skip_memory_metrics`: True
- `do_predict`: False
- `resume_from_checkpoint`: None
- `warmup_ratio`: None
- `local_rank`: -1
- `prompts`: None
- `batch_sampler`: no_duplicates
- `multi_dataset_batch_sampler`: proportional
- `router_mapping`: {}
- `learning_rate_mapping`: {}
Training Logs
| Epoch | Step | Training Loss | Validation Loss | eval_cosine_ndcg@10 |
|---|---|---|---|---|
| 1.0 | 8 | - | 0.3550 | 0.8071 |
| 1.25 | 10 | 1.0218 | - | - |
| 2.0 | 16 | - | 0.3508 | 0.8110 |
| 2.5 | 20 | 0.9890 | - | - |
| 3.0 | 24 | - | 0.3466 | 0.8131 |
| 3.75 | 30 | 0.9778 | - | - |
| 4.0 | 32 | - | 0.3439 | 0.8136 |
| 5.0 | 40 | 0.9507 | 0.3417 | 0.8148 |
| 6.0 | 48 | - | 0.3404 | 0.8120 |
| 6.25 | 50 | 0.9429 | - | - |
| 7.0 | 56 | - | 0.3387 | 0.8131 |
| 7.5 | 60 | 0.9267 | - | - |
| 8.0 | 64 | - | 0.3378 | 0.8127 |
| 8.75 | 70 | 0.9396 | - | - |
| 9.0 | 72 | - | 0.3370 | 0.8106 |
| 10.0 | 80 | 0.9099 | 0.3366 | 0.8125 |
- The bold row denotes the saved checkpoint.
Framework Versions
- Python: 3.12.12
- Sentence Transformers: 5.3.0
- Transformers: 5.3.0
- PyTorch: 2.10.0+cu128
- Accelerate: 1.13.0
- Datasets: 4.8.2
- Tokenizers: 0.22.2
Citation
BibTeX
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
CachedMultipleNegativesRankingLoss
@misc{gao2021scaling,
title={Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
author={Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
year={2021},
eprint={2101.06983},
archivePrefix={arXiv},
primaryClass={cs.LG}
}