nomic-embed-text1.5-ftcode / README.md

JahnaviKumar

Add new SentenceTransformer model

e7198a7 verified about 2 months ago

preview code

raw

history blame contribute delete

18.7 kB

metadata

tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - dense
  - generated_from_trainer
  - dataset_size:100
  - loss:MatryoshkaLoss
  - loss:MultipleNegativesRankingLoss
base_model: nomic-ai/nomic-embed-text-v1.5
widget:
  - source_sentence: "func SetFactory(ctx context.Context, f Factory) context.Context {\n\treturn"
    sentences:
      - rm -r path
      - |-
        Transforms an array into a DateTime.

        @param array $value Array value.

        @return DateTime DateTime value.
      - |2-
         context.WithValue(ctx, &clockKey, f)
        }
  - source_sentence: >
      public function hyvesTipUrl($title, $body, $categoryId = 12, $rating = 5)
      {

              $url = 'http://www.hyves-share.nl/button/tip/?tipcategoryid=%s&rating=%s&title=%s&body=%s';
    sentences:
      - " by a TLS client to\n\t// authenticate itself to the TLS server.\n\ttemplate.ExtKeyUsage = append(template.ExtKeyUsage, x509.ExtKeyUsageClientAuth)\n\n\tt := time.Now().UnixNano()\n\ttemplate.SerialNumber = pki.BuildPKISerial(t)\n\n\tcertificate, err := pki.SignNewCertificate(privateKey, template, caCert.Certificate, caKey)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"error signing certificate for master kubelet: %v\", err)\n\t}\n\n\tcaBytes, err := caCert.AsBytes()\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed to get certificate authority data: %s\", err)\n\t}\n\tcertBytes, err := certificate.AsBytes()\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed to get certificate data: %s\", err)\n\t}\n\tkeyBytes, err := privateKey.AsBytes()\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed to get private key data: %s\", err)\n\t}\n\n\tcontent, err := b.BuildKubeConfig(\"kubelet\", caBytes, certBytes, keyBytes)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn &nodetasks.File{\n\t\tPath:     b.KubeletKubeConfig(),\n\t\tContents: fi.NewStringResource(content),\n\t\tType:     nodetasks.FileType_File,\n\t\tMode:     s(\"600\"),\n\t}, nil\n}"
      - |-
        Executes the current query and returns the response

        @throws \Cassandra\Response\Exception
        @return \Cassandra\Response
      - |2-
                $title = $title;
                $body = $body;
                return sprintf($url, $categoryId, $rating, $title, $body);
            }
  - source_sentence: |-
      public function get($key, $default = null, $dot_syntax = true)
          {
              if ($dot_syntax === true) {
                  $paths = explode('.', $key);
                  $node =& $this->_data;
                  
                  foreach ($paths as $path) {
                      if (!is_array($node) || !isset($node[$path])) {
                          // error occurred
                          return $default;
                      }
                      $node =& $node[$path];
                  }
                  
                  return $node;
                  
              } else {
                  
                  return isset($this->_data[$key]) ? $this->_data[$key] : $default;
                  
              }
          }
    sentences:
      - // PrintShortName turns a pkix.Name into a string of RDN tuples.
      - >-
        Here is the code to create an array, add elements, sort in ascending
        order, and print the elements in reverse order in Java:


        ```java

        import java.util.Arrays;


        public class Main {
            public static void main(String[] args) {
                // Create an array
                int[] array = {5, 7, 3};

                // Sort the array in ascending order
                Arrays.sort(array);

                // Print the elements in reverse order
                for (int i = array.length - 1; i >= 0; i--) {
                    System.out.println(array[i]);
                }
            }
        }

        ```


        Output:

        ```

        7

        5

        3

        ```


        In the code above, we import the `Arrays` class from the `java.util`
        package to use the `sort()` method for sorting the array. We create an
        integer array `array` with the given elements. The `Arrays.sort(array)`
        method sorts the array in ascending order. Finally, we loop through the
        array in reverse order starting from the last index (`array.length - 1`)
        and print each element using `System.out.println()`.
      - |-
        Returns a single item from the collection data.

        @param string $key
        @return mixed
  - source_sentence: |-
      def iter(self, query, *parameters, **kwargs):
              """Returns a generator for records from the query."""
              cursor = self._cursor()
              try:
                  self._execute(cursor, query, parameters or None, kwargs)
                  if cursor.description:
                      column_names = [column.name for column in cursor.description]
                      while True:
                          record = cursor.fetchone()
                          if not record:
                              break
                          yield Row(zip(column_names, record))
                  raise StopIteration

              except:
                  cursor.close()
                  raise
    sentences:
      - |-
        def exit(exit_code=0):
          r"""A function to support exiting from exit hooks.

          Could also be used to exit from the calling scripts in a thread safe manner.
          """
          core.processExitHooks()

          if state.isExitHooked and not hasattr(sys, 'exitfunc'): # The function is called from the exit hook
            sys.stderr.flush()
            sys.stdout.flush()
            os._exit(exit_code) #pylint: disable=W0212

          sys.exit(exit_code)
      - Returns a generator for records from the query.
      - |2-
         """

                url = self.file['url']
                args = ['{0}={1}'.format(k, v) for k, v in kwargs.items()]

                if args:
                    url += '?{0}'.format('&'.join(args))

                return url
  - source_sentence: >-
      What is the total CO2 emission from all aquaculture farms in the year
      2021?
    sentences:
      - |2-
         && value.size == value.uniq.size
              else
                result
              end
            end
      - "\n\treturn c.postJSON(\"joberror\", args)\n}"
      - SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;
pipeline_tag: sentence-similarity
library_name: sentence-transformers

SentenceTransformer based on nomic-ai/nomic-embed-text-v1.5

This is a [sentence-transformers](https://www.SBERT.net) model fine-tuned from [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5). It maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

Model Type: Sentence Transformer
Base model: nomic-ai/nomic-embed-text-v1.5
Maximum Sequence Length: 8192 tokens
Output Dimensionality: 768 dimensions
Similarity Function: Cosine Similarity

Model Sources

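- Documentation: [Sentence Transformers Documentation](https://sbert.net)
- Repository: [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
- Hugging Face: [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)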

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False, 'architecture': 'NomicBertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("JahnaviKumar/nomic-embed-text1.5-ftcode")
# Run inference
queries = [
    "What is the total CO2 emission from all aquaculture farms in the year 2021?",
]
documents = [
    'SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;',
    '\n\treturn c.postJSON("joberror", args)\n}',
    ' && value.size == value.uniq.size\n      else\n        result\n      end\n    end',
]
query_embeddings = model.encode_query(queries)
document_embeddings = model.encode_document(documents)
print(query_embeddings.shape, document_embeddings.shape)
# (1, 768) (3, 768)

# Get the similarity scores for the embeddings
similarities = model.similarity(query_embeddings, document_embeddings)
print(similarities)
# tensor([[0.7075, 0.3913, 0.3213]])

Training Details

Training Dataset

Unnamed Dataset

Size: 100 training samples
Columns: query and corpus
Approximate statistics based on the first 100 samples:
|         | query | corpus |
|:--------|:------|:-------|
| type    | string | string |
| details | <ul><li>min: 6 tokens</li><li>mean: 138.88 tokens</li><li>max: 1004 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 95.76 tokens</li><li>max: 1151 tokens</li></ul> |

Samples:

| query | corpus |
|:------|:-------|
| `def add_data_file(data_files, target, source): """Add an entry to data_files""" for t, f in data_files: if t == target: break else:` | `data_files.append((target, [])) f = data_files[-1][1] if source not in f: f.append(source)` |
| `function verify (token, options) { options = options \|\| {} options.issuer = options.issuer \|\| this.issuer options.client_id = options.client_id \|\| this.client_id options.client_secret = options.client_secret \|\| this.client_secret options.scope = options.scope \|\| this.scope options.key = options.key \|\| this.jwks.sig return new Promise(function (resolve, reject) { AccessToken.verify(token, options, function (err, claims) { if (err) { return reject(err) } resolve(claims) }) }) }` | `Verifies a given OIDC token @method verify @param token {String} JWT AccessToken for OpenID Connect (base64 encoded) @param [options={}] {Object} Options hashmap @param [options.issuer] {String} OIDC Provider/Issuer URL @param [options.key] {Object} Issuer's public key for signatures (jwks.sig) @param [options.client_id] {String} @param [options.client_secret {String} @param [options.scope] {String} @throws {UnauthorizedError} HTTP 401 or 403 errors (invalid tokens etc) @return {Promise}` |
| `def _combine_lines(self, lines): """ Combines a list of JSON objects into one JSON object. """` | `lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']'` |

Loss: MatryoshkaLoss with these parameters:

{
    "loss": "MultipleNegativesRankingLoss",
    "matryoshka_dims": [
        768,
        512,
        256,
        128,
        64
    ],
    "matryoshka_weights": [
        1,
        1,
        1,
        1,
        1
    ],
    "n_dims_per_step": -1
}

Framework Versions

Python: 3.10.12
Sentence Transformers: 5.1.1
Transformers: 4.54.1
PyTorch: 2.9.0+cu128
Accelerate: 1.10.1
Datasets: 4.2.0
Tokenizers: 0.21.4

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning},
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

MultipleNegativesRankingLoss

@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}