peanderson committed
Commit ff649e3 · 1 Parent(s): 75cec6c

First model commit
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
README.md CHANGED
@@ -1,3 +1,126 @@
  ---
  license: mit
+ language:
+ - en
+ base_model:
+ - intfloat/multilingual-e5-base
  ---
+ ## BAM Embeddings (multilingual-e5-base)
+
+ Text embeddings specialized for retrieval in the finance domain.
+
+ [Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt](https://aclanthology.org/2024.emnlp-industry.26.pdf).
+ Peter Anderson, Mano Vikash Janardhanan, Jason He, Wei Cheng, Charlie Flanagan. EMNLP 2024.
+
+ This model has 12 layers and an embedding size of 768.
+
+ ## Usage
+
+ Below is an example of encoding queries and passages for text retrieval.
+
+ ```python
+ import torch.nn.functional as F
+ from torch import Tensor
+ from transformers import AutoTokenizer, AutoModel
+
+
+ def average_pool(last_hidden_states: Tensor,
+                  attention_mask: Tensor) -> Tensor:
+     # Zero out padding positions, then mean-pool over the sequence dimension.
+     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+
+ # Each input text should start with "query: " or "passage: ", even for non-English texts.
+ # For tasks other than retrieval, you can simply use the "query: " prefix.
+ input_texts = [
+     "query: What is a callback provision?",
+     "query: EverCommerce revenue headwinds",
+     "passage: Beazley PLC/ADR - But they're saying, do you confirm prior to issuing an invoice that this is the correct, or prior to paying an invoice that this is the correct...",
+     "passage: EverCommerce Inc\nWe are assuming coverage of EverCommerce, which is among the leading SaaS platforms in the services sector for SMBs..."
+ ]
+
+ tokenizer = AutoTokenizer.from_pretrained('BalyasnyAI/multilingual-e5-base')
+ model = AutoModel.from_pretrained('BalyasnyAI/multilingual-e5-base')
+
+ # Tokenize the input texts
+ batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
+
+ outputs = model(**batch_dict)
+ embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+
+ # Normalize embeddings, then score each query against each passage
+ embeddings = F.normalize(embeddings, p=2, dim=1)
+ scores = (embeddings[:2] @ embeddings[2:].T) * 100
+ print(scores.tolist())
+ ```
+
+ ## Supported Languages
+
+ This model is initialized from [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base)
+ and finetuned on English datasets. Other languages may see lower performance.
+
+ ## Training Details
+
+ **Initialization**: [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base)
+
+ **Finetuning**: contrastive loss with synthetically generated queries and hard negatives
+
+ | Dataset              | Weak supervision                | # of text pairs |
+ |----------------------|---------------------------------|-----------------|
+ | BAM internal dataset | (text passage, synthetic query) | 14.3M           |
+
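The finetuning objective above can be sketched as an InfoNCE-style contrastive loss over (query, passage) pairs. This is a minimal illustration with in-batch negatives only; the temperature value and batch handling are assumptions for illustration, not the paper's exact setup:

```python
import torch
import torch.nn.functional as F

def contrastive_loss(q: torch.Tensor, p: torch.Tensor, temperature: float = 0.05) -> torch.Tensor:
    # q, p: (batch, dim) L2-normalized query and passage embeddings.
    # The positive for query i is passage i; every other passage in the
    # batch (including any hard negatives mixed in) acts as a negative.
    logits = q @ p.T / temperature     # (batch, batch) scaled cosine similarities
    targets = torch.arange(q.size(0))  # diagonal entries are the positives
    return F.cross_entropy(logits, targets)

# Toy example with random normalized embeddings
q = F.normalize(torch.randn(8, 768), dim=1)
p = F.normalize(torch.randn(8, 768), dim=1)
loss = contrastive_loss(q, p)
```

Hard negatives are typically appended to the passage side of the batch so they enter the denominator of the softmax.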
+ ## Support for Sentence Transformers
+
+ Below is an example of usage with sentence_transformers.
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer('BalyasnyAI/multilingual-e5-base')
+ input_texts = [
+     "query: What is a callback provision?",
+     "query: EverCommerce revenue headwinds",
+     "passage: Beazley PLC/ADR - But they're saying, do you confirm prior to issuing an invoice that this is the correct, or prior to paying an invoice that this is the correct...",
+     "passage: EverCommerce Inc\nWe are assuming coverage of EverCommerce, which is among the leading SaaS platforms in the services sector for SMBs..."
+ ]
+ embeddings = model.encode(input_texts, normalize_embeddings=True)
+ ```
+
+ Package requirements:
+
+ `pip install sentence_transformers~=2.2.2`
+
+ ## TIPS FOR BEST PERFORMANCE
+
+ **1. Always add the correct text prefix, either "query: " or "passage: ", to input texts**
+
+ This is how the model was trained; omitting the prefix degrades performance.
+
+ Here are some rules of thumb:
+ - Use "query: " and "passage: " respectively for asymmetric tasks such as passage retrieval.
+ - Use the "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, and paraphrase retrieval.
+ - Use the "query: " prefix if you want to use embeddings as features, e.g. for linear-probing classification or clustering.
+
+ **2. Add Context to Passages**
+
+ When a document is split into individual text passages for embedding, these passages are frequently missing crucial information such as the title of the document, or the name and ticker of the company it relates to. To overcome this problem, BAM embeddings are trained to work well with *one line of document context added to the beginning of each text passage* (followed by a newline).
+
+ It's up to you what document context to use. We have had success with combinations of the document title, author name and bio, company name, ticker, event, and date, depending on the application, e.g. "Google GOOG FY23 earnings call\n". Only one line of document context is needed.
+
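As a concrete sketch, a contextualized passage can be assembled like this (the helper function and field values below are illustrative, not part of the model's API):

```python
def build_passage(context_line: str, chunk: str) -> str:
    # One line of document context, a newline, then the passage text.
    # The required "passage: " retrieval prefix goes at the very front.
    return f"passage: {context_line}\n{chunk}"

chunk = "Revenue grew 12% year over year, driven by strength in ad sales."
passage = build_passage("Google GOOG FY23 earnings call", chunk)
```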
+ **3. Keep passages <= 512 tokens**
+
+ Long texts will be truncated to at most 512 tokens.
+
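To avoid silent truncation, split long documents before embedding. A minimal sketch using a whitespace split as a stand-in token count (in practice, measure length with the model's `AutoTokenizer`, since subword tokenizers usually produce more tokens per word):

```python
MAX_TOKENS = 512

def split_to_chunks(text: str, max_tokens: int = MAX_TOKENS) -> list[str]:
    # Whitespace split approximates the token count; leave headroom
    # for the "passage: " prefix and subword expansion in practice.
    words = text.split()
    return [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

chunks = split_to_chunks("word " * 1200)  # 1200 words -> chunks of 512, 512, 176
```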
+ ## Citation
+
+ If you find our paper or models helpful, please consider citing as follows:
+
+ ```
+ @inproceedings{anderson-etal-2024-greenback,
+     title = "Greenback Bears and Fiscal Hawks: Finance is a Jungle and Text Embeddings Must Adapt",
+     author = "Anderson, Peter and Janardhanan, Mano Vikash and He, Jason and Cheng, Wei and Flanagan, Charlie",
+     booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
+     year = "2024",
+ }
+ ```
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "/local/peanderson/mixed-models/e5-base-v2",
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.39.0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.2.2",
+     "transformers": "4.39.0",
+     "pytorch": "2.1.2+cu121"
+   }
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60caa896327382bca0fe783e1f13d933200f8ec2c6454585facc7010d91689ae
+ size 1112197096
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 512,
+   "do_lower_case": false
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f59925fcb90c92b894cb93e51bb9b4a6105c5c249fe54ce1c704420ac39b81af
+ size 17082756
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }