michaelfeil
/

ct2fast-LaBSE

@@ -128,20 +128,36 @@ Speedup inference while reducing memory by 2x-4x using int8 inference in C++ on
 quantized version of [setu4993/LaBSE](https://huggingface.co/setu4993/LaBSE)
 ```bash
-pip install hf-hub-ctranslate2>=2.10.0 ctranslate2>=3.16.0
 ```
 ```python
 # from transformers import AutoTokenizer
 model_name = "michaelfeil/ct2fast-LaBSE"
 from hf_hub_ctranslate2 import EncoderCT2fromHfHub
 model = EncoderCT2fromHfHub(
         # load in int8 on CUDA
         model_name_or_path=model_name,
         device="cuda",
-        compute_type="float16",
-        # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}")
 )
 embeddings = model.encode(
     ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
@@ -152,16 +168,20 @@ embeddings = model.encode(
 print(embeddings.shape, embeddings)
 scores = (embeddings @ embeddings.T) * 100
 ```
-Checkpoint compatible to [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
-and [hf-hub-ctranslate2>=2.10.0](https://github.com/michaelfeil/hf-hub-ctranslate2)
 - `compute_type=int8_float16` for `device="cuda"`
 - `compute_type=int8`  for `device="cpu"`
-Converted on 2023-06-16 using
 ```
-ct2-transformers-converter --model setu4993/LaBSE --output_dir ~/tmp-ct2fast-LaBSE --force --copy_files tokenizer.json README.md tokenizer_config.json vocab.txt special_tokens_map.json .gitattributes --quantization float16 --trust_remote_code
 ```
 # Licence and other remarks:

 quantized version of [setu4993/LaBSE](https://huggingface.co/setu4993/LaBSE)
 ```bash
+pip install "hf-hub-ctranslate2>=2.12.0" "ctranslate2>=3.17.1"
 ```
 ```python
 # from transformers import AutoTokenizer
 model_name = "michaelfeil/ct2fast-LaBSE"
+model_name_orig="setu4993/LaBSE"
 from hf_hub_ctranslate2 import EncoderCT2fromHfHub
 model = EncoderCT2fromHfHub(
         # load in int8 on CUDA
         model_name_or_path=model_name,
         device="cuda",
+        compute_type="int8_float16"
+)
+outputs = model.generate(
+    text=["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
+    max_length=64,
+) # perform downstream tasks on outputs
+outputs["pooler_output"]
+outputs["last_hidden_state"]
+outputs["attention_mask"]
+# Alternatively, use the SentenceTransformer mix-in
+# for end-to-end sentence-embedding generation
+# (this does not pull from this CT2fast-HF repo)
+from hf_hub_ctranslate2 import CT2SentenceTransformer
+model = CT2SentenceTransformer(
+    model_name_orig, compute_type="int8_float16", device="cuda"
 )
 embeddings = model.encode(
     ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
 print(embeddings.shape, embeddings)
 scores = (embeddings @ embeddings.T) * 100
+# Hint: you can also host this code behind a REST API
+# using github.com/michaelfeil/infinity
 ```
+Checkpoint compatible with [ctranslate2>=3.17.1](https://github.com/OpenNMT/CTranslate2)
+and [hf-hub-ctranslate2>=2.12.0](https://github.com/michaelfeil/hf-hub-ctranslate2)
 - `compute_type=int8_float16` for `device="cuda"`
 - `compute_type=int8`  for `device="cpu"`
+Converted on 2023-10-13 using
 ```
+LLama-2 -> removed <pad> token.
 ```
 # Licence and other remarks:

config.json CHANGED Viewed

@@ -1,6 +1,29 @@
 {
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "layer_norm_epsilon": 1e-12,
-  "unk_token": "[UNK]"
-}

 {
+    "architectures": [
+        "BertModel"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "classifier_dropout": null,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-12,
+    "max_position_embeddings": 512,
+    "model_type": "bert",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "pad_token_id": 0,
+    "position_embedding_type": "absolute",
+    "torch_dtype": "float32",
+    "transformers_version": "4.29.2",
+    "type_vocab_size": 2,
+    "use_cache": true,
+    "vocab_size": 501153,
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    "layer_norm_epsilon": 1e-12,
+    "unk_token": "[UNK]"
+}