---
language:
- ru
base_model:
- cointegrated/rubert-tiny2
pipeline_tag: text-classification
size:
- 114 MB
---

<pre><code class="python">
| import torch |
| from optimum.onnxruntime import ORTModelForFeatureExtraction |
| from transformers import AutoTokenizer |
| import numpy as np |
| model_path = ".../bert-onnx-optim/" |
| file_name = "model_optimized.onnx" |
| |
| encoder = ORTModelForFeatureExtraction.from_pretrained( |
| model_path, |
| file_name=file_name |
| ) |
| tokenizer = AutoTokenizer.from_pretrained(model_path) |
def encode(texts, batch_size=8, normalize=True):
    """
    Compute embeddings for one or more texts.

    :param texts: a single string or a list of strings to embed
    :param batch_size: number of texts tokenized and encoded per forward pass
    :param normalize: if True, L2-normalize each embedding vector
    :return: numpy array of shape (len(texts), hidden_size), one
        mean-pooled embedding per input text
    """
    if isinstance(texts, str):
        texts = [texts]
    # Guard the empty case: np.vstack([]) would raise on no batches.
    if not texts:
        return np.empty((0, 0), dtype=np.float32)
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = encoder(**inputs)
        # Mean-pool token embeddings over the sequence dimension.
        # NOTE(review): this averages padding positions in as well; a masked
        # mean using inputs["attention_mask"] would be more accurate — confirm intent.
        embeddings = outputs.last_hidden_state.mean(dim=1)

        if normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.vstack(all_embeddings)
</code></pre>