Upload folder using huggingface_hub
Browse files- README.md +20 -3
- modeling_borealis.py +8 -4
README.md
CHANGED
|
@@ -10,7 +10,7 @@ pipeline_tag: automatic-speech-recognition
|
|
| 10 |
|
| 11 |
### Описание
|
| 12 |
|
| 13 |
-
**Borealis** - это наша первая ASR модель для русского
|
| 14 |
|
| 15 |
|
| 16 |
|
|
@@ -21,7 +21,7 @@ pipeline_tag: automatic-speech-recognition
|
|
| 21 |
```python
|
| 22 |
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
|
| 23 |
import torch
|
| 24 |
-
|
| 25 |
|
| 26 |
model = AutoModelForCausalLM.from_pretrained("Vikhrmodels/Borealis", trust_remote_code=True)
|
| 27 |
tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
|
|
@@ -55,4 +55,21 @@ with torch.inference_mode():
|
|
| 55 |
transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)
|
| 56 |
|
| 57 |
print(transcript)
|
| 58 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
### Описание
|
| 12 |
|
| 13 |
+
**Borealis** - это наша первая ASR модель для русского языка. В этом репозитории представлен чекпоинт, который видел примерно `7000` часов аудио на русском языке. Важным отличием от других моделей является то, что тут есть поддержка пунктуации в распознанных аудио. Архитектура во многом вдохновлена [Voxtral](https://mistral.ai/news/voxtral), но отличается в некоторых моментах
|
| 14 |
|
| 15 |
|
| 16 |
|
|
|
|
| 21 |
```python
|
| 22 |
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
|
| 23 |
import torch
|
| 24 |
+
import librosa
|
| 25 |
|
| 26 |
model = AutoModelForCausalLM.from_pretrained("Vikhrmodels/Borealis", trust_remote_code=True)
|
| 27 |
tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
|
|
|
|
| 55 |
transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)
|
| 56 |
|
| 57 |
print(transcript)
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Метрики модели
|
| 61 |
+
|
| 62 |
+
Ниже представлены замеры `Borealis` на фоне остальных открытых моделей, которые поддерживают русский язык. Бенчмарк мы скоро выложим в открытый доступ
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
| Модель | Средний WER | Средний CER | RuLS | CV 22.0 | Books | Speak | Sova |
|
| 66 |
+
|---|---:|---:|---:|---:|---:|---:|---:|
|
| 67 |
+
| Borealis | 6.95% | **2.44% 🏆** | 6.30% | 3.02% | **6.41% 🏆** | **1.98% 🏆** | 17.04% |
|
| 68 |
+
| openai/whisper-large-v3 | 10.74% | — | 11.62% | 7.51% | 12.19% | 2.74% | 19.65% |
|
| 69 |
+
| bond005/whisper-podlodka-turbo | 9.38% | — | 11.91% | 6.36% | 8.96% | 3.14% | 16.55% |
|
| 70 |
+
| openai/whisper-large-v3-turbo | 11.30% | — | 11.88% | 8.17% | 13.29% | 2.80% | 20.37% |
|
| 71 |
+
| bond005/whisper-large-v3-ru-podlodka | 10.76% | — | 10.24% | 7.80% | 10.31% | 3.23% | 22.21% |
|
| 72 |
+
| nvidia/canary-1b-v2 | 13.52% | — | 20.16% | 9.12% | 11.45% | 4.97% | 21.89% |
|
| 73 |
+
| VOSK-model-ru-0.42 | 11.30% | — | 12.06% | 11.87% | 10.80% | 2.61% | 19.15% |
|
| 74 |
+
| GigaAM-ASR-V2-RNNT | **5.85% 🏆** | — | **5.24% 🏆** | **2.85% 🏆** | 8.06% | 3.08% | **10.01% 🏆** |
|
| 75 |
+
| GigaAM-ASR-V2-CTC | 6.45% | — | 5.26% | 3.42% | 7.72% | 3.01% | 12.86% |
|
modeling_borealis.py
CHANGED
|
@@ -37,10 +37,6 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
|
|
| 37 |
self.llm = language_model
|
| 38 |
self.tokenizer = tokenizer
|
| 39 |
self.llm.resize_token_embeddings(len(tokenizer))
|
| 40 |
-
print("Pad token:", self.llm.config.pad_token_id)
|
| 41 |
-
print("EOS token:", self.llm.config.eos_token_id)
|
| 42 |
-
print("Tokenizer EOS token ID:", tokenizer.eos_token_id)
|
| 43 |
-
print("Tokenizer PAD token ID:", tokenizer.pad_token_id)
|
| 44 |
self.downsample_factor = config.downsample_factor
|
| 45 |
self.adapter = AudioLanguageAdapter(
|
| 46 |
hidden_size=self.encoder.config.d_model * self.downsample_factor,
|
|
@@ -148,6 +144,11 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
|
|
| 148 |
return_dict=True,
|
| 149 |
)
|
| 150 |
return out.loss, out.logits
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
@torch.no_grad()
|
| 153 |
def generate(
|
|
@@ -233,6 +234,9 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
|
|
| 233 |
return gen_ids[0] if single else gen_ids
|
| 234 |
else:
|
| 235 |
txt = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
|
|
|
|
|
|
|
|
|
|
| 236 |
return txt[0] if single else txt
|
| 237 |
|
| 238 |
def save_pretrained(self, save_directory, **kwargs):
|
|
|
|
| 37 |
self.llm = language_model
|
| 38 |
self.tokenizer = tokenizer
|
| 39 |
self.llm.resize_token_embeddings(len(tokenizer))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
self.downsample_factor = config.downsample_factor
|
| 41 |
self.adapter = AudioLanguageAdapter(
|
| 42 |
hidden_size=self.encoder.config.d_model * self.downsample_factor,
|
|
|
|
| 144 |
return_dict=True,
|
| 145 |
)
|
| 146 |
return out.loss, out.logits
|
| 147 |
+
|
| 148 |
+
def extract_assistant_content(text: str) -> str:
|
| 149 |
+
if "assistant\n" in text:
|
| 150 |
+
return text.split("assistant\n")[-1].strip()
|
| 151 |
+
return text.strip()
|
| 152 |
|
| 153 |
@torch.no_grad()
|
| 154 |
def generate(
|
|
|
|
| 234 |
return gen_ids[0] if single else gen_ids
|
| 235 |
else:
|
| 236 |
txt = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
|
| 237 |
+
|
| 238 |
+
txt = [self.extract_assistant_content(t) for t in txt]
|
| 239 |
+
|
| 240 |
return txt[0] if single else txt
|
| 241 |
|
| 242 |
def save_pretrained(self, save_directory, **kwargs):
|