Anonymous committed on
Commit
aa2b49b
·
verified ·
1 Parent(s): fc0bbdc

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +20 -3
  2. modeling_borealis.py +8 -4
README.md CHANGED
@@ -10,7 +10,7 @@ pipeline_tag: automatic-speech-recognition
10
 
11
  ### Описание
12
 
13
- **Borealis** - это наша первая ASR модель для русского языка
14
 
15
 
16
 
@@ -21,7 +21,7 @@ pipeline_tag: automatic-speech-recognition
21
  ```python
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
23
  import torch
24
-
25
 
26
  model = AutoModelForCausalLM.from_pretrained("Vikhrmodels/Borealis", trust_remote_code=True)
27
  tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
@@ -55,4 +55,21 @@ with torch.inference_mode():
55
  transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)
56
 
57
  print(transcript)
58
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  ### Описание
12
 
13
+ **Borealis** - это наша первая ASR модель для русского языка. В этом репозитории представлен чекпоинт, который видел примерно `7000` часов аудио на русском языке. Важным отличием от других моделей является то, что тут есть поддержка пунктуации в распознанных аудио. Архитектура во многом вдохновлена [Voxtral](https://mistral.ai/news/voxtral), но отличается в некоторых моментах
14
 
15
 
16
 
 
21
  ```python
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
23
  import torch
24
+ import librosa
25
 
26
  model = AutoModelForCausalLM.from_pretrained("Vikhrmodels/Borealis", trust_remote_code=True)
27
  tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
 
55
  transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)
56
 
57
  print(transcript)
58
+ ```
59
+
60
+ ### Метрики модели
61
+
62
+ Ниже представлены замеры `Borealis` на фоне остальных открытых моделей, которые поддерживают русский язык. Бенчмарк мы скоро выложим в открытый доступ
63
+
64
+
65
+ | Модель | Средний WER | Средний CER | RuLS | CV 22.0 | Books | Speak | Sova |
66
+ |---|---:|---:|---:|---:|---:|---:|---:|
67
+ | Borealis | 6.95% | **2.44% 🏆** | 6.30% | 3.02% | **6.41% 🏆** | **1.98% 🏆** | 17.04% |
68
+ | openai/whisper-large-v3 | 10.74% | — | 11.62% | 7.51% | 12.19% | 2.74% | 19.65% |
69
+ | bond005/whisper-podlodka-turbo | 9.38% | — | 11.91% | 6.36% | 8.96% | 3.14% | 16.55% |
70
+ | openai/whisper-large-v3-turbo | 11.30% | — | 11.88% | 8.17% | 13.29% | 2.80% | 20.37% |
71
+ | bond005/whisper-large-v3-ru-podlodka | 10.76% | — | 10.24% | 7.80% | 10.31% | 3.23% | 22.21% |
72
+ | nvidia/canary-1b-v2 | 13.52% | — | 20.16% | 9.12% | 11.45% | 4.97% | 21.89% |
73
+ | VOSK-model-ru-0.42 | 11.30% | — | 12.06% | 11.87% | 10.80% | 2.61% | 19.15% |
74
+ | GigaAM-ASR-V2-RNNT | **5.85% 🏆** | — | **5.24% 🏆** | **2.85% 🏆** | 8.06% | 3.08% | **10.01% 🏆** |
75
+ | GigaAM-ASR-V2-CTC | 6.45% | — | 5.26% | 3.42% | 7.72% | 3.01% | 12.86% |
modeling_borealis.py CHANGED
@@ -37,10 +37,6 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
37
  self.llm = language_model
38
  self.tokenizer = tokenizer
39
  self.llm.resize_token_embeddings(len(tokenizer))
40
- print("Pad token:", self.llm.config.pad_token_id)
41
- print("EOS token:", self.llm.config.eos_token_id)
42
- print("Tokenizer EOS token ID:", tokenizer.eos_token_id)
43
- print("Tokenizer PAD token ID:", tokenizer.pad_token_id)
44
  self.downsample_factor = config.downsample_factor
45
  self.adapter = AudioLanguageAdapter(
46
  hidden_size=self.encoder.config.d_model * self.downsample_factor,
@@ -148,6 +144,11 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
148
  return_dict=True,
149
  )
150
  return out.loss, out.logits
 
 
 
 
 
151
 
152
  @torch.no_grad()
153
  def generate(
@@ -233,6 +234,9 @@ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
233
  return gen_ids[0] if single else gen_ids
234
  else:
235
  txt = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
 
 
 
236
  return txt[0] if single else txt
237
 
238
  def save_pretrained(self, save_directory, **kwargs):
 
37
  self.llm = language_model
38
  self.tokenizer = tokenizer
39
  self.llm.resize_token_embeddings(len(tokenizer))
 
 
 
 
40
  self.downsample_factor = config.downsample_factor
41
  self.adapter = AudioLanguageAdapter(
42
  hidden_size=self.encoder.config.d_model * self.downsample_factor,
 
144
  return_dict=True,
145
  )
146
  return out.loss, out.logits
147
def extract_assistant_content(self, text: str) -> str:
    """Return only the assistant's reply from a decoded chat transcript.

    Chat templates prepend role headers to the decoded text; everything
    after the last ``"assistant\n"`` marker is the model's own output.
    If no marker is present, the whole text (stripped) is returned.

    Args:
        text: Full decoded string, possibly containing role markers.

    Returns:
        The stripped assistant segment, or the stripped input when no
        marker is found.
    """
    # BUGFIX: `self` was missing from the signature while the method is
    # invoked as `self.extract_assistant_content(t)` in generate(); the
    # original binding would pass the instance as `text` and raise
    # TypeError on the extra positional argument.
    if "assistant\n" in text:
        # Take the segment after the LAST marker so any "assistant"
        # occurrences inside the prompt itself are skipped.
        return text.split("assistant\n")[-1].strip()
    return text.strip()
152
 
153
  @torch.no_grad()
154
  def generate(
 
234
  return gen_ids[0] if single else gen_ids
235
  else:
236
  txt = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
237
+
238
+ txt = [self.extract_assistant_content(t) for t in txt]
239
+
240
  return txt[0] if single else txt
241
 
242
  def save_pretrained(self, save_directory, **kwargs):