Spaces:
Runtime error
Runtime error
| # Import NeMo and its ASR, NLP and TTS collections | |
| import nemo | |
| # Import Speech Recognition collection | |
| import nemo.collections.asr as nemo_asr | |
| # Import Natural Language Processing collection | |
| import nemo.collections.nlp as nemo_nlp | |
| # Import Speech Synthesis collection | |
| import nemo.collections.tts as nemo_tts | |
| from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel | |
| import whisper | |
| from .utils import measure_time | |
class SpeechTranslate:
    """Speech-to-speech pipeline.

    Transcribes audio with Whisper, translates German<->English with NeMo
    NMT models, optionally classifies the utterance's intent with a
    zero-shot BERT model, and synthesizes German speech with
    FastPitch + HiFiGAN.
    """

    def __init__(self, intents=None):
        # Optional list of candidate intent labels for zero-shot classification;
        # None disables intent routing.
        self.intent_label = intents
        # Zero-shot intent classifier (BERT), pulled from NVIDIA NGC.
        self.intent_model = DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval()
        # Whisper "base" handles both transcription and language detection.
        self.transcription = whisper.load_model("base")
        # Neural Machine Translation models: German->English and English->German.
        self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval()
        self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval()
        # Spectrogram generator: German text -> mel spectrogram.
        self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval()
        # Vocoder: spectrogram -> actual audio waveform.
        self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval()

    def translate(self, speechfile):
        """Transcribe `speechfile` and store the working text in `self.text`.

        IMPORTANT: the audio must be mono with a 16 kHz sampling rate.
        Only German and English input is supported; anything else raises
        NotImplementedError. After this call `self.text` is always a
        one-element list of sentences.
        """
        result = self.transcription.transcribe(speechfile)
        if result["language"] == "de":
            # German input: translate to English (returns a list of strings).
            english_text = self.nmt_model.translate([result["text"]])
        elif result["language"] == "en":
            # BUGFIX: wrap in a list so self.text[0] is the whole sentence,
            # not the first character — matches the shape of the "de" branch.
            english_text = [result["text"]]
        else:
            raise NotImplementedError(f"Language: {result['language']} currently not supported")
        if self.intent_label is None:
            # No intent routing requested: translate straight to German for TTS.
            self.text = self.nmt_model_de.translate(english_text)
        else:
            # Keep English so the zero-shot intent model can classify it.
            self.text = english_text

    def get_intent(self):
        """Classify `self.text` against `self.intent_label`.

        Returns a tuple (german_routing_message_list, top_intent_label).
        """
        intents = self.intent_model.predict([self.text[0]], self.intent_label)
        intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"]
        print(intents)
        # Translate the routing message to German for speech synthesis.
        intenti = self.nmt_model_de.translate(intent)
        return intenti, intents[0]['labels'][0]

    def text_to_audio(self):
        """Synthesize `self.text[0]` into audio (FastPitch -> HiFiGAN).

        Returns the waveform as a NumPy array on the CPU.
        """
        parsed = self.spectrogram_generator.parse(self.text[0])
        spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
        audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        return audio.to('cpu').detach().numpy()

    def process(self, speechfile, intents):
        """Run the full pipeline on one audio file.

        `intents` is a comma-separated string of candidate labels, or None
        to skip intent routing. Returns (audio_array, top_label_or_None).
        """
        self.intent_label = intents.split(",") if intents is not None else None
        self.translate(speechfile)
        # BUGFIX: `intent` was previously unbound when no intents were
        # supplied, making the return statement raise NameError.
        intent = None
        if self.intent_label is not None:
            self.text, intent = self.get_intent()
        return self.text_to_audio(), intent