| | """ |
| | Resources: |
| | |
| | Canary 1B: https://huggingface.co/nvidia/canary-1b |
| | Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct |
| | VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits |
| | Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners |
| | """ |

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Fix the PyTorch seed so Phi-3 generation is reproducible across runs.
torch.random.manual_seed(0)

# Phi-3-Mini-4K-Instruct: the language model that answers the user's question.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 16,
    "return_full_text": False,
    # Greedy decoding: with do_sample=False the temperature value has no effect.
    "temperature": 0.0,
    "do_sample": False,
}

def phi(user_question):
    messages = [
        {"role": "system", "content": "What can I do for you today"},
        {"role": "user", "content": user_question},
    ]
    output = pipe(messages, **generation_args)
    return output

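# Illustrative call (a sketch; the exact completion will vary):
#   phi("What is the capital of France?")
#   -> [{'generated_text': ' The capital of France is Paris.'}]
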
from nemo.collections.asr.models import EncDecMultiTaskModel

# Canary 1B: NVIDIA's multitask ASR model, used here for speech-to-text.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Beam size 1 is equivalent to greedy decoding, the fastest option.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

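# Illustrative transcription call (the path is a placeholder; depending on the
# NeMo version, entries may be plain strings or Hypothesis objects with .text):
#   canary_model.transcribe(['sample.wav'], batch_size=16)
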
from transformers import VitsTokenizer, VitsModel, set_seed

# VITS (MMS English checkpoint) turns the generated reply back into speech.
# Named vits_tokenizer so it does not shadow the Phi-3 tokenizer above.
vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
set_seed(555)  # VITS is stochastic; seeding keeps the voice output reproducible.

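# Illustrative standalone synthesis (a sketch of the same calls used below):
#   inputs = vits_tokenizer(text="Hello world", return_tensors="pt")
#   with torch.no_grad():
#       waveform = vits_model(**inputs).waveform  # (batch, samples) at vits_model.config.sampling_rate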

import gradio as gr

def fromvoice(audio_path):
    # Speech -> transcript -> Phi-3 reply -> synthesized speech.
    query = canary_model.transcribe([audio_path], batch_size=16)
    resp = phi(query[0])
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()
    # gr.Audio accepts a (sampling_rate, numpy array) tuple as output.
    return (vits_model.config.sampling_rate, output)

def fromtext(text_input):
    # Text -> Phi-3 reply -> synthesized speech.
    resp = phi(text_input)
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()
    return (vits_model.config.sampling_rate, output)

Alexa = gr.Blocks()

with Alexa:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    output = gr.Audio()

    b1 = gr.Button("From Speech")
    b2 = gr.Button("From Text")

    # Each button routes its input through the matching function into the audio output.
    b1.click(fromvoice, inputs=audio_file, outputs=output)
    b2.click(fromtext, inputs=text, outputs=output)

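# launch() serves the app locally (http://127.0.0.1:7860 by default); pass
# share=True to also get a temporary public Gradio link.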
Alexa.launch()