"""Streaming speech-to-text demo: a Gradio microphone UI backed by a NeMo ASR model."""
import os
import tempfile
import time
import uuid

import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import soundfile
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

mname = "facebook/blenderbot-400M-distill"
# Bound to its own name so the ASR model loaded further down (which owns the
# name `model`) does not silently replace it.
# NOTE(review): nothing in this file calls the Blenderbot model yet; keeping it
# bound keeps its weights resident in memory.
chat_model = BlenderbotForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotTokenizer.from_pretrained(mname)


def take_last_tokens(inputs, note_history, history):
    """Trim an over-long tokenized input to its last 128 tokens.

    When ``inputs['input_ids']`` has more than 128 columns:

    * ``inputs['input_ids']`` and ``inputs['attention_mask']`` are replaced
      (in place, on the passed mapping) by single-row tensors holding the
      last 128 entries of row 0;
    * the first two space-separated words are dropped from
      ``note_history[0]``;
    * the first element of ``history`` is dropped.

    Otherwise all three arguments are returned unchanged.

    Returns:
        The tuple ``(inputs, note_history, history)``.
    """
    max_tokens = 128
    if inputs['input_ids'].shape[1] > max_tokens:
        for key in ('input_ids', 'attention_mask'):
            inputs[key] = torch.tensor([inputs[key][0][-max_tokens:].tolist()])
        note_history = [' '.join(note_history[0].split(' ')[2:])]
        history = history[1:]
    return inputs, note_history, history


def add_note_to_history(note, note_history):
    """Append ``note`` and return the history collapsed into one string.

    The passed ``note_history`` list is mutated (``note`` is appended to it).

    Returns:
        A single-element list containing all entries joined by spaces.
    """
    note_history.append(note)
    return [' '.join(note_history)]


SAMPLE_RATE = 16000

model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
model.change_decoding_strategy(None)
model.eval()


def process_audio_file(file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    Args:
        file: Path (or file-like object) accepted by ``librosa.load``.

    Returns:
        The decoded samples as returned by ``librosa.load``.
    """
    # Ask librosa for the target rate directly: one resampling pass instead of
    # native -> librosa's 22050 Hz default -> 16 kHz. mono=True downmixes.
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data


def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Path of the recorded chunk, or ``None`` when no audio was
            delivered for this call.
        state: Transcript accumulated by earlier calls (``None`` is treated
            as empty).

    Returns:
        ``(transcript, transcript)`` - the same string twice, once for the
        textbox output and once for the state output.
    """
    if state is None:
        state = ""
    if audio is None:
        # Nothing to transcribe; keep the current transcript as it is.
        return state, state

    audio_data = process_audio_file(audio)

    # The model is given a file path, so write the resampled audio to a
    # throwaway WAV file that disappears with the temporary directory.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # A 2-tuple result is unwrapped to its first element - presumably a
    # (best hypotheses, all hypotheses) pair; confirm against the NeMo version.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]
    text = transcriptions[0]

    if text:
        # Separate chunks with a space so words from consecutive chunks do not
        # run together.
        state = f"{state} {text}" if state else text
    return state, state


demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type='filepath', streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    layout="horizontal",
    theme="huggingface",
    title="ASR",
    description="Automatic Speech Recognition (ASR)",
    allow_flagging='never',
    live=True,
)
demo.launch(debug=True)