| import torch |
| import soundfile as sf |
| from transformers import AutoModelForCTC, Wav2Vec2BertProcessor |
| from pydub import AudioSegment |
| import streamlit as st |
| import tempfile |
| import librosa |
|
|
| |
# Models the user can pick from (Ukrainian wav2vec2-BERT CTC checkpoint).
available_models = ['Yehor/w2v-bert-2.0-uk']


st.title("Voice Recognition App")


model_name = st.selectbox("Choose a model", available_models)


@st.cache_resource
def _load_asr(name):
    """Load and cache the CTC model (on CPU) and its processor.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model would be re-downloaded/re-instantiated on
    each rerun. `st.cache_resource` keeps one instance per model name.
    """
    model = AutoModelForCTC.from_pretrained(name).to('cpu')
    proc = Wav2Vec2BertProcessor.from_pretrained(name)
    return model, proc


asr_model, processor = _load_asr(model_name)
|
|
| |
| |
| |
|
|
def map_to_pred(file_path, sampling_rate=16_000, device='cpu'):
    """Transcribe an audio file with the module-level CTC model.

    Args:
        file_path: Path to an audio file readable by librosa.
        sampling_rate: Sample rate the model expects; the audio is
            resampled to this rate on load.
        device: Torch device for inference (model weights live on CPU).

    Returns:
        List of decoded transcription strings (one per input audio).
    """
    # BUG FIX: the original called librosa.load(file_path) with no `sr`,
    # which resamples to librosa's default 22050 Hz while telling the
    # processor the audio is 16 kHz — a silent mismatch that degrades
    # recognition. Resample to the rate we actually declare.
    audio, _ = librosa.load(file_path, sr=sampling_rate)

    inputs = processor([audio], sampling_rate=sampling_rate).input_features
    features = torch.tensor(inputs).to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = asr_model(features).logits

    # Greedy CTC decoding: pick the most likely token at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = processor.batch_decode(predicted_ids)

    return predictions
|
|
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])


if uploaded_file is not None:
    # Persist the upload to disk once so pydub can read it by path.
    # (The original wrote the upload to disk twice: './temp.wav' and a
    # NamedTemporaryFile; one copy is enough.)
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name

    # Reserve a WAV output path safely. NamedTemporaryFile(delete=False)
    # replaces the deprecated, race-prone tempfile.mktemp().
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
        temp_wav_path = wav_file.name

    # Normalize any supported format (e.g. mp3) to WAV for the ASR model.
    audio = AudioSegment.from_file(temp_file_path)
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")

    # BUG FIX: transcribe the converted WAV. The original passed the raw
    # upload copy ('./temp.wav', possibly MP3 bytes under a .wav name) to
    # the recognizer and never used the converted file at all.
    text = map_to_pred(temp_wav_path)

    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)