import gradio as gr
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa


# Hungarian wav2vec2-base checkpoint fine-tuned for CTC speech recognition
model_name = "Grosy/wav2vec2-base-hu"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to("cpu")
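
# A minimal sketch, assuming a CUDA GPU is available: the device could be
# chosen dynamically instead of pinning the model to the CPU (input_values
# in inference() would then need a matching .to(device) call):
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)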

# Inputs longer than this many seconds are truncated before inference
max_seconds = 30


def speech_file_to_array_fn(path, max_seconds=10):
    """Load an audio file, resample it to 16 kHz and truncate it to max_seconds."""
    batch = {"file": path}
    speech_array, sampling_rate = librosa.load(batch["file"], sr=16000)
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch
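
# Example: speech_file_to_array_fn("sample1.mp3") returns a dict like
# {"file": "sample1.mp3", "speech": <float32 ndarray>, "sampling_rate": 16000},
# with at most max_seconds * 16000 samples in "speech".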


def inference(audio):
    # With type="filepath", Gradio passes the recording as a path string,
    # not a file object, so it is forwarded directly
    sp = speech_file_to_array_fn(audio, max_seconds)

    sample_rate = 16000

    # The processor only normalizes the waveform and builds the input tensor;
    # chunked/strided long-form decoding is a pipeline feature, so those
    # arguments are not passed here
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    # Greedy CTC decoding: pick the most likely token at every frame
    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)
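
    # prediction.word_offsets is a list of dicts such as
    # {"word": "szia", "start_offset": 3, "end_offset": 9}, where the
    # offsets count logit frames rather than seconds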

    # wav2vec2-base downsamples 16 kHz audio by a factor of 320, so one
    # logit frame corresponds to 320 / 16000 = 0.02 s
    time_offset = 320 / sample_rate

    total_prediction = []
    words = []
    for word in prediction.word_offsets:
        # Convert frame offsets into seconds
        start = round(word["start_offset"] * time_offset, 2)
        end = round(word["end_offset"] * time_offset, 2)

        total_prediction.append(f"{start} - {end}: {word['word']}")
        words.append(word["word"].lower())

    print(prediction.text)

    return "\n".join(total_prediction) + "\n\n" + " ".join(words)
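
# Quick smoke test outside the UI (assumes a 16 kHz audio file is present):
# print(inference("sample1.mp3"))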


inputs = gr.Audio(label="Input Audio", sources=["upload", "microphone"], type="filepath")
outputs = "text"
title = model_name
description = (
    f"Gradio demo for {model_name}. To use it, upload your audio or click one "
    "of the examples to load it. Read more at the links below. Currently "
    f"supports 16 kHz .wav files with a maximum duration of {max_seconds} s."
)
article = "<p style='text-align: center'><a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'>Github repo</a> | <a href='<HF Space link>' target='_blank'>Pretrained model</a></p>"
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]
gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
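
# Note: launch() serves on localhost by default; launch(share=True) would
# create a temporary public link for the demo.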