import os
import subprocess
import sys

# Packages this script installs at start-up, in the order they are installed.
# NOTE: a requirements.txt is the more usual home for these; runtime installs
# are kept because that is how this script has always provisioned itself.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "evaluate",
    "datasets",
    "llvmlite",
    # NOTE(review): the original installed "spicy"; SciPy is presumably what
    # was meant (the audio stack depends on it) -- confirm.
    "scipy",
    "soundfile",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(requirement):
    """Install one requirement into the interpreter running this script.

    ``sys.executable -m pip`` is used instead of a bare ``pip`` shell string
    so the package lands in the environment this process imports from, and
    the argument list avoids going through a shell.

    Args:
        requirement: A pip requirement specifier, e.g. ``"numba==0.51.2"``.

    Returns:
        True if pip exited with status 0, False otherwise. Failures are
        reported on stderr but do not abort the script (best-effort, like
        the ``os.system`` calls this replaces).
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode == 0


for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# These imports must come after the installs above, hence not at file top.
import evaluate
from datasets import Audio, disable_caching, load_dataset
from evaluate import evaluator
from transformers import AutoConfig, WhisperModel, WhisperTokenizer, pipeline

# Turn off the datasets cache. `set_caching_enabled(False)` is the deprecated
# spelling of the same switch, so only `disable_caching()` is used here.
disable_caching()

# --- Model loading and offline WER evaluation ------------------------------
# Local import: the speech-recognition pipeline needs a model with a language
# modelling head and a feature extractor; neither class is imported above.
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration

_MODEL_ID = "mskov/whisper_miso"

# Word-error-rate metric from the `evaluate` hub.
metric = evaluate.load("wer")

# Access token for the model repository; a missing variable raises KeyError.
huggingface_token = os.environ["huggingface_token"]

# `WhisperModel` is the bare encoder-decoder without an LM head and cannot
# generate text, so the conditional-generation class is loaded instead.
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token
)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token
)

# When the model is passed as an object, the pipeline cannot infer the
# tokenizer or feature extractor, so both are supplied explicitly.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
)

# Decode audio at the rate the feature extractor expects so the pipeline
# does not have to resample each clip.
dataset = load_dataset("mskov/miso_test", split="test").cast_column(
    "audio", Audio(sampling_rate=miso_feature_extractor.sampling_rate)
)

# NOTE(review): the transcript column of mskov/miso_test is not visible from
# this script; these are the names commonly used by ASR datasets -- confirm.
_TRANSCRIPT_COLUMNS = ("sentence", "text", "transcription", "transcript")
reference_column = next(
    (name for name in _TRANSCRIPT_COLUMNS if name in dataset.column_names),
    None,
)
if reference_column is None:
    raise KeyError(
        "no transcript column found in dataset; available columns: "
        f"{dataset.column_names}"
    )

# Run the pipeline one clip at a time; each call returns a dict with "text".
results = [
    asr_pipeline(
        {
            "raw": row["audio"]["array"],
            "sampling_rate": row["audio"]["sampling_rate"],
        }
    )
    for row in dataset
]

# WER compares strings with strings: predicted text vs. reference transcripts
# (not the audio column, and not the raw pipeline output dicts).
predictions = [result["text"] for result in results]
references = list(dataset[reference_column])
wer = metric.compute(predictions=predictions, references=references)
print(wer)


def transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the running text.

    Args:
        audio: The audio chunk to transcribe, passed straight to
            ``asr_pipeline`` (the interface below supplies a file path).
            ``None`` means there is nothing to transcribe.
        state: Transcript accumulated by earlier calls; ``None`` is treated
            as an empty transcript.

    Returns:
        A ``(transcript, transcript)`` tuple: the same updated string for the
        textbox output and for the state carried into the next call.
    """
    transcript = state or ""
    if audio is None:
        # No chunk was delivered; keep the transcript unchanged.
        return transcript, transcript
    # The original called an undefined name `p`; the pipeline built above is
    # `asr_pipeline`.
    text = asr_pipeline(audio)["text"]
    transcript += text + " "
    return transcript, transcript


# `gr` is used below but was never imported in this file.
import gradio as gr

# Live demo: stream microphone audio to `transcribe`, showing the running
# transcript in a textbox and carrying it between calls via "state".
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()