"""Evaluate a fine-tuned Whisper checkpoint and serve a live transcription demo.

Dependencies are installed at start-up (see ``_pip_install``), so the
third-party imports in this file must stay *after* the install calls.
"""
import os
import subprocess
import sys


def _pip_install(requirement):
    """Install *requirement* with pip into the interpreter running this script.

    ``sys.executable -m pip`` is used instead of a bare ``pip`` shell command
    so the package lands in the environment that will import it. Failures are
    reported on stderr but not raised, keeping the install best-effort.
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: pip install {requirement} exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )


# Install order is kept as written: transformers is pinned first and the
# numba pin goes last so it is applied after everything else has been resolved.
for _requirement in (
    "transformers==4.27.0",
    "evaluate",
    "datasets",
    "llvmlite",
    "scipy",  # was "spicy" -- presumably a typo for scipy; confirm
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
):
    _pip_install(_requirement)

from transformers import pipeline, WhisperModel, WhisperTokenizer, AutoModelForCTC  # noqa: E402
import evaluate  # noqa: E402
from evaluate import evaluator  # noqa: E402
from datasets import load_dataset, Audio  # noqa: E402
from datasets import disable_caching  # noqa: E402

try:
    # Deprecated spelling of disable_caching(); `datasets` is installed
    # unpinned, so tolerate releases that may no longer export it.
    from datasets import set_caching_enabled  # noqa: E402
except ImportError:
    set_caching_enabled = None

# Turn off the datasets cache for this run.
disable_caching()
|
|
# Speech-recognition pipeline with the library's default checkpoint; this is
# the object the live transcription callback calls.
p = pipeline("automatic-speech-recognition")

# Access token used for the "mskov/whisper_miso" downloads below.
# Raises KeyError when the variable is not set.
huggingface_token = os.environ["huggingface_token"]

# Imported here so this section does not depend on the import list at the top
# of the file.
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration  # noqa: E402

# WhisperForConditionalGeneration carries the language-modelling head needed
# to generate text; the bare WhisperModel only returns hidden states.
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
# NOTE(review): assumes the repository also ships a preprocessor config for
# the feature extractor -- confirm.
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# When a model *object* is supplied, the pipeline factory cannot infer the
# feature extractor, so the complete pipeline is assembled explicitly and
# handed to the evaluator.
miso_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
)

task_evaluator = evaluator("automatic-speech-recognition")

# Decode the "audio" column into waveform data for the pipeline.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

results = task_evaluator.compute(
    model_or_pipeline=miso_pipeline,
    data=dataset,
    input_column="audio",
    # NOTE(review): the label column is the same as the input column, so WER
    # would be computed against audio data instead of reference transcripts.
    # This should presumably name the dataset's transcript column -- left
    # unchanged because the dataset schema is not visible here; confirm.
    label_column="audio",
    strategy="simple",
    metric="wer",
)
print(results)
|
|
|
|
def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Audio input forwarded to the module-level pipeline ``p``.
            ``None`` is treated as "nothing recorded" and leaves the
            transcript untouched.
        state: Transcript accumulated by previous calls.

    Returns:
        A ``(transcript, transcript)`` tuple: the same updated transcript
        twice, once for display and once to carry forward as state.
    """
    # A live interface may presumably fire before any audio exists; skip the
    # model call instead of handing None to the pipeline.
    if audio is None:
        return state, state

    text = p(audio)["text"]
    state += text + " "
    return state, state
|
|
# Nothing earlier in the file brings `gr` into scope, so import it here.
import gradio as gr  # noqa: E402

# Live microphone demo: each streamed chunk is passed to `transcribe` along
# with the running transcript held in the "state" slot, and the updated
# transcript is shown in a textbox.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
)
demo.launch()