import os
import subprocess
import sys


def _pip_install(requirement):
    """Install one pip requirement into the interpreter running this script.

    The command is passed as an argument list to ``sys.executable -m pip``
    instead of a shell string, so extras such as ``datasets[audio]`` are not
    subject to shell glob expansion and the package lands in the environment
    that will actually import it.

    Args:
        requirement: A pip requirement specifier, e.g. ``"torch"`` or
            ``"transformers==4.27.0"``.

    Returns:
        The pip process exit status (0 on success).
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,  # best effort: a failed install must not abort start-up
    )
    if completed.returncode != 0:
        print(
            f"warning: pip install {requirement} exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode


# Packages are installed at start-up, in this order, before anything below
# imports them.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
    "evaluate",
    "datasets",
    # NOTE(review): the script previously requested "spicy==1.8.1"; 1.8.1 is a
    # SciPy release number, so this is treated as a typo for scipy -- confirm.
    "scipy==1.8.1",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)

for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# Third-party imports are deliberately placed after the installs: the packages
# may not exist before the loop above has run, and importing only once every
# install has finished lets each library see the final environment.
from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor  # noqa: E402
from evaluate import evaluator  # noqa: E402
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled  # noqa: E402

# Turn off the datasets cache. Both calls are kept as the script had them.
# NOTE(review): set_caching_enabled(False) looks redundant next to
# disable_caching() -- confirm against the installed datasets version.
set_caching_enabled(False)
disable_caching()
# Imported here so this section carries its own dependency: the evaluation
# needs the Whisper variant that has a language-model head.
from transformers import WhisperForConditionalGeneration

# Access token for the Hugging Face Hub. A missing environment variable raises
# KeyError here, before any download is attempted.
huggingface_token = os.environ["huggingface_token"]

miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Two fixes relative to the previous load call:
#   * `tokenizer=` is not a from_pretrained option for the model; unrecognised
#     keyword arguments are forwarded to the model constructor, which does not
#     accept it. The tokenizer is handed to the evaluator below instead.
#   * The bare WhisperModel only returns hidden states. Producing text for
#     speech recognition requires WhisperForConditionalGeneration.
# NOTE(review): assumes the mskov/whisper_miso checkpoint loads into
# WhisperForConditionalGeneration -- confirm.
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    "mskov/whisper_miso",
    use_auth_token=huggingface_token,
    device_map="auto",
)

task_evaluator = evaluator("automatic-speech-recognition")

# Evaluation data: the "test" split of mskov/ESC50. (Earlier experiments in
# this script pointed at mskov/miso_test instead.)
dataset = load_dataset("mskov/ESC50", split="test")
print(dataset)

# Score the model with word error rate, comparing its transcription of the
# "audio" column against the text in the "category" column.
results = task_evaluator.compute(
    model_or_pipeline=whisper_miso,
    data=dataset,
    tokenizer=miso_tokenizer,
    # A model instance (as opposed to a model id string) gives the pipeline no
    # way to look up its preprocessing, so the extractor is passed explicitly.
    feature_extractor=miso_feature_extractor,
    input_column="audio",
    label_column="category",
    strategy="simple",
    metric="wer",
)
print(results)
def transcribe(audio, state=""):
    """Transcribe *audio* and append the text to the running transcript.

    Args:
        audio: Input passed unchanged to the module-level callable ``p``.
        state: Transcript accumulated by earlier calls.

    Returns:
        A ``(transcript, transcript)`` tuple holding the updated transcript
        twice: one copy for the textbox output and one for the state output
        of the interface that uses this function.
    """
    # NOTE(review): `p` is not defined in the visible source; presumably a
    # speech-recognition pipeline returning a dict with a "text" key -- confirm.
    text = p(audio)["text"]
    state += text + " "
    # Previously written as `returnstate, state`, which evaluated an undefined
    # name and raised NameError instead of returning.
    return state, state
# Live demo: audio streamed from the microphone is passed to `transcribe`
# together with the running transcript state; the function's two return values
# feed a textbox and the state.
# NOTE(review): `gr` is not imported anywhere in the visible source -- confirm
# that gradio is imported as `gr` elsewhere.
microphone = gr.Audio(source="microphone", type="filepath", streaming=True)

demo = gr.Interface(
    fn=transcribe,
    inputs=[microphone, "state"],
    outputs=["textbox", "state"],
    live=True,
)
demo.launch()