"""Smoke test for the ``mskov/whisper_miso`` Whisper checkpoint.

The script installs its own dependencies at start-up, loads the model and its
feature extractor from the Hugging Face Hub (authenticated with the token in
the ``huggingface_token`` environment variable), loads the ``test`` split of
``mskov/miso_test`` resampled to 16 kHz, runs one forward pass on the first
sample and prints the shape of the decoder's last hidden state.
"""
import os
import subprocess
import sys

# Installed one at a time, in this order, before any third-party import below.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
    "evaluate",
    "datasets",
    "scipy==1.8.1",  # was misspelled "spicy", which is an unrelated PyPI name
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(requirement):
    """Install *requirement* with the running interpreter's pip.

    Uses ``sys.executable -m pip`` so the package lands in the environment
    that is executing this script, and passes an argument list (no shell) so
    extras such as ``datasets[audio]`` are not subject to shell globbing.

    Installation stays best-effort: a failure is reported on stderr instead of
    aborting the script.

    Returns:
        True if pip exited with status 0, False otherwise.
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
        return False
    return True


for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# Third-party imports must come after the installs above.
from transformers import (  # noqa: E402
    pipeline,
    WhisperModel,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    AutoFeatureExtractor,
    AutoProcessor,
    WhisperConfig,
)
import torch  # noqa: E402
from evaluate import evaluator  # noqa: E402
from datasets import load_dataset, Audio, disable_caching  # noqa: E402

# disable_caching() supersedes the deprecated set_caching_enabled(False);
# calling both was redundant.
disable_caching()

# Raises KeyError if the variable is not set.
huggingface_token = os.environ["huggingface_token"]

model = WhisperModel.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Every audio clip is resampled to 16 kHz when it is decoded.
ds = load_dataset("mskov/miso_test", split="test").cast_column(
    "audio", Audio(sampling_rate=16000)
)

# Index the dataset once: each ds[0] access decodes the audio again.
sample = ds[0]
print(ds, "and at 0 ", sample)

audio = sample["audio"]
# Passing the sampling rate lets the feature extractor detect a mismatch
# instead of silently accepting audio at an unexpected rate.
inputs = feature_extractor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
)
print("check check")
print(inputs)

input_features = inputs.input_features
# Two decoder positions, both set to the model's decoder start token.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    last_hidden_state = model(
        input_features, decoder_input_ids=decoder_input_ids
    ).last_hidden_state

print(list(last_hidden_state.shape))