| | --- |
| | library_name: transformers |
| | pipeline_tag: automatic-speech-recognition |
| | inference: true |
| | --- |
| | |
| | This model is intended for debugging only. It is randomly initialized using the config from [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), with the architecture shrunk to a much smaller size. |
| |
|
| | Code used to create this model: |
| | ```python |
| | import os |
| | |
| | import torch |
| | |
| | from huggingface_hub import create_repo, upload_folder |
| | from transformers import ( |
| | AutoModelForCausalLM, |
| | AutoTokenizer, |
| | GenerationConfig, |
| | AutoConfig, |
| | pipeline, |
| | set_seed, |
| | ) |
| | import torch |
| | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoConfig |
| | from datasets import load_dataset |
| | |
| | model_id = "openai/whisper-large-v3" |
| | repo_id = "yujiepan/whisper-v3-tiny-random" |
| | save_path = f"/tmp/{repo_id}" |
| | os.system(f'rm -rf {save_path}') |
| | os.makedirs(save_path, exist_ok=True) |
| | |
| | device = "cuda" |
| | torch_dtype = torch.float16 |
| | model_id = "openai/whisper-large-v3" |
| | |
| | config = AutoConfig.from_pretrained(model_id) |
| | config.num_hidden_layers = 2 |
| | config.d_model = 8 |
| | config.decoder_attention_heads = 2 |
| | config.decoder_ffn_dim = 16 |
| | config.decoder_layers = 2 |
| | config.encoder_ffn_dim = 16 |
| | config.encoder_attention_heads = 2 |
| | config.encoder_layers = 2 |
| | |
| | model = AutoModelForSpeechSeq2Seq.from_config(config) |
| | model.to(device).to(torch_dtype) |
| | model.generation_config = GenerationConfig.from_pretrained(model_id) |
| | processor = AutoProcessor.from_pretrained(model_id) |
| | |
| | set_seed(42) |
| | num_params = 0 |
| | with torch.no_grad(): |
| | for name, p in sorted(model.named_parameters()): |
| | print(name, p.shape) |
| | torch.nn.init.uniform_(p, -0.5, 0.5) |
| | num_params += p.numel() |
| | print("Total number of parameters:", num_params) |
| | |
| | pipe = pipeline( |
| | "automatic-speech-recognition", |
| | model=model, |
| | tokenizer=processor.tokenizer, |
| | feature_extractor=processor.feature_extractor, |
| | torch_dtype=torch_dtype, |
| | device=device, |
| | ) |
| | |
| | sample = load_dataset( |
| | "distil-whisper/librispeech_long", "clean", |
| | split="validation", |
| | )[0]["audio"] |
| | result = pipe(sample, return_timestamps=True) |
| | print(result["text"]) |
| | |
| | create_repo(repo_id, exist_ok=True) |
| | upload_folder(repo_id=repo_id, folder_path=save_path, repo_type='model') |
| | ``` |
| |
|