from collections import Counter
import json
import os
import tempfile

from datasets import load_dataset
from transformers import Wav2Vec2CTCTokenizer

# Adapt these parameters to build a tokenizer for another ESB dataset or split.
dataset_name = "switchboard"
split = "train"
use_auth_token = True  # pass the Hub auth token (needed to access the ESB datasets)
tokenizer_name = f"wav2vec2-ctc-{dataset_name}-tokenizer"

# Characters accounting for less than this percentage of all character
# occurrences are dropped from the vocabulary.
cutoff_freq = 0.01
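
# For illustration: with cutoff_freq = 0.01, a character is kept only if it
# accounts for at least 0.01% of all characters, i.e. roughly one occurrence
# per 10,000 characters of transcribed text.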

dataset = load_dataset(
    "esb/datasets",
    dataset_name,
    split=split,
    use_auth_token=use_auth_token,
)

# Keep only the transcription column; nothing else is needed to build the tokenizer.
dataset = dataset.remove_columns(list(set(dataset.column_names) - {"text"}))


def create_vocabulary_from_data(dataset, word_delimiter_token="|", cutoff_freq=0.0):
    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])

        # count how often each character occurs
        count_chars_dict = Counter(list(all_text))
        # sort by descending frequency, breaking ties alphabetically
        count_chars_dict = sorted(count_chars_dict.items(), key=lambda item: (-item[1], item[0]))
        # split the sorted pairs into a list of characters and a list of counts
        vocab, freqs = zip(*count_chars_dict)

        return {"vocab": list(vocab), "freqs": list(freqs)}

    # batch_size=-1 passes the whole dataset to extract_all_chars in a single batch
    dataset = dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        remove_columns=dataset.column_names,
    )

    vocab, freqs = dataset["vocab"], dataset["freqs"]
    total_num_chars = sum(freqs)
    chars_to_remove = []
| | print("Character Occurences") |
| | print(f"Total characters in dataset: {total_num_chars}") |
| | print(50 * "-") |
| | print(f"{'Char'.rjust(5)} | {'Total occ'.rjust(10)} | {'% of total occ'.rjust(20)} |") |
| | print(50 * "-") |
| | for char, freq in zip(vocab, freqs): |
| | freq_in_percent = freq / total_num_chars * 100 |
| | print(f"{char.rjust(5)} | {str(freq).rjust(10)} | {str(round(freq_in_percent, 3)).rjust(20)} |") |
| | if freq_in_percent < cutoff_freq: |
| | chars_to_remove.append(char) |
| | print(50 * "-") |
| |
|

    vocab = list(set(vocab) - set(chars_to_remove))

    # prepend the special tokens expected by the CTC tokenizer
    vocab = ["<pad>", "<s>", "</s>", "<unk>"] + vocab

    # map each token to a unique integer id
    vocab_dict = {v: k for k, v in enumerate(list(vocab))}

    # replace the whitespace character with the word delimiter token
    if word_delimiter_token is not None:
        vocab_dict[word_delimiter_token] = vocab_dict[" "]
        del vocab_dict[" "]

    return vocab_dict
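

# Illustrative sanity check (not part of the original script): running the
# function on a tiny in-memory dataset shows the expected output format. The
# ids of the non-special tokens can vary between runs because of the set()
# call above.
#
#   from datasets import Dataset
#   toy_vocab = create_vocabulary_from_data(Dataset.from_dict({"text": ["hello world"]}))
#   # e.g. {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, 'h': 4, 'e': 5, ...}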


vocab_dict = create_vocabulary_from_data(dataset, cutoff_freq=cutoff_freq)

# Write the vocabulary to a temporary vocab.json, from which the
# tokenizer can be instantiated.
with tempfile.TemporaryDirectory() as tmp:
    with open(os.path.join(tmp, "vocab.json"), "w") as file:
        json.dump(vocab_dict, file)

    # load the tokenizer while still inside the context manager,
    # before the temporary directory is deleted
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(tmp)

# Upload the tokenizer to the Hugging Face Hub.
tokenizer.push_to_hub(tokenizer_name)
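
# Minimal usage sketch (assumption: the tokenizer was pushed under your own Hub
# namespace, so substitute your username in the repo id below):
#
#   tok = Wav2Vec2CTCTokenizer.from_pretrained(f"<your-username>/{tokenizer_name}")
#   print(tok("hello world").input_ids)  # character-level label ids for CTC training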