# Smoke test for a local Common Voice 6.1 (ru) checkout: confirm that one clip
# exists and decodes, peek at the test-split TSV, then load the "test" split
# through `datasets.load_dataset` and print its first example.
import os

import pandas as pd
import torchaudio
from datasets import DatasetDict, load_dataset

# Every path is derived from one root so the existence check, the decode call
# and the TSV read cannot drift apart when the corpus is moved.
DATASET_ROOT = "/workspace/datasets/CommonVoiceWalkie"
RU_CORPUS_DIR = DATASET_ROOT + "/cv-corpus-6.1-2020-12-11/ru"
SAMPLE_CLIP = RU_CORPUS_DIR + "/clips/common_voice_ru_28718278.mp3"
TEST_TSV = RU_CORPUS_DIR + "/test.tsv"

# Report whether the sample clip is present, then decode it. The load is
# attempted regardless of the check's result, exactly as before.
print(os.path.exists(SAMPLE_CLIP))
# `array` / `sampling_rate` are not used further in this section; the call
# mainly demonstrates that the mp3 can be decoded.
array, sampling_rate = torchaudio.load(SAMPLE_CLIP, format="mp3")

# Show the first five entries of the "path" column from the test-split TSV.
test_df = pd.read_csv(TEST_TSV, sep="\t")
print(test_df["path"].values[:5])

# Load the "ru" configuration's test split from the local dataset directory.
# NOTE(review): newer `datasets` releases replace `use_auth_token` with
# `token` — confirm against the installed version before upgrading.
datasets = DatasetDict()
datasets["test"] = load_dataset(
    path=DATASET_ROOT,
    name="ru",
    cache_dir="cache",
    split="test",
    use_auth_token=False,
)

# Get the first sample of the dataset straight away.
print(next(iter(datasets["test"])))