# Notebook export artifact: the original renderer emitted "Spaces:" followed by
# two "Runtime error" status lines here; preserved as a comment so the file parses.
# %%
# Standard library
import os
from pathlib import Path
from pprint import pprint

# Third-party
import pandas as pd
from lhotse.recipes import (
    download_voxceleb1,
    download_voxceleb2,
    hifitts,
    libritts,
    prepare_voxceleb,
)

# %%
# Dataset cache layout: one sub-directory per corpus, relative to this file.
root_dir = Path("../../datasets_cache")
# root_dir = Path("datasets_cache")
voxceleb1_path = root_dir / "voxceleb1"
voxceleb2_path = root_dir / "voxceleb2"
hifitts_path = root_dir / "hifitts"
libritts_path = root_dir / "librittsr"
# Leave a few cores free for the rest of the machine.
num_jobs = os.cpu_count() - 3  # type: ignore
num_jobs, hifitts_path
# %%
# VoxCeleb downloads are disabled here; uncomment to fetch them.
# voxceleb1_root = download_voxceleb1(voxceleb1_path)
# voxceleb1_root
# %%
# voxceleb2_root = download_voxceleb2(voxceleb2_path)
# voxceleb2_root
# %%
# Fetch HiFiTTS into the cache directory (no-op if already present).
hifitts_root = hifitts.download_hifitts(hifitts_path)
hifitts_root
# %%
# Build recording/supervision manifests, keyed by "<speaker>_<quality>_<split>".
result = hifitts.prepare_hifitts(hifitts_root, num_jobs=num_jobs)
result
# %%
result.keys()
# %%
from lhotse import CutSet, Fbank, FbankConfig, Mfcc, MfccConfig, RecordingSet

# Build cuts for one HiFiTTS subset (speaker 6670, "other" quality, test split).
cuts_train = CutSet.from_manifests(**result["6670_other_test"])  # type: ignore
cuts_train
# %%
pprint(cuts_train[0])
# %%
from lhotse.cut import Cut

# Keep only cuts whose duration falls inside [min, max] seconds.
duration_limit_min = 2.0
duration_limit_max = 2.5
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut)
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
cuts_train
# %%
cuts_train[0].supervisions[0]
| # %% | |
| # filter_length=2048, | |
| # hop_length=512, # NOTE: 441 ?? https://github.com/jik876/hifi-gan/issues/116#issuecomment-1436999858 | |
| # win_length=2048, | |
| # n_mel_channels=128, | |
| # mel_fmin=20, | |
| # mel_fmax=11025, | |
| fbank = Fbank( | |
| FbankConfig( | |
| sampling_rate=44100, | |
| num_filters=128, | |
| ), | |
| ) | |
| cuts_train_fbank = cuts_train.compute_and_store_features( | |
| extractor=fbank, | |
| storage_path=hifitts_root / "features", | |
| num_jobs=1, | |
| ) | |
| cuts_train_fbank | |
| # %% | |
| # cuts_train_fbank.to_file(hifitts_root / "cuts_train.json.gz") | |
| # %% | |
| cuts_train_fbank[0].plot_features() | |
| # %% | |
| cuts_train_fbank_item = cuts_train_fbank[0] | |
| cuts_train_fbank_item | |
| # %% | |
| from lhotse.cut import MonoCut | |
| if isinstance(cuts_train_fbank_item, MonoCut): | |
| print(cuts_train_fbank_item.features) | |
| # %% | |
| cuts_train_fbank_item.plot_audio() | |
| # %% | |
| cuts_train_fbank_item.play_audio() | |
# %%
from lhotse import CutSet
from lhotse.dataset import (
    SimpleCutSampler,
    UnsupervisedDataset,
    UnsupervisedWaveformDataset,
)
from torch.utils.data import DataLoader, Dataset

# The sampler batches cuts up to 300 s of total audio; batch_size=None because
# the sampler already yields whole batches, not single items.
dataset = UnsupervisedDataset()
sampler = SimpleCutSampler(cuts_train_fbank, max_duration=300)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=None)
batch = next(iter(dataloader))
batch
# %%
batch["cuts"][0].recording.sources[0].load_audio().shape
# %%
batch["cuts"][0].features
# %%
batch["features"][0].shape
# %%
batch["features"][0]
# %%
# Download and prepare LibriTTS-R (train-clean-100 only).
libritts_root = libritts.download_librittsr(
    libritts_path,
    dataset_parts=["train-clean-100"],
)
libritts_root, libritts_path
# %%
prepared_libri = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-100"],
    num_jobs=num_jobs,
)
# %%
prepared_libri
# %%
# Total recorded seconds per speaker, longest first.
prepared_libri_100 = (
    pd.DataFrame(prepared_libri["train-clean-100"]["supervisions"])
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
prepared_libri_100
# %%
# For every prepared split, list speakers with at least 30 minutes of audio.
for part in prepared_libri:
    per_speaker_dur = (
        pd.DataFrame(prepared_libri[part]["supervisions"])
        .groupby("speaker")["duration"]
        .sum()
        .sort_values(ascending=False)
    )
    print(per_speaker_dur.loc[per_speaker_dur >= 1800])
# %%
from lhotse import CutSet, SupervisionSet

# BUG FIX: ``SupervisionSet()`` raises a TypeError — the constructor requires a
# ``segments`` mapping.  Build an empty set via the factory instead, then
# persist it to the cache directory.
supervisions_libri = SupervisionSet.from_segments([])
supervisions_libri.to_file(libritts_root / "supervisions_libri.json.gz")
# Per-split speaker duration totals (seconds) captured from a previous run of
# the per-speaker loop; kept as reference data for the manual selection below.
# dev-clean
# Series([], Name: duration, dtype: float64)
# dev-other
# Series([], Name: duration, dtype: float64)
# test-clean
# speaker
# 3570 1865.052667
# Name: duration, dtype: float64
# test-other
# Series([], Name: duration, dtype: float64)
# train-clean-100
# speaker
# 40 2096.569333
# 6209 1926.765000
# 7447 1915.213333
# 1088 1900.926000
# Name: duration, dtype: float64
# train-clean-360
# speaker
# 3003 2385.213333
# 2204 2242.730333
# 3307 2086.246500
# 8080 2051.131500
# 5935 1959.650833
# 3922 1938.523500
# 7982 1893.050833
# 3638 1843.324000
# 3032 1812.692000
# Name: duration, dtype: float64
# train-other-500
# speaker
# 215 2385.047833
# 6594 2341.286667
# 3433 2206.806500
# 3867 2118.326167
# 5733 2097.689833
# 7649 2016.925500
# 2834 2008.083000
# 8291 1977.892000
# 483 1964.766000
# 5181 1959.280000
# 8799 1909.690500
# 7839 1888.650500
# 1665 1877.726833
# 8430 1872.845500
# 47 1861.966167
# 2361 1839.646333
# 1132 1838.686333
# 5439 1837.487000
# 3319 1821.083833
# 5445 1808.444667
# 2208 1804.525833
# 8346 1804.405500
# Name: duration, dtype: float64
# Manually chosen speakers (string IDs, as they appear in the manifests'
# ``speaker`` field), each with roughly >= 30 minutes of audio per the
# reference data above.
selected_speakers_man: list[str] = [
    # train-clean-100
    "40",
    "1088",
    # train-clean-360
    "3307",
    "5935",
    "3032",
    # train-other-500
    "215",
    "6594",
    "3867",
    "5733",
    "8291",
    "5181",
    "8799",
    "2361",
    "1132",
    "5439",
    "3319",
    "8346",
]
# %%
# NOTE(review): despite its name this holds the filtered Series itself, not a
# count (compare the 360 variant below, which calls .count()).
num_speakers_lib_100_over_1900_sec = prepared_libri_100.loc[prepared_libri_100 >= 1900]
num_speakers_lib_100_over_1900_sec
# %%
# Prepare the train-clean-360 split as well.
prepared_libri_360 = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-360"],
    num_jobs=num_jobs,
)
# %%
# Seconds of audio per speaker in train-clean-360, longest first.
speaker_durations_360 = (
    pd.DataFrame(prepared_libri_360["train-clean-360"]["supervisions"])
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
speaker_durations_360
# %%
# Check whether any speaker appears in both the 100 h and 360 h subsets.
speaker_ids_100 = prepared_libri_100.index
speaker_ids_360 = speaker_durations_360.index
common_speaker_ids = speaker_ids_100.intersection(speaker_ids_360)
# No intersection!
common_speaker_ids
# %%
# How many 360-subset speakers exceed 1900 s (strict > here, >= above).
num_speakers_lib_360_over_1900_sec = speaker_durations_360.loc[
    speaker_durations_360 > 1900
].count()
num_speakers_lib_360_over_1900_sec
# %%
from lhotse import CutSet, Fbank, FbankConfig

# Cuts for the prepared train-clean-100 split.
cuts_train = CutSet.from_manifests(**prepared_libri["train-clean-100"])  # type: ignore
cuts_train
# %%
# Persist the CutSet both next to the script and in the cache directory.
cuts_train.to_file("./libri_selected.json.gz")
cuts_train.to_file(root_dir / "./libri_selected.json.gz")
# %%
from lhotse import CutSet, SupervisionSet

# BUG FIX: the file written above is ``libri_selected.json.gz``; the original
# loaded ``root_dir / "libri.json.gz"``, which is never created in this file
# and fails with FileNotFoundError.
libri_selected = CutSet.from_file(root_dir / "libri_selected.json.gz")
libri_selected
# %%
pprint(libri_selected[0])
print(libri_selected[0].recording.sources[0].source)
# %%
libri_selected[0].play_audio()
# %%
# Sanity-check that a raw LibriTTS-R wav loads.
import torchaudio

torchaudio.load(
    "datasets_cache/librittsr/LibriTTS_R/dev-clean/5694/64025/5694_64025_000017_000002.wav",
)
# %%
# Reload previously saved manifests from the cache directory.
supervisions_libri = SupervisionSet.from_file(root_dir / "supervisions_libri.json.gz")
recordings_libri = RecordingSet.from_file(root_dir / "recordings_libri.json.gz")
supervisions_libri, recordings_libri
# %%
supervisions_libri[0]
# %%
# Seconds of audio per speaker across the loaded supervisions, longest first.
speakers_dur = (
    pd.DataFrame(supervisions_libri)
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
# %%
speakers_dur_1900 = speakers_dur.loc[speakers_dur >= 1900]
speakers_dur_1900
# %%
# Speaker IDs stay as strings on purpose (see the rejected int-cast below):
# the supervisions' ``speaker`` field is a string.
# selected_1900_ids = set(
#     map(int, speakers_dur_1900.index.to_list()),
# )
selected_1900_ids = set(speakers_dur_1900.index.to_list())
selected_1900_ids
# %%
duration_limit_min = 0.5
duration_limit_max = 35.0
# BUG FIX: CutSet.filter returns a new (lazily filtered) CutSet; the original
# discarded the result, so the speaker/duration limits were never applied.
# The trailing bare expression keeps the notebook-cell display behavior.
libri_selected = libri_selected.filter(
    lambda cut: isinstance(cut, Cut)
    and cut.supervisions[0].speaker in selected_1900_ids
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
libri_selected
# %%
libri_selected[0]
# %%
cuts_train_frame = pd.DataFrame(cuts_train)
cuts_train_frame
# %%
cuts_train[0].supervisions[0].speaker
# %%
# Narrow to a single speaker for inspection; the duration bounds stay disabled.
# duration_limit_min = 2.0
# duration_limit_max = 2.5
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut) and cut.supervisions[0].speaker == "5338",
    # and cut.duration >= duration_limit_min
    # and cut.duration <= duration_limit_max,
)
cuts_train
# %%
# cuts_train.map(lambda cut: cut.supervisions[0].speaker)
# %%
cuts_train[0]
# %%
len(cuts_train)
# %%
# LibriTTS-R speakers chosen for training (numeric IDs).
selected_speakers_libri_ids = [
    # train-clean-100
    40,
    1088,
    # train-clean-360
    3307,
    5935,
    3032,
    # train-other-500
    215,
    6594,
    3867,
    5733,
    8291,
    5181,
    8799,
    2361,
    1132,
    5439,
    3319,
    8346,
]
# The selected speakers from the HiFiTTS dataset
selected_speakers_hi_fi_ids = [
    92,
    6670,
    6671,
    6097,
    8051,
    11614,
    11697,
    9017,
    12787,
    9136,
]
# Assign each speaker a dense index (0..N-1) in concatenation order:
# LibriTTS speakers first, then HiFiTTS.
all_selected_ids = selected_speakers_libri_ids + selected_speakers_hi_fi_ids
selected_speakers_ids = {
    speaker: index for index, speaker in enumerate(all_selected_ids)
}
selected_speakers_ids[1088]
# %%
# Second revision of the mapping: HiFiTTS speakers are now keyed by their
# display names (original numeric IDs kept in the trailing comments).
selected_speakers_libri_ids = [
    # train-clean-100
    40,
    1088,
    # train-clean-360
    3307,
    5935,
    3032,
    # train-other-500
    215,
    6594,
    3867,
    5733,
    8291,
    5181,
    8799,
    2361,
    1132,
    5439,
    3319,
    8346,
]
# The selected speakers from the HiFiTTS dataset
selected_speakers_hi_fi_ids = [
    "Cori Samuel",  # 92,
    "Phil Benson",  # 6097,
    "Mike Pelton",  # 6670,
    "Tony Oliva",  # 6671,
    "Maria Kasper",  # 8051,
    "John Van Stan",  # 9017,
    "Helen Taylor",  # 9136,
    "Sylviamb",  # 11614,
    "Celine Major",  # 11697,
    "LikeManyWaters",  # 12787,
]
# Dense index per speaker, LibriTTS first, then HiFiTTS, in list order.
combined_speakers = selected_speakers_libri_ids + selected_speakers_hi_fi_ids
selected_speakers_ids = {
    speaker: index for index, speaker in enumerate(combined_speakers)
}
selected_speakers_ids, len(selected_speakers_ids)
# %%
# Listen to one dataset item rendered through the VoiceFixer vocoder.
import os
import sys

# Make the parent directory importable when run as a script.
# NOTE(review): ``__file__`` is undefined in interactive/Jupyter kernels, and
# the relative import below requires package context ("attempted relative
# import with no known parent package" when run as a plain script) — confirm
# how this cell is meant to be executed.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(SCRIPT_DIR))
from pathlib import Path
from IPython import display
import torchaudio
from voicefixer import Vocoder
from .hifi_libri_dataset import HifiLibriDataset, HifiLibriItem

# Vocoder at 44.1 kHz, matching the sample rate used throughout this file.
vocoder_vf = Vocoder(44100)
dataset = HifiLibriDataset(cache_dir="datasets_cache", cache=True)
item = dataset[0]
# The mel is transposed and given a batch dimension before synthesis.
# NOTE(review): assumes ``item.mel`` is (frames, mel_bins) so the permute
# yields (mel_bins, frames) — confirm against HifiLibriDataset.
wav = vocoder_vf.forward(item.mel.permute((1, 0)).unsqueeze(0))
# Play the synthesized waveform inline.
display.Audio(wav.squeeze(0).cpu().detach().numpy(), rate=44100)
# wav_path = Path(f"results/{item.id}.wav")
# torchaudio.save(str(wav_path), wav, 44100)