ashishkblink committed on
Commit
31d2662
·
verified ·
1 Parent(s): f36d2e1

Upload f5_tts/train/datasets/prepare_ljspeech.py with huggingface_hub

Browse files
f5_tts/train/datasets/prepare_ljspeech.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ import json
7
+ from importlib.resources import files
8
+ from pathlib import Path
9
+ from tqdm import tqdm
10
+ import soundfile as sf
11
+ from datasets.arrow_writer import ArrowWriter
12
+
13
+
14
def main():
    """Preprocess the LJSpeech metadata into the files written under ``save_dir``.

    Reads the module-level ``meta_info`` file, whose rows are pipe-separated
    ``utterance_id|text|normalized_text``, resolves each clip as
    ``dataset_dir/wavs/<utterance_id>.wav`` and keeps only clips whose
    duration lies within [0.4, 30] seconds. Three files are then written to
    ``save_dir``:

    * ``raw.arrow``     - one record per kept clip (audio_path, text, duration)
    * ``duration.json`` - ``{"duration": [...]}`` in the same order as the records
    * ``vocab.txt``     - the sorted set of characters seen in the kept texts

    The function takes no arguments; it relies on the globals ``meta_info``,
    ``dataset_dir``, ``save_dir`` and ``dataset_name`` being defined before it
    is called.

    Raises:
        ValueError: if a non-blank metadata row does not have exactly three
            pipe-separated fields.
    """
    result = []
    duration_list = []
    text_vocab_set = set()

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding (consistent with the UTF-8 used for duration.json).
    with open(meta_info, "r", encoding="utf-8") as f:
        lines = f.readlines()  # materialized so tqdm can show a total

    for line in tqdm(lines):
        if not line.strip():
            # Tolerate blank / trailing empty lines instead of failing the unpack.
            continue
        uttr, _text, norm_text = line.split("|")
        norm_text = norm_text.strip()
        wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
        duration = sf.info(wav_path).duration
        if duration < 0.4 or duration > 30:
            continue
        result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
        duration_list.append(duration)
        text_vocab_set.update(norm_text)  # a str already iterates per character

    # save preprocessed dataset to disk
    os.makedirs(save_dir, exist_ok=True)
    print(f"\nSaving to {save_dir} ...")

    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for record in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(record)
        if result:
            # write() only buffers examples; finalize() flushes the remaining
            # buffered rows so the tail of the dataset is not lost. Skipped for
            # an empty dataset, where there is no schema to infer.
            writer.finalize()

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{vocab}\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
54
+
55
+
56
if __name__ == "__main__":
    # Tokenizer tag; in this script it only names the output dataset.
    tokenizer = "char"  # "pinyin" | "char"
    dataset_name = f"LJSpeech_{tokenizer}"

    # Root of the LJSpeech-1.1 directory (replace the placeholder) and the
    # metadata file inside it.
    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
    meta_info = os.path.join(dataset_dir, "metadata.csv")

    # Output directory: "<f5_tts package dir>/../../" followed by /data/<dataset_name>.
    save_dir = "{}/data/{}".format(str(files("f5_tts").joinpath("../../")), dataset_name)

    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")

    # main() reads the globals assigned above.
    main()