StormblessedKal committed on
Commit
8f1c298
·
verified ·
1 Parent(s): 915136a

Upload new model: raw-datasets for ellie and ember french voices v2

Browse files
raw-datasets/ellie-ember-french-v2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b7026c219e87cc4486dc7c6b09dbc4fbd29093f76439532f802b2ef76e90e5a
3
+ size 904490716
raw-datasets/process_french_dataset_v2.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import argparse
import csv
import os
import random
import subprocess
import unicodedata

from phonemizer import phonemize
from phonemizer.backend import EspeakBackend
from tqdm import tqdm
11
+
12
# --- Configuration ---
# Default locations and speaker id. Each value is only a default: main()
# exposes a matching command-line option that overrides it.
INPUT_CSV = "/root/src/data/elevenlabs_generations_simple_ember_french_v2/mappings.csv"
INPUT_DIR = "/root/src/data/elevenlabs_generations_simple_ember_french_v2"
# Derived paths live inside INPUT_DIR.
OUTPUT_WAV_DIR = os.path.join(INPUT_DIR, "wavs")
TRAIN_LIST_OUTPUT = os.path.join(INPUT_DIR, "train_list.txt")
VAL_LIST_OUTPUT = os.path.join(INPUT_DIR, "val_list.txt")
DEFAULT_SPEAKER_ID = 3219
19
+
20
# French nasal vowels (base vowel followed by U+0303 COMBINING TILDE) mapped
# to a "vowel + eng" approximation. The keys are written with escapes so the
# invisible combining mark cannot be silently dropped or re-encoded by an
# editor or a copy/paste.
NASAL_VOWEL_MAP = {
    '\u0251\u0303': '\u0251\u014b',  # nasal ɑ -> ɑŋ
    '\u0254\u0303': '\u0254\u014b',  # nasal ɔ -> ɔŋ
    '\u025b\u0303': '\u025b\u014b',  # nasal ɛ -> ɛŋ
    '\u0153\u0303': '\u0153\u014b',  # nasal œ -> œŋ
}


def clean_phonemes(text):
    """Clean a phonemized string before it is written to a list file.

    Operations, in order:
        1. Remove every hyphen. "- " (hyphen + space) is removed first so the
           two sides are joined; any remaining bare "-" is removed afterwards.
        2. Normalize the text to Unicode NFC so the nasal-vowel lookup below
           sees one canonical form.
        3. Replace each nasal vowel listed in NASAL_VOWEL_MAP with its
           approximation.
        4. Collapse runs of whitespace to a single space and strip both ends.

    Args:
        text: Phonemized string.

    Returns:
        The cleaned string.
    """
    # Order matters: "a- b" must become "ab", not "a b".
    text = text.replace('- ', '').replace('-', '')

    text = unicodedata.normalize('NFC', text)

    for nasal_vowel, approximation in NASAL_VOWEL_MAP.items():
        text = text.replace(nasal_vowel, approximation)

    # split() with no argument also drops leading/trailing whitespace.
    return ' '.join(text.split())
55
+
56
def convert_to_24khz(input_path, output_path):
    """Convert an audio file to 24 kHz mono by running ffmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path to write; an existing file is overwritten (-y).

    Returns:
        True if ffmpeg exited with status 0. False if ffmpeg reported an
        error or could not be started at all; a message is printed in both
        failure cases.
    """
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file without asking
        "-i", input_path,
        "-ar", "24000",
        "-ac", "1",  # Mono
        output_path,
    ]
    try:
        # Discard normal output; keep stderr so it can be shown on failure.
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # ffmpeg output is not guaranteed to be valid UTF-8; never let the
        # error report itself raise.
        print(f"Error converting {input_path}: {e.stderr.decode(errors='replace')}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be executed.
        print(f"Error converting {input_path}: could not run ffmpeg: {e}")
        return False
    return True
73
+
74
def _extract_number(line):
    """Return the trailing integer in the wav filename of a list line, or 0.

    A list line looks like "wavs/french_generation_123.wav|phonemes|speaker".
    """
    filename = os.path.basename(line.split('|')[0])
    try:
        return int(filename.split('_')[-1].split('.')[0])
    except ValueError:
        # Filename does not end in "_<number>.<ext>".
        return 0


def _resolve_input_wav(input_dir, audio_file):
    """Return the existing source path for a CSV 'audio_file' value, or None.

    The CSV may hold Windows-style paths that include the dataset folder name,
    while the files sit directly in input_dir. The basename inside input_dir
    is tried first, then the full relative path next to input_dir.
    """
    normalized = audio_file.replace('\\', '/')
    candidates = (
        os.path.join(input_dir, os.path.basename(normalized)),
        os.path.join(os.path.dirname(input_dir), normalized),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def main():
    """Build train/validation list files from a CSV of generated audio.

    Steps:
        1. Read the CSV (columns used: duration_seconds, audio_file, text).
        2. Skip rows whose duration is invalid or outside 1.5s - 18s.
        3. Convert each remaining wav to 24 kHz mono into the output wav dir.
        4. Phonemize the texts with espeak (fr-fr) and clean the result.
        5. Sort lines by the number in the filename, split by --split_ratio,
           and write "path|phonemes|speaker_id" lines to the two list files.

    Problems are reported with print() and the function returns early; it
    does not raise for missing files or phonemization failures.
    """
    min_duration = 1.5
    max_duration = 18.0

    parser = argparse.ArgumentParser(description="Process French dataset.")
    parser.add_argument("--input_csv", type=str, default=INPUT_CSV)
    parser.add_argument("--input_dir", type=str, default=INPUT_DIR)
    parser.add_argument("--output_wav_dir", type=str, default=OUTPUT_WAV_DIR)
    parser.add_argument("--train_list", type=str, default=TRAIN_LIST_OUTPUT)
    parser.add_argument("--val_list", type=str, default=VAL_LIST_OUTPUT)
    parser.add_argument("--speaker_id", type=int, default=DEFAULT_SPEAKER_ID)
    parser.add_argument("--split_ratio", type=float, default=0.9)
    args = parser.parse_args()

    print(f"Input CSV: {args.input_csv}")
    print(f"Input Dir: {args.input_dir}")
    print(f"Output Wav Dir: {args.output_wav_dir}")
    print(f"Speaker ID: {args.speaker_id}")

    os.makedirs(args.output_wav_dir, exist_ok=True)

    # newline='' lets the csv module handle quoted fields containing newlines.
    with open(args.input_csv, 'r', encoding='utf-8', newline='') as f:
        entries = list(csv.DictReader(f))

    print(f"Found {len(entries)} entries.")

    processed_entries = []
    rejected_count = 0

    # First pass: convert audio and collect the texts that go with it.
    print("Converting audio to 24kHz...")
    for row in tqdm(entries):
        try:
            duration = float(row['duration_seconds'])
        except (ValueError, KeyError, TypeError):
            # TypeError: DictReader fills missing trailing fields with None.
            print(f"Warning: Invalid duration for {row.get('audio_file', 'unknown')}")
            rejected_count += 1
            continue
        # Written as a positive range test so NaN is rejected as well.
        if not (min_duration <= duration <= max_duration):
            rejected_count += 1
            continue

        audio_file = row.get('audio_file')
        text = row.get('text')
        if not audio_file or text is None:
            print(f"Warning: Missing audio_file or text in row: {row}")
            continue

        basename = os.path.basename(audio_file.replace('\\', '/'))
        input_wav_path = _resolve_input_wav(args.input_dir, audio_file)
        if input_wav_path is None:
            print(f"Warning: File not found: {os.path.join(args.input_dir, basename)}")
            continue

        output_wav_path = os.path.join(args.output_wav_dir, basename)
        if convert_to_24khz(input_wav_path, output_wav_path):
            processed_entries.append({
                # NOTE: the list files always use "wavs/<name>", regardless
                # of the value passed to --output_wav_dir.
                "path": os.path.join("wavs", basename),
                "text": text,
                "speaker_id": args.speaker_id,
            })

    if not processed_entries:
        print("No entries processed successfully.")
        print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
        return

    print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
    print(f"Processing {len(processed_entries)} files.")

    # Phonemize. The backend class is used directly (rather than the
    # phonemize() wrapper) so its constructor options are set explicitly.
    print("Phonemizing text...")
    texts_to_phonemize = [entry["text"] for entry in processed_entries]
    try:
        backend = EspeakBackend(
            language='fr-fr',
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags',
        )
        phonemized_texts = backend.phonemize(
            texts_to_phonemize,
            strip=True,
            # os.cpu_count() may return None when the count is unknown.
            njobs=max(1, (os.cpu_count() or 1) // 2),
        )
    except Exception as e:
        # Top-level boundary: report whatever the backend raised and stop.
        print(f"Phonemization failed: {e}")
        return

    if len(phonemized_texts) != len(processed_entries):
        # Pairing by position below would silently mislabel audio otherwise.
        print(
            f"Phonemization failed: expected {len(processed_entries)} results, "
            f"got {len(phonemized_texts)}"
        )
        return

    # Clean phonemes and build "path|phonemes|speaker_id" lines.
    final_lines = [
        f"{entry['path']}|{clean_phonemes(raw_ph)}|{entry['speaker_id']}\n"
        for entry, raw_ph in zip(processed_entries, phonemized_texts)
    ]

    # Filenames look like french_generation_<N>.wav; order lines by N.
    final_lines.sort(key=_extract_number)

    # The first split_ratio share goes to training, the rest to validation.
    split_idx = int(len(final_lines) * args.split_ratio)
    train_data = final_lines[:split_idx]
    val_data = final_lines[split_idx:]

    print(f"Writing {len(train_data)} training lines to {args.train_list}")
    with open(args.train_list, 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    print(f"Writing {len(val_data)} validation lines to {args.val_list}")
    with open(args.val_list, 'w', encoding='utf-8') as f:
        f.writelines(val_data)

    print("Done.")
234
+
235
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
237
+