File size: 8,923 Bytes
8f1c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238

import os
import csv
import argparse
import subprocess
import random
import unicodedata
from phonemizer import phonemize
from phonemizer.backend import EspeakBackend
from tqdm import tqdm

# --- Configuration ---
# Defaults for the command-line options declared in main(); every value below
# can be overridden with the matching flag.

# CSV index of the clips. main() reads its `audio_file`, `duration_seconds`
# and `text` columns.
INPUT_CSV = "/root/src/data/elevenlabs_generations_simple_ember_french_v2/mappings.csv"
# Directory that is searched for the source wav files named in the CSV.
INPUT_DIR = "/root/src/data/elevenlabs_generations_simple_ember_french_v2"
# Destination directory for the 24 kHz mono conversions.
OUTPUT_WAV_DIR = os.path.join(INPUT_DIR, "wavs")
# Output list files, one "path|phonemes|speaker_id" line per clip.
TRAIN_LIST_OUTPUT = os.path.join(INPUT_DIR, "train_list.txt")
VAL_LIST_OUTPUT = os.path.join(INPUT_DIR, "val_list.txt")
# Speaker id written as the last field of every list line.
DEFAULT_SPEAKER_ID = 3219 

# Nasal vowels (a vowel carrying a tilde) and the two-symbol stand-in used for
# each of them: the plain vowel followed by "ŋ".
NASAL_VOWEL_MAP = {
    'ɑ̃': 'ɑŋ',
    'ɔ̃': 'ɔŋ',
    'ɛ̃': 'ɛŋ',
    'œ̃': 'œŋ'
}

def clean_phonemes(text):
    """
    Tidy a phoneme string the same way the train_list and val_list files of
    the emma French Voices were cleaned.

    The passes run in this order:
    1. Delete every "- " (a hyphen plus the space after it), then every
       hyphen that is still left.
    2. Apply Unicode NFC normalisation.
    3. Swap each nasal vowel listed in NASAL_VOWEL_MAP for its approximation.
    4. Collapse whitespace runs to a single space and trim both ends.

    Returns the cleaned string.
    """
    # Order matters: "- " has to go first so its trailing space disappears
    # together with the hyphen.
    without_hyphens = text.replace('- ', '').replace('-', '')

    result = unicodedata.normalize('NFC', without_hyphens)

    for nasal, substitute in NASAL_VOWEL_MAP.items():
        result = result.replace(nasal, substitute)

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(result.split())

def convert_to_24khz(input_path, output_path):
    """Convert an audio file to 24 kHz mono using ffmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path of the file to write. An existing file is
            overwritten (ffmpeg is called with ``-y``).

    Returns:
        True when ffmpeg exits with status 0. False when ffmpeg exits with a
        non-zero status or cannot be started at all; in both failure cases a
        one-line error message is printed.
    """
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file without asking
        "-i", input_path,
        "-ar", "24000",  # Output sample rate in Hz
        "-ac", "1",  # Mono
        output_path,
    ]
    try:
        subprocess.run(
            cmd,
            check=True,
            # ffmpeg listens for interactive commands on stdin by default;
            # detach it so a batch run cannot consume the caller's input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            # Captured so it is only shown when the conversion fails.
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8 (it echoes file
        # names and metadata), so decode leniently: a strict decode raising
        # here would abort the whole batch from inside the error handler.
        detail = (e.stderr or b"").decode("utf-8", errors="replace")
        print(f"Error converting {input_path}: {detail}")
        return False
    except OSError as e:
        # The ffmpeg executable is missing or cannot be executed.
        print(f"Error converting {input_path}: could not run ffmpeg: {e}")
        return False
    return True

# Accepted clip duration range in seconds (inclusive on both ends).
_MIN_DURATION_S = 1.5
_MAX_DURATION_S = 18.0

def _parse_duration(row):
    """Return the row's `duration_seconds` as a float, or None if unusable."""
    try:
        return float(row['duration_seconds'])
    except (KeyError, TypeError, ValueError):
        # KeyError: column absent. TypeError: csv.DictReader filled a short
        # row with None. ValueError: the cell is not a number.
        return None

def _find_source_wav(input_dir, audio_file):
    """Return (basename, path) for a CSV audio entry; path is None if no file exists.

    The CSV value may use Windows separators and may carry a directory
    prefix. The file is looked up first as `input_dir/basename`, then as the
    full CSV path relative to the parent of `input_dir`.
    """
    normalized = audio_file.replace('\\', '/')
    basename = os.path.basename(normalized)
    candidates = (
        os.path.join(input_dir, basename),
        os.path.join(os.path.dirname(input_dir), normalized),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return basename, candidate
    return basename, None

def _generation_number(line):
    """Sort key for a list line: the integer after the last '_' of the wav name, else 0."""
    # Line layout is "path|phonemes|speaker_id", e.g. wavs/french_generation_123.wav|...
    filename = os.path.basename(line.split('|')[0])
    try:
        return int(filename.split('_')[-1].split('.')[0])
    except ValueError:
        return 0

def main():
    """Build train/val list files from a CSV of French clips.

    Steps:
    1. Parse the command-line options (defaults come from the module-level
       configuration constants).
    2. Read the CSV and, for every row whose duration lies within
       1.5 s - 18 s, locate the wav file and convert it to 24 kHz mono into
       the output wav directory.
    3. Phonemize the texts of all successfully converted clips with the
       espeak backend ('fr-fr') and clean the result with clean_phonemes().
    4. Sort the resulting "path|phonemes|speaker_id" lines by the number in
       the wav file name and write the first `split_ratio` share to the train
       list and the remainder to the validation list.

    Returns None. Problems with individual rows are reported on stdout and
    the row is skipped; a phonemization failure is reported and ends the run
    without writing the list files.
    """
    parser = argparse.ArgumentParser(description="Process French dataset.")
    parser.add_argument("--input_csv", type=str, default=INPUT_CSV)
    parser.add_argument("--input_dir", type=str, default=INPUT_DIR)
    parser.add_argument("--output_wav_dir", type=str, default=OUTPUT_WAV_DIR)
    parser.add_argument("--train_list", type=str, default=TRAIN_LIST_OUTPUT)
    parser.add_argument("--val_list", type=str, default=VAL_LIST_OUTPUT)
    parser.add_argument("--speaker_id", type=int, default=DEFAULT_SPEAKER_ID)
    parser.add_argument("--split_ratio", type=float, default=0.9)
    args = parser.parse_args()

    # A ratio outside [0, 1] (or NaN) would yield a meaningless slice index.
    if not 0.0 <= args.split_ratio <= 1.0:
        parser.error("--split_ratio must be between 0 and 1")

    print(f"Input CSV: {args.input_csv}")
    print(f"Input Dir: {args.input_dir}")
    print(f"Output Wav Dir: {args.output_wav_dir}")
    print(f"Speaker ID: {args.speaker_id}")

    os.makedirs(args.output_wav_dir, exist_ok=True)

    # newline='' is what the csv module asks for so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(args.input_csv, 'r', encoding='utf-8', newline='') as f:
        entries = list(csv.DictReader(f))

    print(f"Found {len(entries)} entries.")

    processed_entries = []
    rejected_count = 0  # bad or out-of-range duration
    skipped_count = 0   # missing fields, missing file, or failed conversion

    # First pass: filter rows and convert the audio.
    print("Converting audio to 24kHz...")
    for row in tqdm(entries):
        duration = _parse_duration(row)
        if duration is None:
            print(f"Warning: Invalid duration for {row.get('audio_file', 'unknown')}")
            rejected_count += 1
            continue
        # Written as a positive range test so that NaN is rejected as well.
        if not (_MIN_DURATION_S <= duration <= _MAX_DURATION_S):
            rejected_count += 1
            continue

        audio_file = row.get('audio_file')
        text = row.get('text')
        # Checked before converting so no ffmpeg work is spent on rows that
        # cannot produce a usable training line.
        if not audio_file or text is None or not text.strip():
            print(f"Warning: Missing audio_file or text for {audio_file or 'unknown'}")
            skipped_count += 1
            continue

        basename, input_wav_path = _find_source_wav(args.input_dir, audio_file)
        if input_wav_path is None:
            print(f"Warning: File not found: {os.path.join(args.input_dir, basename)}")
            skipped_count += 1
            continue

        output_wav_path = os.path.join(args.output_wav_dir, basename)
        if not convert_to_24khz(input_wav_path, output_wav_path):
            skipped_count += 1
            continue

        processed_entries.append({
            # NOTE: list entries always use the literal "wavs/" prefix,
            # independent of --output_wav_dir.
            "path": os.path.join("wavs", basename),
            "text": text,
            "speaker_id": args.speaker_id,
        })

    if not processed_entries:
        print("No entries processed successfully.")
    print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
    if skipped_count:
        print(f"Skipped {skipped_count} files (missing fields, missing audio, or failed conversion).")
    if not processed_entries:
        return

    print(f"Processing {len(processed_entries)} files.")

    # Second pass: phonemize every text in one batch.
    print("Phonemizing text...")
    texts_to_phonemize = [entry["text"] for entry in processed_entries]
    try:
        # The backend class is used directly so the espeak options below are
        # set explicitly.
        backend = EspeakBackend(
            language='fr-fr',
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags'
        )
        phonemized_texts = backend.phonemize(
            texts_to_phonemize,
            strip=True,
            # os.cpu_count() may return None when the count is unknown.
            njobs=max(1, (os.cpu_count() or 1) // 2)
        )
    except Exception as e:
        print(f"Phonemization failed: {e}")
        return

    # Entries and phonemizations are paired by position; refuse to continue
    # if the backend did not return exactly one result per input.
    if len(phonemized_texts) != len(processed_entries):
        print(
            "Phonemization failed: expected "
            f"{len(processed_entries)} results, got {len(phonemized_texts)}"
        )
        return

    final_lines = [
        f"{entry['path']}|{clean_phonemes(raw_ph)}|{entry['speaker_id']}\n"
        for entry, raw_ph in zip(processed_entries, phonemized_texts)
    ]

    # Order by the number in the file name (french_generation_X.wav), so the
    # validation set is made of the highest-numbered clips.
    final_lines.sort(key=_generation_number)

    split_idx = int(len(final_lines) * args.split_ratio)
    train_data = final_lines[:split_idx]
    val_data = final_lines[split_idx:]

    print(f"Writing {len(train_data)} training lines to {args.train_list}")
    with open(args.train_list, 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    print(f"Writing {len(val_data)} validation lines to {args.val_list}")
    with open(args.val_list, 'w', encoding='utf-8') as f:
        f.writelines(val_data)

    print("Done.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()