Upload new model: raw-datasets for ellie and ember french voices v2

Browse files

Files changed (2) hide show

raw-datasets/ellie-ember-french-v2.zip +3 -0
raw-datasets/process_french_dataset_v2.py +237 -0

raw-datasets/ellie-ember-french-v2.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b7026c219e87cc4486dc7c6b09dbc4fbd29093f76439532f802b2ef76e90e5a
+size 904490716

raw-datasets/process_french_dataset_v2.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import os
+import csv
+import argparse
+import subprocess
+import random
+import unicodedata
+from phonemizer import phonemize
+from phonemizer.backend import EspeakBackend
+from tqdm import tqdm
# --- Configuration ---
# Default locations and ids. Every value below is only a default: main()
# exposes each one as a command-line option.

# Directory that holds the source wav files and the CSV manifest.
INPUT_DIR = "/root/src/data/elevenlabs_generations_simple_ember_french_v2"
# CSV manifest (columns read by main(): duration_seconds, audio_file, text).
# Derived from INPUT_DIR so the two defaults cannot drift apart.
INPUT_CSV = os.path.join(INPUT_DIR, "mappings.csv")
# Destination directory for the 24 kHz mono conversions.
OUTPUT_WAV_DIR = os.path.join(INPUT_DIR, "wavs")
# Output list files, one "path|phonemes|speaker_id" line per utterance.
TRAIN_LIST_OUTPUT = os.path.join(INPUT_DIR, "train_list.txt")
VAL_LIST_OUTPUT = os.path.join(INPUT_DIR, "val_list.txt")
# Speaker id written as the last field of every list line.
DEFAULT_SPEAKER_ID = 3219
# French nasal vowels (base vowel followed by the combining tilde U+0303)
# mapped to a "vowel + ŋ" approximation.
NASAL_VOWEL_MAP = {
    'ɑ̃': 'ɑŋ',
    'ɔ̃': 'ɔŋ',
    'ɛ̃': 'ɛŋ',
    'œ̃': 'œŋ',
}


def clean_phonemes(text):
    """Return a cleaned copy of a phonemized string.

    Per the original author's note, this mirrors the cleanup previously
    applied to the train/val lists of the emma French voices.

    Operations, in order:
    1. Remove every hyphen. A hyphen followed by a space is removed together
       with that space, so the two parts are joined.
    2. Normalize to Unicode NFC so combining marks are in a canonical form.
    3. Replace each nasal vowel in NASAL_VOWEL_MAP with its approximation.
    4. Collapse runs of whitespace to single spaces and strip the ends.

    Args:
        text: Phoneme string to clean.

    Returns:
        The cleaned phoneme string.
    """
    # Order matters: "- " must go first so the trailing space disappears with
    # the hyphen; the second pass removes any remaining bare hyphens.
    text = text.replace('- ', '').replace('-', '')

    text = unicodedata.normalize('NFC', text)

    for nasal_vowel, approximation in NASAL_VOWEL_MAP.items():
        # Normalize the key exactly like the text. Without this, a key that
        # has a precomposed NFC form would never match the normalized text.
        key = unicodedata.normalize('NFC', nasal_vowel)
        text = text.replace(key, approximation)

    return ' '.join(text.split())
def convert_to_24khz(input_path, output_path):
    """Convert an audio file to 24 kHz mono using ffmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path of the converted file; overwritten if it exists.

    Returns:
        True if ffmpeg exited successfully, False otherwise. Failures are
        reported on stdout and never raised, so a batch loop can continue.
    """
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file without asking
        "-i", input_path,
        "-ar", "24000",
        "-ac", "1",  # Mono
        output_path,
    ]
    try:
        subprocess.run(
            cmd,
            check=True,
            # Keep ffmpeg from reading (and swallowing) the caller's stdin.
            stdin=subprocess.DEVNULL,
            # Suppress normal output; keep stderr for the error message.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # ffmpeg output is not guaranteed to be valid UTF-8.
        detail = e.stderr.decode(errors="replace") if e.stderr else ""
        print(f"Error converting {input_path}: {detail}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be started.
        print(f"Error converting {input_path}: could not run ffmpeg ({e})")
        return False
    return True
def main():
    """Build train/val phoneme lists from a CSV manifest of French utterances.

    Steps:
    1. Read the CSV manifest (columns used: duration_seconds, audio_file,
       text).
    2. Drop rows whose duration is invalid or outside 1.5 s - 18 s, rows with
       no audio file name or text, and rows whose wav cannot be found.
    3. Convert each remaining wav to 24 kHz mono into the output directory.
    4. Phonemize all texts with the espeak backend (fr-fr) and clean them
       with clean_phonemes().
    5. Sort the "path|phonemes|speaker_id" lines by the trailing number in
       the file name, then write the first split_ratio share to the train
       list and the remainder to the validation list.

    All paths, the speaker id and the split ratio come from command-line
    options whose defaults are the module-level configuration constants.
    """
    parser = argparse.ArgumentParser(description="Process French dataset.")
    parser.add_argument("--input_csv", type=str, default=INPUT_CSV)
    parser.add_argument("--input_dir", type=str, default=INPUT_DIR)
    parser.add_argument("--output_wav_dir", type=str, default=OUTPUT_WAV_DIR)
    parser.add_argument("--train_list", type=str, default=TRAIN_LIST_OUTPUT)
    parser.add_argument("--val_list", type=str, default=VAL_LIST_OUTPUT)
    parser.add_argument("--speaker_id", type=int, default=DEFAULT_SPEAKER_ID)
    parser.add_argument("--split_ratio", type=float, default=0.9)
    args = parser.parse_args()

    # A ratio outside [0, 1] would silently produce a nonsensical split.
    if not 0.0 <= args.split_ratio <= 1.0:
        parser.error("--split_ratio must be between 0 and 1")

    print(f"Input CSV: {args.input_csv}")
    print(f"Input Dir: {args.input_dir}")
    print(f"Output Wav Dir: {args.output_wav_dir}")
    print(f"Speaker ID: {args.speaker_id}")
    os.makedirs(args.output_wav_dir, exist_ok=True)

    # newline='' lets the csv module handle quoted fields containing newlines.
    with open(args.input_csv, 'r', encoding='utf-8', newline='') as f:
        entries = list(csv.DictReader(f))
    print(f"Found {len(entries)} entries.")

    processed_entries = []
    texts_to_phonemize = []
    rejected_count = 0  # invalid or out-of-range duration
    skipped_count = 0   # missing fields or missing wav file

    # First pass: filter rows, convert audio and collect texts.
    print("Converting audio to 24kHz...")
    for row in tqdm(entries):
        try:
            duration = float(row['duration_seconds'])
        except (ValueError, KeyError, TypeError):
            # TypeError covers short rows, where DictReader fills in None.
            print(f"Warning: Invalid duration for {row.get('audio_file', 'unknown')}")
            rejected_count += 1
            continue
        # Written as a positive range test so NaN is rejected as well.
        if not 1.5 <= duration <= 18.0:
            rejected_count += 1
            continue

        audio_file = row.get('audio_file')
        # Collapse all whitespace (including embedded newlines) so every
        # utterance is exactly one line for the phonemizer and list files.
        text = ' '.join((row.get('text') or '').split())
        if not audio_file or not text:
            print(f"Warning: Missing audio_file or text for {audio_file or 'unknown'}")
            skipped_count += 1
            continue

        # The CSV may contain Windows-style paths that include the dataset
        # folder name, while the wav files are expected directly in
        # input_dir, so look up the basename first.
        orig_filename = audio_file.replace('\\', '/')
        basename = os.path.basename(orig_filename)
        input_wav_path = os.path.join(args.input_dir, basename)
        if not os.path.exists(input_wav_path):
            # Fall back to the CSV path relative to input_dir's parent.
            input_wav_path_alt = os.path.join(os.path.dirname(args.input_dir), orig_filename)
            if not os.path.exists(input_wav_path_alt):
                print(f"Warning: File not found: {input_wav_path}")
                skipped_count += 1
                continue
            input_wav_path = input_wav_path_alt

        output_wav_path = os.path.join(args.output_wav_dir, basename)
        if convert_to_24khz(input_wav_path, output_wav_path):
            processed_entries.append({
                # List files use a path relative to the dataset: wavs/<basename>
                "path": os.path.join("wavs", basename),
                "text": text,
                "speaker_id": args.speaker_id,
            })
            texts_to_phonemize.append(text)

    if not processed_entries:
        print("No entries processed successfully.")
        print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
        return
    print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
    if skipped_count:
        print(f"Skipped {skipped_count} rows with missing fields or missing audio files.")
    print(f"Processing {len(processed_entries)} files.")

    # Phonemize all texts in one batch using the espeak backend directly.
    print("Phonemizing text...")
    try:
        backend = EspeakBackend(
            language='fr-fr',
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags',
        )
        phonemized_texts = backend.phonemize(
            texts_to_phonemize,
            strip=True,
            # os.cpu_count() may return None when the count is unknown.
            njobs=max(1, (os.cpu_count() or 1) // 2),
        )
    except Exception as e:
        # Script-level boundary: report the backend failure and stop.
        print(f"Phonemization failed: {e}")
        return

    # Guard against silent text/audio misalignment.
    if len(phonemized_texts) != len(processed_entries):
        print(
            f"Phonemization failed: expected {len(processed_entries)} outputs, "
            f"got {len(phonemized_texts)}"
        )
        return

    def extract_number(path):
        """Return the trailing integer of e.g. wavs/french_generation_123.wav, or 0."""
        filename = os.path.basename(path)
        try:
            return int(filename.split('_')[-1].split('.')[0])
        except ValueError:
            return 0

    # Sort numerically by file number (stable, so ties keep CSV order).
    pairs = sorted(
        zip(processed_entries, phonemized_texts),
        key=lambda pair: extract_number(pair[0]['path']),
    )
    final_lines = [
        f"{entry['path']}|{clean_phonemes(raw_ph)}|{entry['speaker_id']}\n"
        for entry, raw_ph in pairs
    ]

    # Split: the head of the sorted list is training data, the tail validation.
    split_idx = int(len(final_lines) * args.split_ratio)
    train_data = final_lines[:split_idx]
    val_data = final_lines[split_idx:]

    print(f"Writing {len(train_data)} training lines to {args.train_list}")
    with open(args.train_list, 'w', encoding='utf-8') as f:
        f.writelines(train_data)
    print(f"Writing {len(val_data)} validation lines to {args.val_list}")
    with open(args.val_list, 'w', encoding='utf-8') as f:
        f.writelines(val_data)
    print("Done.")


if __name__ == "__main__":
    main()