File size: 8,923 Bytes
8f1c298 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
import os
import csv
import argparse
import subprocess
import random
import unicodedata
from phonemizer import phonemize
from phonemizer.backend import EspeakBackend
from tqdm import tqdm
# --- Configuration ---
# Default locations used as argparse defaults in main(); each can be
# overridden on the command line.
# CSV with one row per generated clip; main() reads its 'duration_seconds',
# 'audio_file' and 'text' columns.
INPUT_CSV = "/root/src/data/elevenlabs_generations_simple_ember_french_v2/mappings.csv"
# Directory that holds the source wav files referenced by the CSV.
INPUT_DIR = "/root/src/data/elevenlabs_generations_simple_ember_french_v2"
# Destination for the resampled (24 kHz mono) wav files.
OUTPUT_WAV_DIR = os.path.join(INPUT_DIR, "wavs")
# Output list files; main() writes one "path|phonemes|speaker_id" line per clip.
TRAIN_LIST_OUTPUT = os.path.join(INPUT_DIR, "train_list.txt")
VAL_LIST_OUTPUT = os.path.join(INPUT_DIR, "val_list.txt")
# Speaker id written as the last field of every list line unless --speaker_id is given.
DEFAULT_SPEAKER_ID = 3219
# Substitutions applied by clean_phonemes(): each nasal vowel is replaced by
# the plain vowel followed by 'ŋ'.
# NOTE(review): the keys appear to be a base vowel followed by a combining
# tilde (U+0303); confirm the exact code points before editing them.
NASAL_VOWEL_MAP = {
'ɑ̃': 'ɑŋ',
'ɔ̃': 'ɔŋ',
'ɛ̃': 'ɛŋ',
'œ̃': 'œŋ'
}
def clean_phonemes(text):
    """Return a cleaned copy of a phonemized string.

    The cleaning steps, in order:
      1. Drop every "- " sequence, then every remaining "-".
      2. Normalize the result to Unicode NFC.
      3. Replace each key of NASAL_VOWEL_MAP with its mapped approximation.
      4. Collapse whitespace runs to a single space and trim both ends.
    """
    # "- " has to go before the bare "-": removing it also removes the space
    # that follows the hyphen, which a plain "-" removal would leave behind.
    without_hyphens = text.replace('- ', '').replace('-', '')

    # Bring combining sequences into one canonical form before matching.
    cleaned = unicodedata.normalize('NFC', without_hyphens)

    for nasal_vowel, approximation in NASAL_VOWEL_MAP.items():
        cleaned = cleaned.replace(nasal_vowel, approximation)

    # split() with no argument discards leading/trailing/repeated whitespace.
    return ' '.join(cleaned.split())
def convert_to_24khz(input_path, output_path):
    """Convert an audio file to a 24 kHz mono file using ffmpeg.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path of the file to write; an existing file is
            overwritten (ffmpeg is run with ``-y``).

    Returns:
        True if ffmpeg exited with status 0. False if ffmpeg reported an
        error or could not be started at all (for example, the executable is
        not installed). In both failure cases a message is printed.
    """
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file without asking
        "-i", input_path,
        "-ar", "24000",  # Target sample rate
        "-ac", "1",  # Mono
        output_path,
    ]
    try:
        # Discard ffmpeg's stdout; keep stderr so it can be shown on failure.
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # ffmpeg's diagnostics may echo file names or metadata that are not
        # valid UTF-8; never let the error report itself raise.
        stderr_text = (e.stderr or b"").decode(errors="replace")
        print(f"Error converting {input_path}: {stderr_text}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable is missing or not runnable.
        print(f"Error converting {input_path}: could not run ffmpeg: {e}")
        return False
    return True
def _resolve_input_wav(input_dir, audio_file):
    """Locate the source wav for a CSV ``audio_file`` value.

    The CSV value may use Windows-style separators and may be prefixed with a
    folder name, while the files are expected directly inside ``input_dir``.
    The basename inside ``input_dir`` is tried first; if that does not exist,
    the full relative path is tried against the parent of ``input_dir``.

    Returns:
        A ``(basename, path)`` tuple. ``path`` is None when no candidate
        exists or when the value has no file name component.
    """
    normalized = audio_file.replace('\\', '/')
    basename = os.path.basename(normalized)
    if not basename:
        return basename, None

    primary = os.path.join(input_dir, basename)
    if os.path.exists(primary):
        return basename, primary

    fallback = os.path.join(os.path.dirname(input_dir), normalized)
    if os.path.exists(fallback):
        return basename, fallback

    return basename, None


def _generation_number(line):
    """Sort key for a ``path|phonemes|speaker_id`` line.

    Returns the integer between the last ``_`` of the file name and the
    following ``.`` (e.g. 123 for ``wavs/french_generation_123.wav``), or 0
    when that part is not an integer.
    """
    filename = os.path.basename(line.split('|')[0])
    try:
        return int(filename.split('_')[-1].split('.')[0])
    except ValueError:
        return 0


def main():
    """Build train/val list files from a CSV of generated French clips.

    Steps:
      1. Read the CSV and keep rows whose duration is within 1.5s - 18s.
      2. Convert each referenced wav to 24 kHz mono in the output directory.
      3. Phonemize the texts with espeak (fr-fr) and clean the phonemes.
      4. Sort lines by the number in the file name, split by ``--split_ratio``
         and write the train and validation lists.

    Rows with an invalid duration, a missing file name, blank text, or a
    missing source file are skipped with a warning instead of aborting.
    """
    parser = argparse.ArgumentParser(description="Process French dataset.")
    parser.add_argument("--input_csv", type=str, default=INPUT_CSV)
    parser.add_argument("--input_dir", type=str, default=INPUT_DIR)
    # NOTE: the three output defaults below are derived from the module-level
    # INPUT_DIR, not from --input_dir; pass them explicitly when overriding
    # --input_dir.
    parser.add_argument("--output_wav_dir", type=str, default=OUTPUT_WAV_DIR)
    parser.add_argument("--train_list", type=str, default=TRAIN_LIST_OUTPUT)
    parser.add_argument("--val_list", type=str, default=VAL_LIST_OUTPUT)
    parser.add_argument("--speaker_id", type=int, default=DEFAULT_SPEAKER_ID)
    parser.add_argument("--split_ratio", type=float, default=0.9)
    args = parser.parse_args()

    print(f"Input CSV: {args.input_csv}")
    print(f"Input Dir: {args.input_dir}")
    print(f"Output Wav Dir: {args.output_wav_dir}")
    print(f"Speaker ID: {args.speaker_id}")

    os.makedirs(args.output_wav_dir, exist_ok=True)

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(args.input_csv, 'r', encoding='utf-8', newline='') as f:
        entries = list(csv.DictReader(f))
    print(f"Found {len(entries)} entries.")

    processed_entries = []
    rejected_count = 0

    # First pass: filter rows and convert audio.
    print("Converting audio to 24kHz...")
    for row in tqdm(entries):
        try:
            duration = float(row['duration_seconds'])
        except (ValueError, KeyError, TypeError):
            # TypeError covers short rows, where DictReader fills the
            # missing column with None.
            print(f"Warning: Invalid duration for {row.get('audio_file', 'unknown')}")
            rejected_count += 1
            continue
        # Written as a chained comparison so that NaN is rejected as well.
        if not (1.5 <= duration <= 18.0):
            rejected_count += 1
            continue

        audio_file = row.get('audio_file')
        text = row.get('text')
        if not audio_file or not text or not text.strip():
            print(f"Warning: Missing audio_file or text for {audio_file or 'unknown'}")
            continue

        basename, input_wav_path = _resolve_input_wav(args.input_dir, audio_file)
        if input_wav_path is None:
            print(f"Warning: File not found: {os.path.join(args.input_dir, basename)}")
            continue

        output_wav_path = os.path.join(args.output_wav_dir, basename)
        if convert_to_24khz(input_wav_path, output_wav_path):
            processed_entries.append({
                # List files reference the clip relative to the dataset root.
                "path": os.path.join("wavs", basename),
                "text": text,
                "speaker_id": args.speaker_id,
            })

    if not processed_entries:
        print("No entries processed successfully.")
        print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
        return

    print(f"Rejected {rejected_count} files due to duration constraints (1.5s - 18s).")
    print(f"Processing {len(processed_entries)} files.")

    # Second pass: phonemize all texts in one batch.
    print("Phonemizing text...")
    texts_to_phonemize = [entry["text"] for entry in processed_entries]
    try:
        backend = EspeakBackend(
            language='fr-fr',
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags',
        )
        # os.cpu_count() may return None when the count cannot be determined.
        njobs = max(1, (os.cpu_count() or 1) // 2)
        phonemized_texts = backend.phonemize(
            texts_to_phonemize,
            strip=True,
            njobs=njobs,
        )
    except Exception as e:
        # Boundary with the third-party backend: report and stop without
        # writing partial list files.
        print(f"Phonemization failed: {e}")
        return

    # Entries and phonemes are paired by position; refuse to write
    # misaligned lists.
    if len(phonemized_texts) != len(processed_entries):
        print(
            f"Phonemization failed: expected {len(processed_entries)} results, "
            f"got {len(phonemized_texts)}"
        )
        return

    final_lines = [
        f"{entry['path']}|{clean_phonemes(raw_ph)}|{entry['speaker_id']}\n"
        for entry, raw_ph in zip(processed_entries, phonemized_texts)
    ]

    # Order by the number in the file name; the split below therefore puts
    # the highest-numbered clips into the validation list.
    final_lines.sort(key=_generation_number)

    split_idx = int(len(final_lines) * args.split_ratio)
    train_data = final_lines[:split_idx]
    val_data = final_lines[split_idx:]

    print(f"Writing {len(train_data)} training lines to {args.train_list}")
    with open(args.train_list, 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    print(f"Writing {len(val_data)} validation lines to {args.val_list}")
    with open(args.val_list, 'w', encoding='utf-8') as f:
        f.writelines(val_data)

    print("Done.")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|