#!/usr/bin/env python3
"""
Nigerian TTS Data Preprocessor V4 (Simplified)
===============================================
Prepares Nigerian speech datasets (Pidgin, English, Yoruba, Hausa, Igbo)
for TTS training.
Stores audio and text - WavTokenizer encoding happens during training.

Outputs: UbuntuFarms/nigerian-tts-preprocessed-v4
"""

import os
import time
import traceback
from collections import Counter
from itertools import islice

# Set before the third-party imports below so it is already in the
# environment when they load (presumably to keep transformers from pulling
# in TensorFlow - confirm if the import order is ever changed).
os.environ["TRANSFORMERS_NO_TF"] = "1"

import gradio as gr
import numpy as np
from datasets import load_dataset, Dataset, Audio
from huggingface_hub import HfApi, login

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN", "")
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v4"
SAMPLE_RATE = 24000
MAX_DURATION = 20.0  # seconds
MIN_DURATION = 1.0  # seconds

# Datasets to process.
# Each entry: source repo, optional builder config, audio/text column names,
# and the "language" tag that is also used as the output config name.
DATASETS_CONFIG = {
    # === PIDGIN ===
    "pidgin": {
        "repo": "asr-nigerian-pidgin/nigerian-pidgin-1.0",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "pidgin",
    },
    # === ENGLISH ===
    "english_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "english",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "english_cv",
    },
    "english_accented": {
        "repo": "benjaminogbonna/nigerian_accented_english_dataset",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "english_accented",
    },
    # === YORUBA (Additional) ===
    "yoruba_parallel": {
        "repo": "michsethowusu/yoruba-speech-text-parallel",
        "audio_col": "audio",
        "text_col": "text",
        "language": "yoruba_extra",
    },
    "yoruba_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "yoruba",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "yoruba_cv",
    },
    # === HAUSA (Additional) ===
    "hausa_twb": {
        "repo": "CLEAR-Global/TWB-Voice-1.0",
        "config": "hau",
        "audio_col": "audio",
        "text_col": "text",
        "language": "hausa_twb",
    },
    "hausa_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "hausa",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "hausa_cv",
    },
    # === IGBO (Additional) ===
    "igbo_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "igbo",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "igbo_cv",
    },
}

# Log lines for the dataset currently being processed (reset per dataset).
processing_log = []


def log(msg):
    """Append a timestamped message to the processing log and echo it.

    Returns:
        The last 50 log lines joined with newlines.
    """
    timestamp = time.strftime("%H:%M:%S")
    log_msg = f"[{timestamp}] {msg}"
    processing_log.append(log_msg)
    print(log_msg)
    return "\n".join(processing_log[-50:])


def process_sample(sample, language, text_col="sentence"):
    """Validate a single raw sample and convert it to the output format.

    Args:
        sample: Mapping holding an "audio" entry and the text column.
        language: Language tag stored on the output record.
        text_col: Name of the key in ``sample`` that holds the transcript.

    Returns:
        ``(record, None)`` on success, where ``record`` has the keys
        "audio", "text", "language" and "duration"; otherwise
        ``(None, reason)`` with a short rejection reason.  Never raises:
        unexpected errors are reported as the reason string.
    """
    try:
        audio = sample.get("audio")
        if audio is None:
            return None, "No audio"

        # Plain dicts are checked first: they also expose __getitem__, so
        # testing hasattr() first would make this branch unreachable.
        if isinstance(audio, dict):
            audio_array = audio.get("array")
            sample_rate = audio.get("sampling_rate")
        elif hasattr(audio, "__getitem__"):
            # Other subscriptable audio objects (e.g. lazy decoders).
            audio_array = audio["array"]
            sample_rate = audio["sampling_rate"]
        else:
            return None, f"Unknown audio format: {type(audio)}"

        if audio_array is None or len(audio_array) == 0:
            return None, "Empty audio"

        # Reject rather than guess a rate: a wrong rate would corrupt both
        # the duration filter and the stored audio metadata.
        if not sample_rate or sample_rate <= 0:
            return None, f"Invalid sampling rate: {sample_rate}"

        duration = len(audio_array) / sample_rate
        if duration < MIN_DURATION:
            return None, f"Too short: {duration:.1f}s"
        if duration > MAX_DURATION:
            return None, f"Too long: {duration:.1f}s"

        text = sample.get(text_col, "")
        if not isinstance(text, str) or len(text.strip()) < 2:
            return None, "No text"

        record = {
            "audio": {
                "array": np.array(audio_array, dtype=np.float32),
                "sampling_rate": sample_rate,
            },
            "text": text.strip(),
            "language": language,
            "duration": duration,
        }
        return record, None
    except Exception as e:
        # Best-effort per-sample processing: one bad sample must not abort
        # the whole run, so the error becomes the rejection reason.
        return None, str(e)


def process_dataset(dataset_key, max_samples=5000, progress=gr.Progress()):
    """Stream one configured dataset, filter it, and push it to the Hub.

    Args:
        dataset_key: Key into ``DATASETS_CONFIG``.
        max_samples: Maximum number of source samples to read.
        progress: Gradio progress tracker (the default instance is the
            idiom Gradio uses to inject progress reporting).

    Returns:
        ``(status_message, log_text)``.  Failures are reported through the
        status message rather than raised.
    """
    # Start a fresh log for this dataset.
    processing_log.clear()

    if dataset_key not in DATASETS_CONFIG:
        return f"Unknown dataset: {dataset_key}", ""

    config = DATASETS_CONFIG[dataset_key]
    log(f"Processing: {dataset_key}")
    log(f"Repository: {config['repo']}")

    # Login to HuggingFace
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set", "\n".join(processing_log)
    try:
        login(token=HF_TOKEN)
    except Exception as e:
        log(f"Login error: {e}")
        return f"Error: {e}", "\n".join(processing_log)
    log("Logged in to HuggingFace")

    # Load dataset
    log("Loading dataset...")
    try:
        if "config" in config:
            ds = load_dataset(
                config["repo"], config["config"], split="train", streaming=True
            )
        else:
            ds = load_dataset(config["repo"], split="train", streaming=True)
        log("Dataset loaded (streaming mode)")
    except Exception as e:
        log(f"Error loading dataset: {e}")
        return f"Error: {e}", "\n".join(processing_log)

    # Process samples
    processed = []
    errors = Counter()
    limit = int(max_samples)
    text_col = config.get("text_col", "sentence")
    log(f"Processing up to {max_samples} samples...")

    # islice stops the stream exactly at the limit, so no extra sample is
    # downloaded/decoded just to discover that the limit was reached.
    for i, sample in enumerate(islice(ds, limit)):
        if i % 100 == 0:
            progress((i / max_samples), f"Processing {i}/{max_samples}")
            log(
                f"Progress: {i}/{max_samples} "
                f"(processed: {len(processed)}, errors: {sum(errors.values())})"
            )

        result, error = process_sample(sample, config["language"], text_col)
        if result:
            processed.append(result)
        else:
            errors[error] += 1

    log(f"Processed: {len(processed)} samples")
    log(f"Errors: {sum(errors.values())}")
    for error, count in errors.most_common(5):
        log(f" - {error}: {count}")

    if not processed:
        return "No samples processed successfully", "\n".join(processing_log)

    # Create dataset
    log("Creating HuggingFace dataset...")
    output_ds = Dataset.from_list(processed)

    # Cast audio column
    output_ds = output_ds.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

    # Push to hub
    log(f"Pushing to {OUTPUT_DATASET}...")
    try:
        api = HfApi(token=HF_TOKEN)
        # exist_ok=True makes this a no-op when the repo already exists, so
        # no separate existence probe (and no blanket except) is needed.
        api.create_repo(OUTPUT_DATASET, repo_type="dataset", exist_ok=True)

        output_ds.push_to_hub(
            OUTPUT_DATASET,
            config_name=config["language"],
            token=HF_TOKEN,
            commit_message=f"Add {config['language']} data from {config['repo']}",
        )
        log(f"Pushed to {OUTPUT_DATASET} (config: {config['language']})")
    except Exception as e:
        log(f"Push error: {e}")
        log(traceback.format_exc())
        return f"Push error: {e}", "\n".join(processing_log)

    return (
        f"Success! Processed {len(processed)} {config['language']} samples",
        "\n".join(processing_log),
    )


def process_all(max_per_dataset=5000, progress=gr.Progress()):
    """Process every dataset in ``DATASETS_CONFIG`` in order.

    Args:
        max_per_dataset: Sample limit applied to each dataset.
        progress: Gradio progress tracker.

    Returns:
        ``(summary, log_text)`` where ``summary`` has one status line per
        dataset and ``log_text`` concatenates the logs of all datasets.
    """
    results = []
    logs = []
    total = len(DATASETS_CONFIG)
    for i, key in enumerate(DATASETS_CONFIG):
        progress((i / total), f"Processing {key}...")
        # process_dataset resets the shared log on every call, so keep each
        # dataset's log here instead of reading the global afterwards.
        result, dataset_log = process_dataset(key, max_per_dataset, progress)
        results.append(f"{key}: {result}")
        if dataset_log:
            logs.append(dataset_log)
    return "\n".join(results), "\n".join(logs)


# Gradio UI
with gr.Blocks(title="Nigerian TTS Preprocessor V4") as demo:
    # NOTE(review): the dataset list below names only the Pidgin/English
    # sources, while the dropdown offers every key in DATASETS_CONFIG.
    gr.Markdown("""
    # Nigerian TTS Data Preprocessor V4

    Prepares Pidgin and English audio datasets for TTS training.
    Stores audio + text, WavTokenizer encoding happens during training on GPU.

    **Datasets:**
    - Pidgin: `asr-nigerian-pidgin/nigerian-pidgin-1.0`
    - English: `benjaminogbonna/nigerian_common_voice_dataset`
    - English: `benjaminogbonna/nigerian_accented_english_dataset`

    **Output:** `UbuntuFarms/nigerian-tts-preprocessed-v4`
    """)

    with gr.Row():
        dataset_choice = gr.Dropdown(
            choices=list(DATASETS_CONFIG.keys()) + ["all"],
            value="pidgin",
            label="Dataset to Process",
        )
        max_samples = gr.Slider(100, 50000, value=5000, step=100, label="Max Samples")

    process_btn = gr.Button("Start Processing", variant="primary")

    with gr.Row():
        status = gr.Textbox(label="Status", lines=3)

    log_output = gr.Textbox(label="Processing Log", lines=20)

    def run_processing(dataset_key, max_samples, progress=gr.Progress()):
        """Dispatch the button click to a single dataset or to all of them."""
        if dataset_key == "all":
            return process_all(int(max_samples), progress)
        return process_dataset(dataset_key, int(max_samples), progress)

    process_btn.click(
        run_processing,
        inputs=[dataset_choice, max_samples],
        outputs=[status, log_output],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)