| |
| """ |
| Nigerian TTS Data Preprocessor V4 (Simplified) |
| =============================================== |
| Prepares Pidgin, English, Yoruba, Hausa, and Igbo datasets for TTS training. |
| Stores audio paths and text - WavTokenizer encoding happens during training. |
| |
| Outputs: UbuntuFarms/nigerian-tts-preprocessed-v4 |
| """ |
|
|
| import os |
| os.environ["TRANSFORMERS_NO_TF"] = "1" |
|
|
| import gradio as gr |
| import numpy as np |
| from datasets import load_dataset, Dataset, Audio |
| from huggingface_hub import HfApi, login |
| import time |
|
|
| |
# Hugging Face access token read from the environment; empty string when unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Hub dataset repository that receives the preprocessed output.
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v4"

# Sampling rate (Hz) declared on the output dataset's audio column.
SAMPLE_RATE = 24000

# Clips whose duration (seconds) falls outside these bounds are rejected.
MIN_DURATION, MAX_DURATION = 1.0, 20.0
|
|
| |
# Hub repository shared by every Common Voice based source below.
_COMMON_VOICE_REPO = "benjaminogbonna/nigerian_common_voice_dataset"


def _source(repo, language, text_col="sentence", config=None):
    """Build one DATASETS_CONFIG record.

    The "config" key is only present when a Hub configuration name is given,
    because consumers test for it with ``"config" in record``.
    """
    record = {"repo": repo}
    if config is not None:
        record["config"] = config
    record["audio_col"] = "audio"
    record["text_col"] = text_col
    record["language"] = language
    return record


# Source corpora, keyed by the name offered in the UI dropdown.  Each record
# names the Hub repo (and optional configuration), the audio and transcript
# columns, and the language label used as the output configuration name.
DATASETS_CONFIG = {
    # Nigerian Pidgin
    "pidgin": _source("asr-nigerian-pidgin/nigerian-pidgin-1.0", "pidgin"),
    # Nigerian English
    "english_common_voice": _source(
        _COMMON_VOICE_REPO, "english_cv", config="english"
    ),
    "english_accented": _source(
        "benjaminogbonna/nigerian_accented_english_dataset", "english_accented"
    ),
    # Yoruba
    "yoruba_parallel": _source(
        "michsethowusu/yoruba-speech-text-parallel", "yoruba_extra", text_col="text"
    ),
    "yoruba_common_voice": _source(
        _COMMON_VOICE_REPO, "yoruba_cv", config="yoruba"
    ),
    # Hausa
    "hausa_twb": _source(
        "CLEAR-Global/TWB-Voice-1.0", "hausa_twb", text_col="text", config="hau"
    ),
    "hausa_common_voice": _source(
        _COMMON_VOICE_REPO, "hausa_cv", config="hausa"
    ),
    # Igbo
    "igbo_common_voice": _source(
        _COMMON_VOICE_REPO, "igbo_cv", config="igbo"
    ),
}
|
|
# In-memory buffer of timestamped log lines shared by the processing functions.
processing_log = []


def log(msg):
    """Record *msg* with a wall-clock timestamp and echo it to stdout.

    Returns:
        The most recent 50 log lines joined with newlines.
    """
    entry = "[{}] {}".format(time.strftime("%H:%M:%S"), msg)
    processing_log.append(entry)
    print(entry)
    recent = processing_log[-50:]
    return "\n".join(recent)
|
|
def process_sample(sample, language, text_col="sentence", audio_col="audio"):
    """Validate one raw dataset row and convert it to the output record format.

    Args:
        sample: Dataset row; must provide a dict-style ``.get``.
        language: Label stored in the record's "language" field.
        text_col: Key of the transcript inside ``sample``.
        audio_col: Key of the audio entry inside ``sample``.

    Returns:
        ``(record, None)`` when the row is accepted, where ``record`` has the
        keys "audio" (``{"array", "sampling_rate"}``), "text", "language" and
        "duration"; otherwise ``(None, reason)`` with a short rejection
        message.  Unexpected exceptions are reported the same way, so a single
        bad row never aborts the surrounding run.
    """
    try:
        audio = sample.get(audio_col)
        if audio is None:
            return None, "No audio"

        # Plain dicts are checked first: they also expose __getitem__, so the
        # generic subscript branch would otherwise shadow this one and the
        # fallbacks for missing keys would never apply.
        if isinstance(audio, dict):
            audio_array = audio.get("array", [])
            sample_rate = audio.get("sampling_rate", 16000)
        elif hasattr(audio, "__getitem__"):
            # Any other subscriptable audio object exposing the same two keys.
            audio_array = audio["array"]
            sample_rate = audio["sampling_rate"]
        else:
            return None, f"Unknown audio format: {type(audio)}"

        if audio_array is None or len(audio_array) == 0:
            return None, "Empty audio"

        # Reject unusable rates up front instead of failing in the division.
        if not sample_rate or sample_rate <= 0:
            return None, "Invalid sample rate"

        # NOTE(review): len() is the sample count only for 1-D (mono) arrays;
        # confirm that the source datasets never deliver multi-channel audio.
        duration = len(audio_array) / sample_rate
        if duration < MIN_DURATION:
            return None, f"Too short: {duration:.1f}s"
        if duration > MAX_DURATION:
            return None, f"Too long: {duration:.1f}s"

        text = sample.get(text_col, "")
        if not isinstance(text, str) or len(text.strip()) < 2:
            return None, "No text"
        text = text.strip()

        record = {
            "audio": {
                "array": np.array(audio_array, dtype=np.float32),
                "sampling_rate": sample_rate,
            },
            "text": text,
            "language": language,
            "duration": duration,
        }
        return record, None

    except Exception as e:
        # Deliberate best-effort: report the failure as the rejection reason.
        return None, str(e)
|
|
def process_dataset(dataset_key, max_samples=5000, progress=gr.Progress()):
    """Stream one configured dataset, filter its rows and push the result.

    The shared ``processing_log`` buffer is reset at the start of every call.

    Args:
        dataset_key: Key into ``DATASETS_CONFIG``.
        max_samples: Maximum number of source rows to inspect (accepted and
            rejected rows both count towards this limit).
        progress: Gradio progress callback.

    Returns:
        ``(status, log_text)``: a one-line status message and the log lines
        produced by this call joined with newlines (an empty string for an
        unknown ``dataset_key``).
    """
    from collections import Counter
    from itertools import islice

    global processing_log
    processing_log = []

    if dataset_key not in DATASETS_CONFIG:
        return f"Unknown dataset: {dataset_key}", ""

    config = DATASETS_CONFIG[dataset_key]
    log(f"Processing: {dataset_key}")
    log(f"Repository: {config['repo']}")

    # Authentication is required for the push at the end.
    if HF_TOKEN:
        login(token=HF_TOKEN)
        log("Logged in to HuggingFace")
    else:
        return "Error: HF_TOKEN not set", "\n".join(processing_log)

    # Open the source in streaming mode.
    log("Loading dataset...")
    try:
        if "config" in config:
            ds = load_dataset(config["repo"], config["config"], split="train", streaming=True)
        else:
            ds = load_dataset(config["repo"], split="train", streaming=True)
        log("Dataset loaded (streaming mode)")
    except Exception as e:
        log(f"Error loading dataset: {e}")
        return f"Error: {e}", "\n".join(processing_log)

    processed = []
    errors = Counter()  # rejection reason -> number of rows

    log(f"Processing up to {max_samples} samples...")

    # islice stops at the limit without pulling one extra row from the stream.
    limit = max(int(max_samples), 0)
    for i, sample in enumerate(islice(ds, limit)):
        if i % 100 == 0:
            progress((i / limit), f"Processing {i}/{max_samples}")
            log(f"Progress: {i}/{max_samples} (processed: {len(processed)}, errors: {sum(errors.values())})")

        result, error = process_sample(
            sample,
            config["language"],
            config.get("text_col", "sentence"),
        )

        if result:
            processed.append(result)
        else:
            errors[error] += 1

    log(f"Processed: {len(processed)} samples")
    log(f"Errors: {sum(errors.values())}")
    # Only the five most frequent rejection reasons are listed.
    for error, count in errors.most_common(5):
        log(f"  - {error}: {count}")

    if not processed:
        return "No samples processed successfully", "\n".join(processing_log)

    log("Creating HuggingFace dataset...")
    output_ds = Dataset.from_list(processed)

    # Declare the audio column as an Audio feature at the target rate.
    output_ds = output_ds.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

    log(f"Pushing to {OUTPUT_DATASET}...")
    try:
        api = HfApi(token=HF_TOKEN)

        # Create the output repo when the lookup fails.  Catch Exception, not
        # a bare except, so KeyboardInterrupt/SystemExit still propagate.
        try:
            api.dataset_info(OUTPUT_DATASET)
        except Exception:
            api.create_repo(OUTPUT_DATASET, repo_type="dataset", exist_ok=True)

        # Each source is pushed as its own configuration of the output repo.
        output_ds.push_to_hub(
            OUTPUT_DATASET,
            config_name=config["language"],
            token=HF_TOKEN,
            commit_message=f"Add {config['language']} data from {config['repo']}",
        )

        log(f"Pushed to {OUTPUT_DATASET} (config: {config['language']})")

    except Exception as e:
        log(f"Push error: {e}")
        import traceback
        log(traceback.format_exc())
        return f"Push error: {e}", "\n".join(processing_log)

    return f"Success! Processed {len(processed)} {config['language']} samples", "\n".join(processing_log)
|
|
def process_all(max_per_dataset=5000, progress=gr.Progress()):
    """Run ``process_dataset`` for every entry of ``DATASETS_CONFIG`` in order.

    Args:
        max_per_dataset: Row limit forwarded to each ``process_dataset`` call.
        progress: Gradio progress callback, also forwarded to each call.

    Returns:
        ``(summary, log_text)``: one "<key>: <status>" line per dataset, and
        the logs of all datasets joined with newlines.
    """
    results = []
    logs = []
    total = len(DATASETS_CONFIG)

    for i, key in enumerate(DATASETS_CONFIG):
        progress((i / total), f"Processing {key}...")
        result, dataset_log = process_dataset(key, max_per_dataset, progress)
        results.append(f"{key}: {result}")
        # process_dataset() clears the shared log buffer on entry, so reading
        # the buffer after the loop would only show the last dataset.  Keep
        # the log text returned by each call instead.
        if dataset_log:
            logs.append(dataset_log)

    return "\n".join(results), "\n".join(logs)
|
|
| |
# Gradio front end: pick a dataset (or "all") and a row limit, then run.
with gr.Blocks(title="Nigerian TTS Preprocessor V4") as demo:
    gr.Markdown("""
    # Nigerian TTS Data Preprocessor V4

    Prepares Pidgin and English audio datasets for TTS training.
    Stores audio + text, WavTokenizer encoding happens during training on GPU.

    **Datasets:**
    - Pidgin: `asr-nigerian-pidgin/nigerian-pidgin-1.0`
    - English: `benjaminogbonna/nigerian_common_voice_dataset`
    - English: `benjaminogbonna/nigerian_accented_english_dataset`

    **Output:** `UbuntuFarms/nigerian-tts-preprocessed-v4`
    """)

    # Inputs: dataset selector and per-dataset row limit.
    with gr.Row():
        dataset_choice = gr.Dropdown(
            label="Dataset to Process",
            choices=[*DATASETS_CONFIG, "all"],
            value="pidgin",
        )
        max_samples = gr.Slider(
            minimum=100,
            maximum=50000,
            value=5000,
            step=100,
            label="Max Samples",
        )

    process_btn = gr.Button("Start Processing", variant="primary")

    # Outputs: short status plus the processing log.
    with gr.Row():
        status = gr.Textbox(label="Status", lines=3)
        log_output = gr.Textbox(label="Processing Log", lines=20)

    def run_processing(dataset_key, max_samples, progress=gr.Progress()):
        """Dispatch the button click to a single dataset or to all of them."""
        limit = int(max_samples)
        if dataset_key != "all":
            return process_dataset(dataset_key, limit, progress)
        return process_all(limit, progress)

    process_btn.click(
        fn=run_processing,
        inputs=[dataset_choice, max_samples],
        outputs=[status, log_output],
    )


if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and surface errors in the UI.
    demo.launch(show_error=True, server_name="0.0.0.0", server_port=7860)
|
|