Spaces:

UbuntuFarms
/

nigerian-tts-preprocessor

Runtime error

File size: 12,065 Bytes

#!/usr/bin/env python3
"""
Nigerian TTS Data Preprocessor
==============================
Runs on HuggingFace FREE CPU to preprocess audio data.
Downloads datasets, encodes with WavTokenizer, saves to HF Hub.
"""

import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
import gradio as gr
import gc
from datasets import load_dataset, concatenate_datasets, Dataset
from huggingface_hub import login, hf_hub_download, HfApi
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIG
# ============================================================================

# Hub token read from the environment; empty string when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Model whose tokenizer is loaded for the text side of each sample.
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M"

# WavTokenizer artifacts fetched from the Hub: repo id, YAML config, checkpoint.
WAVTOKENIZER_REPO = "novateur/WavTokenizer-medium-speech-75token"
WAVTOKENIZER_CONFIG = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
WAVTOKENIZER_CHECKPOINT = "wavtokenizer_medium_speech_320_24k_v2.ckpt"

SAMPLE_RATE = 24000  # Hz; audio is resampled to this rate before encoding
AUDIO_VOCAB_SIZE = 4096  # presumably the audio codebook size ("code4096" in the config name)
MAX_AUDIO_LENGTH = 20  # seconds; clips longer than this are skipped

# Input dataset that already holds preprocessed samples, and where the merged
# result is pushed.
EXISTING_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v2"
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v3"

DEVICE = "cpu"  # Free tier = CPU only

# Extra data sources per language - VERIFIED WORKING datasets.
# yoruba, hausa and igbo already have 50k+ samples in the existing dataset,
# so they are only added here if more data is needed.
DATA_SOURCES = {
    "pidgin": [
        # WORKING: 5883 samples with audio and text
        dict(name="Pidgin ASR Combined", id="timniel/Pidgin_ASR_Dataset_Combined", subset=None),
        # WORKING: 65 samples with audio and text
        dict(name="Nigerian Pidgin Speech", id="Rexe/nigerian-pidgin-speech", subset=None),
    ],
    "english": [
        # WORKING: 498 samples Nigerian English
        dict(name="Nigerian English TTS", id="Donmonc/nigerian_english_tts", subset=None),
    ],
}

# Lazily populated by load_models(); None means "not loaded yet".
WAVTOKENIZER = None
TOKENIZER = None

# ============================================================================
# MODEL LOADING
# ============================================================================

def load_models():
    """Populate the module-level ``TOKENIZER`` and ``WAVTOKENIZER`` globals.

    The text tokenizer comes from ``BASE_MODEL`` and is extended with the
    audio marker and the per-language tags; the WavTokenizer config and
    checkpoint are downloaded from ``WAVTOKENIZER_REPO`` and the model is
    placed on ``DEVICE`` in eval mode.

    Returns:
        A status string suitable for display in the UI.
    """
    global WAVTOKENIZER, TOKENIZER

    print("Loading models on CPU...")

    # --- text tokenizer -----------------------------------------------------
    from transformers import AutoTokenizer

    TOKENIZER = AutoTokenizer.from_pretrained(BASE_MODEL)
    if TOKENIZER.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        TOKENIZER.pad_token = TOKENIZER.eos_token

    language_tags = [f"[{name}]" for name in ("hausa", "yoruba", "igbo", "pidgin", "english")]
    TOKENIZER.add_special_tokens(
        {"additional_special_tokens": ["<|audio|>"] + language_tags}
    )
    print(f"Tokenizer loaded: {len(TOKENIZER)} tokens")

    # --- audio codec --------------------------------------------------------
    config_path, checkpoint_path = (
        hf_hub_download(WAVTOKENIZER_REPO, filename)
        for filename in (WAVTOKENIZER_CONFIG, WAVTOKENIZER_CHECKPOINT)
    )

    from outetts.wav_tokenizer.decoder import WavTokenizer

    WAVTOKENIZER = WavTokenizer.from_pretrained0802(config_path, checkpoint_path)
    WAVTOKENIZER = WAVTOKENIZER.to(DEVICE)
    WAVTOKENIZER.eval()
    print("WavTokenizer loaded on CPU")

    return "Models loaded successfully!"

def encode_audio(audio_array, sample_rate):
    """Encode a waveform into a flat list of WavTokenizer codes.

    The waveform is converted to a float tensor, reduced to mono, resampled
    to ``SAMPLE_RATE`` when necessary, peak-normalized and passed to the
    module-level ``WAVTOKENIZER``.

    Args:
        audio_array: Waveform samples as a NumPy array, a torch tensor, or any
            sequence accepted by ``torch.as_tensor``. A 2-D input is averaged
            over its first axis (assumes a (channels, samples) layout —
            confirm against the data source).
        sample_rate: Sampling rate of ``audio_array`` in Hz.

    Returns:
        A flat ``list`` of integer codes. The result is always a list, even
        when the encoder yields a single code.

    Raises:
        ValueError: If the waveform contains no samples.
    """
    audio_tensor = torch.as_tensor(audio_array).float()

    # Ensure mono
    if audio_tensor.dim() == 2:
        audio_tensor = audio_tensor.mean(dim=0)

    # Fail with a clear message instead of an opaque reduction error below.
    if audio_tensor.numel() == 0:
        raise ValueError("Cannot encode empty audio")

    # Resample to the codec's rate
    if sample_rate != SAMPLE_RATE:
        # Imported lazily so clips already at the target rate skip torchaudio.
        from torchaudio.functional import resample

        audio_tensor = resample(audio_tensor, sample_rate, SAMPLE_RATE)

    # Peak-normalize; the epsilon avoids division by zero on silent clips.
    peak = torch.max(torch.abs(audio_tensor))
    audio_tensor = audio_tensor / (peak + 1e-8)

    # NOTE(review): WavTokenizer's reference usage appears to pass a (1, T)
    # waveform; confirm that encode_infer accepts the (1, 1, T) shape built here.
    audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        _, codes = WAVTOKENIZER.encode_infer(
            audio_tensor, bandwidth_id=torch.tensor([0], device=DEVICE)
        )

    # reshape(-1) rather than squeeze(): squeeze() collapses a one-element
    # result to a 0-d tensor, whose tolist() is a bare int, not a list.
    return codes.reshape(-1).cpu().tolist()

# ============================================================================
# PREPROCESSING
# ============================================================================

def _build_sample(item, audio_col, text_col, language, text_vocab_size):
    """Convert one dataset row into a training sample, or ``None`` to skip it.

    A row is skipped when its audio is not a decoded dict, its text is not a
    string of at least two non-blank characters, its duration lies outside
    [0.5 s, ``MAX_AUDIO_LENGTH``], or it encodes to fewer than 10 audio codes.
    Exceptions raised while encoding propagate to the caller.
    """
    audio_data = item[audio_col]
    if not isinstance(audio_data, dict):
        return None
    audio_array = audio_data["array"]
    sr = audio_data["sampling_rate"]

    text = item[text_col]
    if not isinstance(text, str) or len(text.strip()) < 2:
        return None

    # Use the last axis so a 2-D array is measured by its sample count,
    # matching encode_audio, which averages 2-D input over the first axis.
    duration = np.shape(audio_array)[-1] / sr
    if duration < 0.5 or duration > MAX_AUDIO_LENGTH:
        return None

    audio_codes = encode_audio(audio_array, sr)
    if len(audio_codes) < 10:
        return None

    prompt = f"[{language}] {text.strip()} <|audio|>"
    text_ids = TOKENIZER.encode(prompt, add_special_tokens=False)
    # Audio codes are shifted past the text vocabulary so the ids do not collide.
    audio_ids = [code + text_vocab_size for code in audio_codes]
    return {"input_ids": text_ids + audio_ids, "language": language}


def preprocess_dataset(source_info, language, max_samples, progress=gr.Progress()):
    """Download one dataset and convert its rows into token sequences.

    Args:
        source_info: Dict with ``"name"``, ``"id"`` and optional ``"subset"``
            describing the Hub dataset to load (its ``train`` split is used).
        language: Language tag written into each prompt and sample.
        max_samples: Upper bound on the number of rows to process; when the
            dataset is larger, a seeded random subset is taken. Coerced with
            ``int()`` because UI sliders may deliver floats.
        progress: Gradio progress callback.

    Returns:
        ``(samples, message)``. ``samples`` is a list of
        ``{"input_ids", "language"}`` dicts, or ``None`` when the models are
        not loaded, the columns cannot be found, or loading fails.
        ``message`` is a human-readable status that also reports how many
        rows were skipped or failed, so systematic errors are visible.
    """
    if WAVTOKENIZER is None:
        return None, "Please load models first!"

    text_vocab_size = len(TOKENIZER)
    processed = []
    skipped = 0
    failed = 0
    first_error = None

    try:
        progress(0, desc=f"Loading {source_info['name']}...")

        # NOTE(review): trust_remote_code=True runs code shipped with the
        # dataset repo; acceptable only because the ids are hard-coded.
        load_kwargs = {"split": "train", "trust_remote_code": True}
        subset = source_info.get("subset")
        if subset:
            ds = load_dataset(source_info["id"], subset, **load_kwargs)
        else:
            ds = load_dataset(source_info["id"], **load_kwargs)

        total = max(0, min(len(ds), int(max_samples)))
        if total < len(ds):
            ds = ds.shuffle(seed=42).select(range(total))

        # Find columns
        audio_col = next((c for c in ds.column_names if "audio" in c.lower()), None)
        text_col = next(
            (c for c in ds.column_names if c in ["text", "sentence", "transcription", "transcript"]),
            None,
        )

        if not audio_col or not text_col:
            return None, f"Could not find audio/text columns in {ds.column_names}"

        for i, item in enumerate(ds):
            if i % 10 == 0:
                progress(i / total, desc=f"Processing {i}/{total}...")

            try:
                sample = _build_sample(item, audio_col, text_col, language, text_vocab_size)
            except Exception as e:
                # Best effort: one bad row must not abort the run, but the
                # failure is counted and the first one is surfaced.
                failed += 1
                if first_error is None:
                    first_error = f"{type(e).__name__}: {e}"
                    print(f"[{source_info['name']}] sample {i} failed: {first_error}")
            else:
                if sample is None:
                    skipped += 1
                else:
                    processed.append(sample)

            # Memory cleanup every 100 samples (runs even for skipped rows)
            if i % 100 == 0:
                gc.collect()

        progress(1.0, desc="Done!")

        summary = f"Processed {len(processed)} samples for {language}"
        if skipped or failed:
            summary += f" (skipped {skipped}, failed {failed})"
        if first_error:
            summary += f"\n  First error: {first_error}"
        return processed, summary

    except Exception as e:
        return None, f"Error: {e}"

def run_full_preprocessing(languages, max_per_source, hf_token, progress=gr.Progress()):
    """Preprocess the selected languages, merge with the existing set, push to the Hub.

    Args:
        languages: Comma-separated language names; each must be a key of
            ``DATA_SOURCES``. Blank entries and duplicates are ignored.
        max_per_source: Maximum number of rows taken from each data source.
        hf_token: HuggingFace token used for login and for the final push.
        progress: Gradio progress callback.

    Returns:
        A multi-line status log. Failures are reported in the returned text
        rather than raised.
    """
    global HF_TOKEN
    from collections import Counter

    hf_token = (hf_token or "").strip()
    if not hf_token:
        return "Please provide HuggingFace token!"

    HF_TOKEN = hf_token
    try:
        login(token=HF_TOKEN)
    except Exception as e:
        return f"Login failed: {e}"

    # Load models if needed
    if WAVTOKENIZER is None:
        try:
            load_models()
        except Exception as e:
            return f"Model loading failed: {e}"

    max_per_source = int(max_per_source)
    all_samples = []
    status_log = []

    # dict.fromkeys de-duplicates while keeping the order the user typed, so
    # "pidgin,pidgin" does not process (and store) the same data twice.
    selected_langs = list(dict.fromkeys(
        name.strip().lower() for name in (languages or "").split(",") if name.strip()
    ))
    if not selected_langs:
        return "Please provide at least one language!"

    for lang in selected_langs:
        if lang not in DATA_SOURCES:
            status_log.append(f"Unknown language: {lang}")
            continue

        for source in DATA_SOURCES[lang]:
            progress(0, desc=f"Processing {source['name']}...")
            samples, msg = preprocess_dataset(source, lang, max_per_source, progress)
            status_log.append(msg)

            if samples:
                all_samples.extend(samples)

            gc.collect()

    if not all_samples:
        return "\n".join(status_log) + "\n\nNo samples processed!"

    status_log.append(f"\nTotal new samples: {len(all_samples)}")

    # Load existing and combine
    progress(0.9, desc="Combining with existing dataset...")
    try:
        existing = load_dataset(EXISTING_DATASET, split="train")
        existing_slim = existing.select_columns(["input_ids", "language"])

        new_ds = Dataset.from_list(all_samples)
        combined = concatenate_datasets([existing_slim, new_ds])

        status_log.append(f"Combined: {len(combined)} total samples")

        # Count over the "language" column only; iterating whole rows would
        # also materialize every input_ids list just to read one string.
        lang_counts = Counter(combined["language"])

        status_log.append("\nLanguage distribution:")
        for lang_name, count in lang_counts.most_common():
            status_log.append(f"  {lang_name}: {count:,}")

        # Push to Hub
        progress(0.95, desc="Pushing to HuggingFace Hub...")
        combined.push_to_hub(OUTPUT_DATASET, token=HF_TOKEN)
        status_log.append(f"\nDataset saved: https://huggingface.co/datasets/{OUTPUT_DATASET}")

    except Exception as e:
        status_log.append(f"Error combining/pushing: {e}")

    progress(1.0, desc="Complete!")
    return "\n".join(status_log)

# ============================================================================
# GRADIO UI
# ============================================================================

# Startup banner; printed whenever this module is executed or imported.
print("=" * 60)
print("NIGERIAN TTS DATA PREPROCESSOR")
print("Runs on FREE CPU - saves GPU costs!")
print("=" * 60)

# Build the UI: inputs and action buttons in the first column, a status box
# in the second, followed by static instructions.
with gr.Blocks(title="Nigerian TTS Preprocessor") as demo:
    gr.Markdown("# Nigerian TTS Data Preprocessor")
    gr.Markdown("Preprocess audio datasets on **FREE CPU** to save RunPod GPU costs.")

    with gr.Row():
        with gr.Column():
            # Token is entered per session and masked in the browser.
            hf_token = gr.Textbox(
                label="HuggingFace Token (write access)",
                type="password",
                placeholder="hf_..."
            )
            # Parsed by run_full_preprocessing as a comma-separated list.
            languages = gr.Textbox(
                label="Languages to process (comma-separated)",
                value="pidgin",
                placeholder="pidgin,english"
            )
            # Passed to run_full_preprocessing as max_per_source.
            max_samples = gr.Slider(
                minimum=100,
                maximum=20000,
                value=5000,
                step=100,
                label="Max samples per source"
            )

            load_btn = gr.Button("1. Load Models", variant="secondary")
            run_btn = gr.Button("2. Run Preprocessing", variant="primary")

        with gr.Column():
            # Receives the status string returned by either button handler.
            output = gr.Textbox(
                label="Status",
                lines=20,
                max_lines=30
            )

    # Static help text rendered below the controls.
    gr.Markdown("""
    ## Instructions
    1. Enter your HuggingFace token (needs write access)
    2. Click "Load Models" to load WavTokenizer
    3. Set languages to "pidgin" (or "pidgin,english" for both)
    4. Click "Run Preprocessing" - this will take a while on CPU!
    5. Once done, train on RunPod using the new dataset

    ## Available Datasets (VERIFIED WORKING)
    - **Pidgin**: `timniel/Pidgin_ASR_Dataset_Combined` (5,883 samples)
    - **Pidgin**: `Rexe/nigerian-pidgin-speech` (65 samples)
    - **English**: `Donmonc/nigerian_english_tts` (498 Nigerian English samples)

    ## Current Status
    - Existing dataset: `UbuntuFarms/nigerian-tts-preprocessed-v2` (148k samples)
    - yoruba: 53,332 samples
    - igbo: 47,526 samples
    - hausa: 47,288 samples
    - **pidgin: 0 samples** (CRITICAL - this is why Pidgin produces white noise!)
    - Output: `UbuntuFarms/nigerian-tts-preprocessed-v3`
    """)

    # Wire the buttons: both handlers return a string shown in the status box.
    load_btn.click(fn=load_models, outputs=output)
    # Inputs are passed positionally: (languages, max_per_source, hf_token).
    run_btn.click(
        fn=run_full_preprocessing,
        inputs=[languages, max_samples, hf_token],
        outputs=output
    )

# Start the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()