# NOTE: The two lines below this header were captured from the HuggingFace
# Spaces page chrome ("Spaces: ... Runtime error") and are not program source.
| #!/usr/bin/env python3 | |
| """ | |
| Nigerian TTS Data Preprocessor | |
| ============================== | |
| Runs on HuggingFace FREE CPU to preprocess audio data. | |
| Downloads datasets, encodes with WavTokenizer, saves to HF Hub. | |
| """ | |
| import os | |
| os.environ["TRANSFORMERS_NO_TF"] = "1" | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| import gc | |
| from datasets import load_dataset, concatenate_datasets, Dataset | |
| from huggingface_hub import login, hf_hub_download, HfApi | |
| from tqdm import tqdm | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# ============================================================================
# CONFIG
# ============================================================================
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # replaced at run time by the token typed into the UI
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M"  # source of the text tokenizer used to build prompts
WAVTOKENIZER_REPO = "novateur/WavTokenizer-medium-speech-75token"  # audio codec repo on the HF Hub
WAVTOKENIZER_CONFIG = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
WAVTOKENIZER_CHECKPOINT = "wavtokenizer_medium_speech_320_24k_v2.ckpt"
SAMPLE_RATE = 24000  # Hz; every clip is resampled to this rate before encoding
AUDIO_VOCAB_SIZE = 4096  # codec codebook size (matches "code4096" in the config name); not referenced below — TODO confirm it is used by the trainer
MAX_AUDIO_LENGTH = 20  # seconds; clips longer than this (or shorter than 0.5 s) are skipped
EXISTING_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v2"  # prior corpus merged into the output
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v3"  # push target on the HF Hub
DEVICE = "cpu"  # Free tier = CPU only
# Data sources to add - VERIFIED WORKING datasets
DATA_SOURCES = {
    "pidgin": [
        # WORKING: 5883 samples with audio and text
        {"name": "Pidgin ASR Combined", "id": "timniel/Pidgin_ASR_Dataset_Combined", "subset": None},
        # WORKING: 65 samples with audio and text
        {"name": "Nigerian Pidgin Speech", "id": "Rexe/nigerian-pidgin-speech", "subset": None},
    ],
    "english": [
        # WORKING: 498 samples Nigerian English
        {"name": "Nigerian English TTS", "id": "Donmonc/nigerian_english_tts", "subset": None},
    ],
    # Note: yoruba, hausa, igbo already have 50k+ samples in existing dataset
    # Only add more if needed
}
# Global models — populated by load_models(); None until then
WAVTOKENIZER = None  # audio codec instance
TOKENIZER = None  # text tokenizer instance
| # ============================================================================ | |
| # MODEL LOADING | |
| # ============================================================================ | |
def load_models():
    """Populate the module-level TOKENIZER and WAVTOKENIZER globals.

    Downloads the SmolLM2 text tokenizer and the WavTokenizer
    config/checkpoint from the HuggingFace Hub; everything stays on CPU.

    Returns:
        A human-readable status string for the Gradio status box.
    """
    global WAVTOKENIZER, TOKENIZER
    print("Loading models on CPU...")

    # --- Text side -----------------------------------------------------
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Audio boundary marker plus one language tag per supported language.
    markers = ["<|audio|>", "[hausa]", "[yoruba]", "[igbo]", "[pidgin]", "[english]"]
    tok.add_special_tokens({"additional_special_tokens": markers})
    TOKENIZER = tok
    print(f"Tokenizer loaded: {len(TOKENIZER)} tokens")

    # --- Audio side ----------------------------------------------------
    cfg_path = hf_hub_download(WAVTOKENIZER_REPO, WAVTOKENIZER_CONFIG)
    ckpt_path = hf_hub_download(WAVTOKENIZER_REPO, WAVTOKENIZER_CHECKPOINT)
    from outetts.wav_tokenizer.decoder import WavTokenizer

    codec = WavTokenizer.from_pretrained0802(cfg_path, ckpt_path).to(DEVICE)
    codec.eval()
    WAVTOKENIZER = codec
    print("WavTokenizer loaded on CPU")
    return "Models loaded successfully!"
def encode_audio(audio_array, sample_rate):
    """Encode a raw waveform into a flat list of WavTokenizer code ids.

    Args:
        audio_array: numpy array or torch tensor; 1-D mono or 2-D
            (channels, samples) — stereo is averaged down to mono.
        sample_rate: source sampling rate in Hz; resampled to SAMPLE_RATE.

    Returns:
        List of integer codec codes.
    """
    import torchaudio.functional as F

    # Accept either numpy or torch input; work in float32 throughout.
    wav = (
        torch.from_numpy(audio_array)
        if isinstance(audio_array, np.ndarray)
        else audio_array
    ).float()

    # Collapse (channels, samples) to mono by averaging channels.
    if wav.dim() == 2:
        wav = wav.mean(dim=0)

    # Bring everything to the codec's 24 kHz rate.
    if sample_rate != SAMPLE_RATE:
        wav = F.resample(wav, sample_rate, SAMPLE_RATE)

    # Peak-normalize; the epsilon guards against an all-zero clip.
    wav = wav / (wav.abs().max() + 1e-8)

    # Codec expects a (batch, channel, time) tensor.
    batch = wav.unsqueeze(0).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        _, codes = WAVTOKENIZER.encode_infer(batch, bandwidth_id=torch.tensor([0], device=DEVICE))
    # NOTE(review): assumes `codes` squeezes to a 1-D sequence (single
    # quantizer, nq=1) — confirm if the codec config ever changes.
    return codes.squeeze().cpu().tolist()
| # ============================================================================ | |
| # PREPROCESSING | |
| # ============================================================================ | |
def preprocess_dataset(source_info, language, max_samples, progress=gr.Progress()):
    """Download one HF dataset and convert each clip into training token ids.

    Args:
        source_info: dict with "name", "id" and optional "subset" keys
            (see DATA_SOURCES).
        language: language tag inserted into the prompt, e.g. "pidgin".
        max_samples: cap on samples taken from this source. May arrive as a
            float from the Gradio slider — coerced to int below.
        progress: Gradio progress callback.

    Returns:
        (samples, message) where samples is a list of
        {"input_ids", "language"} dicts, or (None, error message) on failure.
    """
    global TOKENIZER
    if WAVTOKENIZER is None:
        return None, "Please load models first!"
    # Audio codes are offset past the text vocabulary into their own id range.
    text_vocab_size = len(TOKENIZER)
    processed = []
    try:
        progress(0, desc=f"Loading {source_info['name']}...")
        # Load dataset, with an optional config/subset name.
        if source_info.get("subset"):
            ds = load_dataset(source_info["id"], source_info["subset"], split="train", trust_remote_code=True)
        else:
            ds = load_dataset(source_info["id"], split="train", trust_remote_code=True)
        # FIX: Gradio sliders can deliver floats; range() below needs an int.
        total = min(len(ds), int(max_samples))
        if total < len(ds):
            ds = ds.shuffle(seed=42).select(range(total))
        # Locate the audio/text columns by common naming heuristics.
        audio_col = next((c for c in ds.column_names if "audio" in c.lower()), None)
        text_col = next((c for c in ds.column_names if c in ["text", "sentence", "transcription", "transcript"]), None)
        if not audio_col or not text_col:
            return None, f"Could not find audio/text columns in {ds.column_names}"
        # Process every sample; individual failures are skipped (best effort).
        for i, item in enumerate(ds):
            if i % 10 == 0:
                progress(i / total, desc=f"Processing {i}/{total}...")
            try:
                audio_data = item[audio_col]
                if isinstance(audio_data, dict):
                    audio_array = audio_data["array"]
                    sr = audio_data["sampling_rate"]
                else:
                    # Unexpected audio representation — skip.
                    continue
                text = item[text_col]
                if not text or len(text.strip()) < 2:
                    continue
                # Skip clips that are too short or too long for training.
                duration = len(audio_array) / sr
                if duration < 0.5 or duration > MAX_AUDIO_LENGTH:
                    continue
                # Encode the waveform into codec codes.
                audio_codes = encode_audio(audio_array, sr)
                if len(audio_codes) < 10:
                    continue
                # Build input_ids: "[lang] text <|audio|>" then offset audio codes.
                prompt = f"[{language}] {text.strip()} <|audio|>"
                text_ids = TOKENIZER.encode(prompt, add_special_tokens=False)
                audio_ids = [code + text_vocab_size for code in audio_codes]
                processed.append({
                    "input_ids": text_ids + audio_ids,
                    "language": language,
                })
            except Exception:
                # Best effort: one bad sample must not abort the whole source.
                continue
            # Periodic cleanup keeps the free-tier CPU box within memory.
            if i % 100 == 0:
                gc.collect()
        progress(1.0, desc="Done!")
        return processed, f"Processed {len(processed)} samples for {language}"
    except Exception as e:
        return None, f"Error: {str(e)}"
def run_full_preprocessing(languages, max_per_source, hf_token, progress=gr.Progress()):
    """Preprocess the selected languages and push the result to the HF Hub.

    Args:
        languages: comma-separated language keys, e.g. "pidgin,english".
        max_per_source: per-source sample cap (forwarded to preprocess_dataset).
        hf_token: HuggingFace token with write access.
        progress: Gradio progress callback.

    Returns:
        A multi-line status log string for the Gradio status box.
    """
    global HF_TOKEN
    if not hf_token:
        return "Please provide HuggingFace token!"
    HF_TOKEN = hf_token
    login(token=HF_TOKEN)
    # Lazily load models in case "Load Models" was skipped.
    if WAVTOKENIZER is None:
        load_models()
    all_samples = []
    status_log = []
    # Process each requested language's sources.
    selected_langs = [l.strip().lower() for l in languages.split(",")]
    for lang in selected_langs:
        if lang not in DATA_SOURCES:
            status_log.append(f"Unknown language: {lang}")
            continue
        for source in DATA_SOURCES[lang]:
            progress(0, desc=f"Processing {source['name']}...")
            samples, msg = preprocess_dataset(source, lang, max_per_source, progress)
            status_log.append(msg)
            if samples:
                all_samples.extend(samples)
            gc.collect()
    if not all_samples:
        return "\n".join(status_log) + "\n\nNo samples processed!"
    status_log.append(f"\nTotal new samples: {len(all_samples)}")
    # Combine with the existing corpus.
    progress(0.9, desc="Combining with existing dataset...")
    new_ds = Dataset.from_list(all_samples)
    try:
        existing = load_dataset(EXISTING_DATASET, split="train")
        existing_slim = existing.select_columns(["input_ids", "language"])
        combined = concatenate_datasets([existing_slim, new_ds])
        status_log.append(f"Combined: {len(combined)} total samples")
    except Exception as e:
        # FIX: previously a failed merge discarded ALL freshly processed
        # samples (hours of CPU work). Fall back to pushing just the new data.
        status_log.append(f"Error loading existing dataset ({e}); pushing new samples only")
        combined = new_ds
    # Report the per-language distribution of the final dataset.
    lang_counts = {}
    for item in combined:
        l = item.get("language", "unknown")
        lang_counts[l] = lang_counts.get(l, 0) + 1
    status_log.append("\nLanguage distribution:")
    for l, c in sorted(lang_counts.items(), key=lambda x: -x[1]):
        status_log.append(f"  {l}: {c:,}")
    # Push to the Hub in its own try so a counting/merge issue can't mask it.
    try:
        progress(0.95, desc="Pushing to HuggingFace Hub...")
        combined.push_to_hub(OUTPUT_DATASET, token=HF_TOKEN)
        status_log.append(f"\nDataset saved: https://huggingface.co/datasets/{OUTPUT_DATASET}")
    except Exception as e:
        status_log.append(f"Error combining/pushing: {str(e)}")
    progress(1.0, desc="Complete!")
    return "\n".join(status_log)
# ============================================================================
# GRADIO UI
# ============================================================================
# Module runs top-to-bottom when the Space starts; print a startup banner.
print("=" * 60)
print("NIGERIAN TTS DATA PREPROCESSOR")
print("Runs on FREE CPU - saves GPU costs!")
print("=" * 60)
# Two-column layout: inputs and action buttons left, status log right.
with gr.Blocks(title="Nigerian TTS Preprocessor") as demo:
    gr.Markdown("# Nigerian TTS Data Preprocessor")
    gr.Markdown("Preprocess audio datasets on **FREE CPU** to save RunPod GPU costs.")
    with gr.Row():
        with gr.Column():
            # Token needs write access so results can be pushed to the Hub.
            hf_token = gr.Textbox(
                label="HuggingFace Token (write access)",
                type="password",
                placeholder="hf_..."
            )
            # Comma-separated keys of DATA_SOURCES (e.g. "pidgin,english").
            languages = gr.Textbox(
                label="Languages to process (comma-separated)",
                value="pidgin",
                placeholder="pidgin,english"
            )
            max_samples = gr.Slider(
                minimum=100,
                maximum=20000,
                value=5000,
                step=100,
                label="Max samples per source"
            )
            load_btn = gr.Button("1. Load Models", variant="secondary")
            run_btn = gr.Button("2. Run Preprocessing", variant="primary")
        with gr.Column():
            # Shared status box; both buttons write their result here.
            output = gr.Textbox(
                label="Status",
                lines=20,
                max_lines=30
            )
    gr.Markdown("""
## Instructions
1. Enter your HuggingFace token (needs write access)
2. Click "Load Models" to load WavTokenizer
3. Set languages to "pidgin" (or "pidgin,english" for both)
4. Click "Run Preprocessing" - this will take a while on CPU!
5. Once done, train on RunPod using the new dataset
## Available Datasets (VERIFIED WORKING)
- **Pidgin**: `timniel/Pidgin_ASR_Dataset_Combined` (5,883 samples)
- **Pidgin**: `Rexe/nigerian-pidgin-speech` (65 samples)
- **English**: `Donmonc/nigerian_english_tts` (498 Nigerian English samples)
## Current Status
- Existing dataset: `UbuntuFarms/nigerian-tts-preprocessed-v2` (148k samples)
- yoruba: 53,332 samples
- igbo: 47,526 samples
- hausa: 47,288 samples
- **pidgin: 0 samples** (CRITICAL - this is why Pidgin produces white noise!)
- Output: `UbuntuFarms/nigerian-tts-preprocessed-v3`
""")
    # Wire buttons to handlers: load_models takes no inputs; the run handler
    # maps (languages, max_samples, hf_token) -> (languages, max_per_source, hf_token).
    load_btn.click(fn=load_models, outputs=output)
    run_btn.click(
        fn=run_full_preprocessing,
        inputs=[languages, max_samples, hf_token],
        outputs=output
    )
if __name__ == "__main__":
    demo.launch()