Imakandi-Labs's picture
Upload folder using huggingface_hub
53f67e7 verified
#!/usr/bin/env python3
"""
Nigerian TTS Data Preprocessor
==============================
Runs on HuggingFace FREE CPU to preprocess audio data.
Downloads datasets, encodes with WavTokenizer, saves to HF Hub.
"""
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import numpy as np
import gradio as gr
import gc
from datasets import load_dataset, concatenate_datasets, Dataset
from huggingface_hub import login, hf_hub_download, HfApi
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# ============================================================================
# CONFIG
# ============================================================================
# Hugging Face token read from the environment; empty string when unset.
# run_full_preprocessing() overwrites it with the token entered in the UI.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Checkpoint whose text tokenizer load_models() loads and extends.
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M"
# Hub repo plus the config/checkpoint filenames that load_models() downloads.
WAVTOKENIZER_REPO = "novateur/WavTokenizer-medium-speech-75token"
WAVTOKENIZER_CONFIG = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
WAVTOKENIZER_CHECKPOINT = "wavtokenizer_medium_speech_320_24k_v2.ckpt"
# Rate (Hz) that encode_audio() resamples every clip to before encoding.
SAMPLE_RATE = 24000
# NOTE(review): not referenced anywhere in the visible code; presumably the
# WavTokenizer codebook size ("code4096" in the config filename) - confirm.
AUDIO_VOCAB_SIZE = 4096
# preprocess_dataset() skips clips longer than this.
MAX_AUDIO_LENGTH = 20 # seconds
# Dataset that run_full_preprocessing() loads and appends the new samples to.
EXISTING_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v2"
# Dataset repo the combined result is pushed to.
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v3"
# Device the WavTokenizer and the audio tensors are moved to.
DEVICE = "cpu" # Free tier = CPU only
# Data sources to add - VERIFIED WORKING datasets
# Maps a language name to the Hub datasets preprocessed for it. The key is
# matched against the comma-separated language list from the UI and is written
# into every sample's prompt as "[<language>]". Each entry holds:
#   name   - label used in progress and status messages
#   id     - repo id passed to load_dataset()
#   subset - optional config name, passed as load_dataset()'s second argument
#            when it is truthy
DATA_SOURCES = {
    "pidgin": [
        # WORKING: 5883 samples with audio and text
        {"name": "Pidgin ASR Combined", "id": "timniel/Pidgin_ASR_Dataset_Combined", "subset": None},
        # WORKING: 65 samples with audio and text
        {"name": "Nigerian Pidgin Speech", "id": "Rexe/nigerian-pidgin-speech", "subset": None},
    ],
    "english": [
        # WORKING: 498 samples Nigerian English
        {"name": "Nigerian English TTS", "id": "Donmonc/nigerian_english_tts", "subset": None},
    ],
    # Note: yoruba, hausa, igbo already have 50k+ samples in existing dataset
    # Only add more if needed
}
# Global models
# Both start as None and are assigned by load_models(). preprocess_dataset()
# refuses to run while WAVTOKENIZER is still None.
WAVTOKENIZER = None
TOKENIZER = None
# ============================================================================
# MODEL LOADING
# ============================================================================
def load_models():
    """Populate the module-level ``TOKENIZER`` and ``WAVTOKENIZER``.

    The text tokenizer of ``BASE_MODEL`` is loaded first and extended with
    the audio marker and the per-language tags. The WavTokenizer config and
    checkpoint are then downloaded from ``WAVTOKENIZER_REPO`` and the model
    is moved to ``DEVICE`` and switched to eval mode.

    Returns:
        A short status string for the UI.
    """
    global WAVTOKENIZER, TOKENIZER

    print("Loading models on CPU...")

    # --- Text tokenizer ---------------------------------------------------
    from transformers import AutoTokenizer

    TOKENIZER = AutoTokenizer.from_pretrained(BASE_MODEL)
    if TOKENIZER.pad_token is None:
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        TOKENIZER.pad_token = TOKENIZER.eos_token
    extra_tokens = {
        "additional_special_tokens": [
            "<|audio|>",
            "[hausa]",
            "[yoruba]",
            "[igbo]",
            "[pidgin]",
            "[english]",
        ]
    }
    TOKENIZER.add_special_tokens(extra_tokens)
    print(f"Tokenizer loaded: {len(TOKENIZER)} tokens")

    # --- WavTokenizer -----------------------------------------------------
    # Config is fetched before the checkpoint, matching the argument order of
    # from_pretrained0802 below.
    wav_files = [
        hf_hub_download(WAVTOKENIZER_REPO, filename)
        for filename in (WAVTOKENIZER_CONFIG, WAVTOKENIZER_CHECKPOINT)
    ]

    from outetts.wav_tokenizer.decoder import WavTokenizer

    WAVTOKENIZER = WavTokenizer.from_pretrained0802(*wav_files)
    WAVTOKENIZER = WAVTOKENIZER.to(DEVICE)
    WAVTOKENIZER.eval()
    print("WavTokenizer loaded on CPU")

    return "Models loaded successfully!"
def encode_audio(audio_array, sample_rate):
    """Encode a waveform into a list of WavTokenizer code indices.

    Args:
        audio_array: Waveform as a NumPy array, a torch tensor, or any
            sequence accepted by ``torch.as_tensor``. A 2-D input is reduced
            to mono by averaging over dim 0 (assumes channels-first layout -
            TODO confirm against the datasets in use).
        sample_rate: Sampling rate of ``audio_array`` in Hz.

    Returns:
        The codes produced by ``WAVTOKENIZER.encode_infer`` with singleton
        dimensions squeezed away, as a Python list. A single-frame result is
        returned as a one-element list rather than a bare int, so callers
        can always take ``len()`` of it.

    Raises:
        ValueError: If the audio contains no samples.
    """
    # as_tensor handles ndarrays, tensors and plain sequences uniformly.
    audio_tensor = torch.as_tensor(audio_array).float()

    # Ensure mono
    if audio_tensor.dim() == 2:
        audio_tensor = audio_tensor.mean(dim=0)

    # Fail with a clear message instead of an opaque torch.max() error below.
    if audio_tensor.numel() == 0:
        raise ValueError("Cannot encode empty audio")

    # Resample to 24kHz
    if sample_rate != SAMPLE_RATE:
        # Imported lazily: torchaudio is only needed when rates differ.
        import torchaudio.functional as F

        audio_tensor = F.resample(audio_tensor, sample_rate, SAMPLE_RATE)

    # Peak-normalize; the epsilon avoids division by zero on silent clips.
    audio_tensor = audio_tensor / (torch.max(torch.abs(audio_tensor)) + 1e-8)

    # Encode: add two leading singleton dims before handing the clip to the model.
    audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        _, codes = WAVTOKENIZER.encode_infer(
            audio_tensor, bandwidth_id=torch.tensor([0], device=DEVICE)
        )

    codes = codes.squeeze()
    if codes.dim() == 0:
        # squeeze() collapses a single frame to a 0-d tensor, whose tolist()
        # would be a plain int; keep the return type a list.
        codes = codes.unsqueeze(0)
    return codes.cpu().tolist()
# ============================================================================
# PREPROCESSING
# ============================================================================
def _build_sample(item, audio_col, text_col, language, text_vocab_size):
    """Convert one dataset row into a training sample, or ``None`` if it is filtered out."""
    audio_data = item[audio_col]
    if not isinstance(audio_data, dict):
        return None
    audio_array = audio_data["array"]
    sr = audio_data["sampling_rate"]

    text = item[text_col]
    if not isinstance(text, str) or len(text.strip()) < 2:
        return None

    # Check duration. The last axis is the sample axis for both 1-D audio and
    # the 2-D layout encode_audio() averages over dim 0; len() would return
    # the channel count for the latter.
    duration = np.shape(audio_array)[-1] / sr
    if duration < 0.5 or duration > MAX_AUDIO_LENGTH:
        return None

    # Encode audio
    audio_codes = encode_audio(audio_array, sr)
    if len(audio_codes) < 10:
        return None

    # Build input_ids: prompt tokens followed by audio codes shifted past the
    # text vocabulary so the two id ranges do not overlap.
    prompt = f"[{language}] {text.strip()} <|audio|>"
    text_ids = TOKENIZER.encode(prompt, add_special_tokens=False)
    audio_ids = [code + text_vocab_size for code in audio_codes]
    return {
        "input_ids": text_ids + audio_ids,
        "language": language,
    }


def preprocess_dataset(source_info, language, max_samples, progress=gr.Progress()):
    """Download one dataset and turn its rows into tokenized training samples.

    Args:
        source_info: Entry from ``DATA_SOURCES`` with ``name``, ``id`` and an
            optional ``subset``.
        language: Language tag written into each prompt and stored with each
            sample.
        max_samples: Upper bound on rows taken from the dataset. When the
            dataset is larger, a seeded shuffle picks the subset. Coerced to
            ``int`` so a float coming from the UI cannot break ``range()``.
        progress: Gradio progress callback.

    Returns:
        ``(samples, message)``. ``samples`` is a list of dicts with
        ``input_ids`` and ``language``, or ``None`` when the models are not
        loaded, the columns cannot be found, or loading fails. ``message`` is
        a human-readable status that also reports how many rows were filtered
        out or failed, and the first error seen.
    """
    if WAVTOKENIZER is None or TOKENIZER is None:
        return None, "Please load models first!"

    text_vocab_size = len(TOKENIZER)
    processed = []
    filtered = 0
    failed = 0
    first_error = None

    try:
        progress(0, desc=f"Loading {source_info['name']}...")

        # Load dataset
        # NOTE(review): trust_remote_code=True allows code shipped in the
        # dataset repo to run; keep only for sources that are trusted.
        load_args = [source_info["id"]]
        if source_info.get("subset"):
            load_args.append(source_info["subset"])
        ds = load_dataset(*load_args, split="train", trust_remote_code=True)

        total = min(len(ds), int(max_samples))
        if total < len(ds):
            ds = ds.shuffle(seed=42).select(range(total))

        # Find columns
        audio_col = next((c for c in ds.column_names if "audio" in c.lower()), None)
        text_col = next(
            (c for c in ds.column_names if c in ["text", "sentence", "transcription", "transcript"]),
            None,
        )
        if not audio_col or not text_col:
            return None, f"Could not find audio/text columns in {ds.column_names}"

        # Process
        for i, item in enumerate(ds):
            if i % 10 == 0:
                progress(i / total, desc=f"Processing {i}/{total}...")

            try:
                sample = _build_sample(item, audio_col, text_col, language, text_vocab_size)
            except Exception as e:
                # Best effort: one bad row must not abort the run, but the
                # failure is recorded so it shows up in the status message.
                failed += 1
                if first_error is None:
                    first_error = f"{type(e).__name__}: {e}"
            else:
                if sample is None:
                    filtered += 1
                else:
                    processed.append(sample)

            # Memory cleanup every 100 samples
            if i % 100 == 0:
                gc.collect()

        progress(1.0, desc="Done!")

        message = f"Processed {len(processed)} samples for {language}"
        if filtered or failed:
            message += f" ({filtered} filtered out, {failed} failed"
            if first_error is not None:
                message += f"; first error: {first_error}"
            message += ")"
        return processed, message
    except Exception as e:
        return None, f"Error: {str(e)}"
def run_full_preprocessing(languages, max_per_source, hf_token, progress=gr.Progress()):
    """Preprocess the selected languages, merge with the existing dataset and push to the Hub.

    Args:
        languages: Comma-separated language names; each must be a key of
            ``DATA_SOURCES``. Blank entries and repeats are ignored so a
            language is never processed twice.
        max_per_source: Row cap forwarded to ``preprocess_dataset`` for every
            source.
        hf_token: Hugging Face token used for ``login`` and for the push.
            Surrounding whitespace is stripped.
        progress: Gradio progress callback.

    Returns:
        A multi-line status log: per-source messages, totals, the language
        distribution of the combined dataset, and either the dataset URL or
        the combine/push error.
    """
    global HF_TOKEN
    from collections import Counter

    # Pasted tokens often carry a trailing space or newline.
    hf_token = (hf_token or "").strip()
    if not hf_token:
        return "Please provide HuggingFace token!"
    HF_TOKEN = hf_token
    login(token=HF_TOKEN)

    # Load models if needed
    if WAVTOKENIZER is None:
        load_models()

    all_samples = []
    status_log = []

    # Normalize the selection: trim, lowercase, drop blanks, and de-duplicate
    # while keeping the order the user typed.
    selected_langs = list(dict.fromkeys(
        name.strip().lower()
        for name in (languages or "").split(",")
        if name.strip()
    ))

    # Process each selected language
    for lang in selected_langs:
        if lang not in DATA_SOURCES:
            status_log.append(f"Unknown language: {lang}")
            continue
        for source in DATA_SOURCES[lang]:
            progress(0, desc=f"Processing {source['name']}...")
            samples, msg = preprocess_dataset(source, lang, max_per_source, progress)
            status_log.append(msg)
            if samples:
                all_samples.extend(samples)
            gc.collect()

    if not all_samples:
        return "\n".join(status_log) + "\n\nNo samples processed!"
    status_log.append(f"\nTotal new samples: {len(all_samples)}")

    # Load existing and combine
    progress(0.9, desc="Combining with existing dataset...")
    try:
        existing = load_dataset(EXISTING_DATASET, split="train")
        existing_slim = existing.select_columns(["input_ids", "language"])
        new_ds = Dataset.from_list(all_samples)
        combined = concatenate_datasets([existing_slim, new_ds])
        status_log.append(f"Combined: {len(combined)} total samples")

        # Count languages from the single column rather than iterating whole
        # rows, which would materialize every input_ids list just to read a
        # label.
        lang_counts = Counter(combined["language"])
        status_log.append("\nLanguage distribution:")
        for lang_name, count in lang_counts.most_common():
            status_log.append(f" {lang_name}: {count:,}")

        # Push to Hub
        progress(0.95, desc="Pushing to HuggingFace Hub...")
        combined.push_to_hub(OUTPUT_DATASET, token=HF_TOKEN)
        status_log.append(f"\nDataset saved: https://huggingface.co/datasets/{OUTPUT_DATASET}")
    except Exception as e:
        status_log.append(f"Error combining/pushing: {str(e)}")

    progress(1.0, desc="Complete!")
    return "\n".join(status_log)
# ============================================================================
# GRADIO UI
# ============================================================================
# Startup banner, written to stdout when the module is executed/imported.
print(
    "=" * 60,
    "NIGERIAN TTS DATA PREPROCESSOR",
    "Runs on FREE CPU - saves GPU costs!",
    "=" * 60,
    sep="\n",
)
# Gradio UI definition.
# NOTE(review): SOURCE had lost its indentation; the nesting below (inputs and
# buttons in the first column, status box in the second, help text after the
# row) is reconstructed - confirm against the deployed layout.
with gr.Blocks(title="Nigerian TTS Preprocessor") as demo:
    gr.Markdown("# Nigerian TTS Data Preprocessor")
    gr.Markdown("Preprocess audio datasets on **FREE CPU** to save RunPod GPU costs.")
    with gr.Row():
        # Inputs and action buttons.
        with gr.Column():
            hf_token = gr.Textbox(
                label="HuggingFace Token (write access)",
                type="password",
                placeholder="hf_..."
            )
            # Free-text list; names are matched against DATA_SOURCES keys.
            languages = gr.Textbox(
                label="Languages to process (comma-separated)",
                value="pidgin",
                placeholder="pidgin,english"
            )
            # Passed to run_full_preprocessing() as max_per_source.
            max_samples = gr.Slider(
                minimum=100,
                maximum=20000,
                value=5000,
                step=100,
                label="Max samples per source"
            )
            load_btn = gr.Button("1. Load Models", variant="secondary")
            run_btn = gr.Button("2. Run Preprocessing", variant="primary")
        # Status text returned by either button's callback.
        with gr.Column():
            output = gr.Textbox(
                label="Status",
                lines=20,
                max_lines=30
            )
    gr.Markdown("""
## Instructions
1. Enter your HuggingFace token (needs write access)
2. Click "Load Models" to load WavTokenizer
3. Set languages to "pidgin" (or "pidgin,english" for both)
4. Click "Run Preprocessing" - this will take a while on CPU!
5. Once done, train on RunPod using the new dataset
## Available Datasets (VERIFIED WORKING)
- **Pidgin**: `timniel/Pidgin_ASR_Dataset_Combined` (5,883 samples)
- **Pidgin**: `Rexe/nigerian-pidgin-speech` (65 samples)
- **English**: `Donmonc/nigerian_english_tts` (498 Nigerian English samples)
## Current Status
- Existing dataset: `UbuntuFarms/nigerian-tts-preprocessed-v2` (148k samples)
- yoruba: 53,332 samples
- igbo: 47,526 samples
- hausa: 47,288 samples
- **pidgin: 0 samples** (CRITICAL - this is why Pidgin produces white noise!)
- Output: `UbuntuFarms/nigerian-tts-preprocessed-v3`
""")
    # load_models() takes no inputs; its return string goes to the status box.
    load_btn.click(fn=load_models, outputs=output)
    # Input order matches run_full_preprocessing(languages, max_per_source, hf_token).
    run_btn.click(
        fn=run_full_preprocessing,
        inputs=[languages, max_samples, hf_token],
        outputs=output
    )
# Launch the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()