Imakandi-Labs's picture
Upload folder using huggingface_hub
85a8cdd verified
#!/usr/bin/env python3
"""
Nigerian TTS Data Preprocessor V4 (Simplified)
===============================================
Prepares Pidgin, English, Yoruba, Hausa and Igbo speech datasets for TTS training.
Stores audio paths and text - WavTokenizer encoding happens during training.
Outputs: UbuntuFarms/nigerian-tts-preprocessed-v4
"""
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
import gradio as gr
import numpy as np
from datasets import load_dataset, Dataset, Audio
from huggingface_hub import HfApi, login
import time
# Configuration
# Hugging Face access token read from the environment; empty when not set.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Hub dataset repository that receives the preprocessed data.
OUTPUT_DATASET = "UbuntuFarms/nigerian-tts-preprocessed-v4"
# Sampling rate (Hz) used when casting the output audio column.
SAMPLE_RATE = 24000
# Accepted clip length, in seconds; clips outside this range are rejected.
MIN_DURATION = 1.0
MAX_DURATION = 20.0
# Datasets to process.
#
# Each entry maps a short key (shown in the UI dropdown) to:
#   repo      - source dataset repository on the Hugging Face Hub
#   config    - optional configuration/subset name passed to load_dataset
#   audio_col - name of the column holding the audio
#   text_col  - name of the column holding the transcript
#   language  - label stored with every sample and used as the output config name
DATASETS_CONFIG = {
    # === PIDGIN ===
    "pidgin": {
        "repo": "asr-nigerian-pidgin/nigerian-pidgin-1.0",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "pidgin",
    },

    # === ENGLISH ===
    "english_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "english",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "english_cv",
    },
    "english_accented": {
        "repo": "benjaminogbonna/nigerian_accented_english_dataset",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "english_accented",
    },

    # === YORUBA (Additional) ===
    "yoruba_parallel": {
        "repo": "michsethowusu/yoruba-speech-text-parallel",
        "audio_col": "audio",
        "text_col": "text",
        "language": "yoruba_extra",
    },
    "yoruba_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "yoruba",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "yoruba_cv",
    },

    # === HAUSA (Additional) ===
    "hausa_twb": {
        "repo": "CLEAR-Global/TWB-Voice-1.0",
        "config": "hau",
        "audio_col": "audio",
        "text_col": "text",
        "language": "hausa_twb",
    },
    "hausa_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "hausa",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "hausa_cv",
    },

    # === IGBO (Additional) ===
    "igbo_common_voice": {
        "repo": "benjaminogbonna/nigerian_common_voice_dataset",
        "config": "igbo",
        "audio_col": "audio",
        "text_col": "sentence",
        "language": "igbo_cv",
    },
}
# In-memory log lines for the current run.
processing_log = []


def log(msg):
    """Record a timestamped message and return the tail of the log.

    The message is prefixed with the current local time (``HH:MM:SS``),
    appended to ``processing_log`` and echoed to stdout.

    Returns:
        The 50 most recent log lines joined with newlines.
    """
    entry = "[{}] {}".format(time.strftime("%H:%M:%S"), msg)
    processing_log.append(entry)
    print(entry)
    recent = processing_log[-50:]
    return "\n".join(recent)
def process_sample(sample, language, text_col="sentence", audio_col="audio"):
    """Validate one raw sample and convert it to the output record format.

    Args:
        sample: Row from the source dataset; must support ``.get``.
        language: Label stored in the record's ``language`` field.
        text_col: Name of the column holding the transcript.
        audio_col: Name of the column holding the audio (default ``"audio"``).

    Returns:
        A ``(record, error)`` tuple. On success ``record`` is a dict with
        ``audio`` (float32 ``array`` plus ``sampling_rate``), ``text``,
        ``language`` and ``duration`` in seconds, and ``error`` is ``None``.
        On rejection ``record`` is ``None`` and ``error`` is a short reason
        string. Exceptions are never raised; they are reported as the reason.
    """
    try:
        # Get audio info
        audio = sample.get(audio_col)
        if audio is None:
            return None, "No audio"

        # Plain dicts are checked first and read with .get() so that a missing
        # field becomes a readable rejection reason instead of a KeyError text.
        # Any other subscriptable object is indexed directly.
        if isinstance(audio, dict):
            audio_array = audio.get("array")
            sample_rate = audio.get("sampling_rate")
        elif hasattr(audio, "__getitem__"):
            audio_array = audio["array"]
            sample_rate = audio["sampling_rate"]
        else:
            return None, f"Unknown audio format: {type(audio)}"

        if audio_array is None or len(audio_array) == 0:
            return None, "Empty audio"

        # Guard the division below against a missing or non-positive rate.
        if not sample_rate or sample_rate <= 0:
            return None, f"Invalid sampling rate: {sample_rate}"

        # Check duration
        duration = len(audio_array) / sample_rate
        if duration < MIN_DURATION:
            return None, f"Too short: {duration:.1f}s"
        if duration > MAX_DURATION:
            return None, f"Too long: {duration:.1f}s"

        # Get text
        text = sample.get(text_col, "")
        if not text or len(text.strip()) < 2:
            return None, "No text"
        text = text.strip()

        # Return processed sample with audio data
        record = {
            "audio": {
                "array": np.array(audio_array, dtype=np.float32),
                "sampling_rate": sample_rate,
            },
            "text": text,
            "language": language,
            "duration": duration,
        }
        return record, None
    except Exception as e:
        # Deliberately broad: one malformed sample must not abort a whole run;
        # the caller tallies the returned reason.
        return None, str(e)
def process_dataset(dataset_key, max_samples=5000, progress=gr.Progress()):
    """Stream one configured dataset, filter its samples and push them to the Hub.

    The run log is reset at the start of every call. Samples are read in
    streaming mode, validated with ``process_sample`` and, if any survive,
    pushed to ``OUTPUT_DATASET`` under a config named after the entry's
    ``language`` value.

    Args:
        dataset_key: Key into ``DATASETS_CONFIG``.
        max_samples: Maximum number of source samples to read.
        progress: Gradio progress tracker, called every 100 samples.

    Returns:
        A ``(status, log_text)`` tuple of strings. Failures are reported in
        ``status`` rather than raised, so the collected log is never lost.
    """
    import traceback

    global processing_log
    processing_log = []

    if dataset_key not in DATASETS_CONFIG:
        return f"Unknown dataset: {dataset_key}", ""

    config = DATASETS_CONFIG[dataset_key]
    language = config["language"]
    text_col = config.get("text_col", "sentence")
    audio_col = config.get("audio_col", "audio")
    log(f"Processing: {dataset_key}")
    log(f"Repository: {config['repo']}")

    # Login to HuggingFace
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set", "\n".join(processing_log)
    try:
        login(token=HF_TOKEN)
    except Exception as e:
        log(f"Login error: {e}")
        return f"Error: {e}", "\n".join(processing_log)
    log("Logged in to HuggingFace")

    # Load dataset
    log("Loading dataset...")
    load_args = [config["repo"]]
    if "config" in config:
        load_args.append(config["config"])
    try:
        # An entry may name a "split"; "train" remains the default.
        ds = load_dataset(*load_args, split=config.get("split", "train"), streaming=True)
        log("Dataset loaded (streaming mode)")
    except Exception as e:
        log(f"Error loading dataset: {e}")
        return f"Error: {e}", "\n".join(processing_log)

    # Process samples
    processed = []
    errors = {}
    log(f"Processing up to {max_samples} samples...")
    try:
        for i, sample in enumerate(ds):
            if i >= max_samples:
                break
            if i % 100 == 0:
                progress((i / max_samples), f"Processing {i}/{max_samples}")
                log(f"Progress: {i}/{max_samples} (processed: {len(processed)}, errors: {sum(errors.values())})")

            # process_sample reads the "audio" key by default, so expose a
            # differently named audio column under that key.
            if audio_col != "audio":
                sample = {**sample, "audio": sample.get(audio_col)}

            result, error = process_sample(sample, language, text_col)
            if result:
                processed.append(result)
            else:
                errors[error] = errors.get(error, 0) + 1
    except Exception as e:
        # Iterating the stream can fail mid-run; report it together with the
        # log instead of letting the exception escape.
        log(f"Error while reading samples: {e}")
        return f"Error: {e}", "\n".join(processing_log)

    log(f"Processed: {len(processed)} samples")
    log(f"Errors: {sum(errors.values())}")
    # Show the five most frequent rejection reasons.
    for error, count in sorted(errors.items(), key=lambda x: -x[1])[:5]:
        log(f"  - {error}: {count}")

    if len(processed) == 0:
        return "No samples processed successfully", "\n".join(processing_log)

    # Create dataset
    log("Creating HuggingFace dataset...")
    try:
        output_ds = Dataset.from_list(processed)
        # Cast audio column to an Audio feature at the target sampling rate.
        output_ds = output_ds.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
    except Exception as e:
        log(f"Dataset build error: {e}")
        log(traceback.format_exc())
        return f"Dataset build error: {e}", "\n".join(processing_log)

    # Push to hub
    log(f"Pushing to {OUTPUT_DATASET}...")
    try:
        api = HfApi(token=HF_TOKEN)

        # Create repo if needed
        try:
            api.dataset_info(OUTPUT_DATASET)
        except Exception:
            api.create_repo(OUTPUT_DATASET, repo_type="dataset", exist_ok=True)

        # Push
        output_ds.push_to_hub(
            OUTPUT_DATASET,
            config_name=language,
            token=HF_TOKEN,
            commit_message=f"Add {language} data from {config['repo']}",
        )
        log(f"Pushed to {OUTPUT_DATASET} (config: {language})")
    except Exception as e:
        log(f"Push error: {e}")
        log(traceback.format_exc())
        return f"Push error: {e}", "\n".join(processing_log)

    return f"Success! Processed {len(processed)} {language} samples", "\n".join(processing_log)
def process_all(max_per_dataset=5000, progress=gr.Progress()):
    """Process every dataset in ``DATASETS_CONFIG``, in definition order.

    Args:
        max_per_dataset: Maximum number of source samples read per dataset.
        progress: Gradio progress tracker, also handed to each dataset run.

    Returns:
        A ``(summary, log_text)`` tuple: ``summary`` has one ``key: status``
        line per dataset and ``log_text`` is the logs of all runs joined
        together.
    """
    results = []
    combined_log = []
    total = len(DATASETS_CONFIG)
    for i, key in enumerate(DATASETS_CONFIG):
        progress((i / total), f"Processing {key}...")
        # process_dataset starts a fresh processing_log on every call, so each
        # run's log is kept here; otherwise only the last dataset's log would
        # be returned.
        result, dataset_log = process_dataset(key, max_per_dataset, progress)
        results.append(f"{key}: {result}")
        if dataset_log:
            combined_log.append(dataset_log)
    return "\n".join(results), "\n".join(combined_log)
# Gradio UI
with gr.Blocks(title="Nigerian TTS Preprocessor V4") as demo:
    # The dataset list is generated from DATASETS_CONFIG so the description
    # always matches the choices offered in the dropdown below.
    _dataset_lines = "\n".join(
        f"- {key}: `{cfg['repo']}`"
        + (f" (config `{cfg['config']}`)" if "config" in cfg else "")
        for key, cfg in DATASETS_CONFIG.items()
    )
    gr.Markdown(
        "# Nigerian TTS Data Preprocessor V4\n\n"
        "Prepares Nigerian speech datasets for TTS training.\n\n"
        "Stores audio + text, WavTokenizer encoding happens during training on GPU.\n\n"
        "**Datasets:**\n"
        f"{_dataset_lines}\n\n"
        f"**Output:** `{OUTPUT_DATASET}`\n"
    )

    with gr.Row():
        dataset_choice = gr.Dropdown(
            choices=list(DATASETS_CONFIG.keys()) + ["all"],
            value="pidgin",
            label="Dataset to Process",
        )
        max_samples = gr.Slider(100, 50000, value=5000, step=100, label="Max Samples")

    process_btn = gr.Button("Start Processing", variant="primary")

    with gr.Row():
        status = gr.Textbox(label="Status", lines=3)
        log_output = gr.Textbox(label="Processing Log", lines=20)

    def run_processing(dataset_key, sample_limit, progress=gr.Progress()):
        """Run one dataset, or every dataset when "all" is selected.

        Returns the ``(status, log_text)`` pair shown in the two textboxes.
        """
        # The slider value is converted to int before use as a sample count.
        limit = int(sample_limit)
        if dataset_key == "all":
            return process_all(limit, progress)
        return process_dataset(dataset_key, limit, progress)

    process_btn.click(
        run_processing,
        inputs=[dataset_choice, max_samples],
        outputs=[status, log_output],
    )
if __name__ == "__main__":
    # Serve on 0.0.0.0:7860 with show_error enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)