"""Download and prepare Portuguese BR emotion datasets (VERBO + emoUERJ).

This script helps download and format the datasets for fine-tuning.
"""

import argparse
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Audio file patterns recognized when scanning local dataset folders.
AUDIO_PATTERNS = ("*.wav", "*.mp3", "*.flac")


def download_verbo():
    """
    Print download instructions for the VERBO dataset.

    VERBO: Brazilian Portuguese emotional speech corpus
    - 1,167 samples
    - 7 emotions: neutral, happy, sad, angry, fearful, disgusted, surprised
    - Paper: "VERBO: A Corpus for Emotion Recognition in Brazilian Portuguese"
    - Source: http://www02.smt.ufrj.br/~verbo/

    Note: VERBO may require manual download or authorization, so this
    function only logs instructions; it performs no network I/O.
    """
    logger.info("\nšŸ“„ VERBO Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: VERBO - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 1,167")
    logger.info("Emotions: 7 (neutral, happy, sad, angry, fearful, disgusted, surprised)")
    logger.info("\nāš ļø Manual download required:")
    logger.info("1. Visit: http://www02.smt.ufrj.br/~verbo/")
    logger.info("2. Request access to the dataset")
    logger.info("3. Download and extract to: data/raw/verbo/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/verbo/")
    logger.info("  ā”œā”€ā”€ neutral/")
    logger.info("  ā”œā”€ā”€ happy/")
    logger.info("  ā”œā”€ā”€ sad/")
    logger.info("  ā”œā”€ā”€ angry/")
    logger.info("  ā”œā”€ā”€ fearful/")
    logger.info("  ā”œā”€ā”€ disgusted/")
    logger.info("  └── surprised/")


def download_emouej():
    """
    Print download instructions for the emoUERJ dataset.

    emoUERJ: Brazilian Portuguese emotional speech dataset
    - 377 samples
    - 4 emotions: neutral, happy, sad, angry
    - Paper: "emoUERJ: A Deep Learning-Based Emotion Classifier for Brazilian Portuguese"
    - Source: UERJ (State University of Rio de Janeiro)

    Note: emoUERJ may require manual download or authorization.

    NOTE(review): the name/path spelling "emouej" looks like a typo for
    "emouerj" (emoUERJ) — kept as-is for CLI/path backward compatibility.
    """
    logger.info("\nšŸ“„ emoUERJ Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: emoUERJ - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 377")
    logger.info("Emotions: 4 (neutral, happy, sad, angry)")
    logger.info("\nāš ļø Manual download required:")
    logger.info("1. Contact UERJ researchers or check university repository")
    logger.info("2. Download and extract to: data/raw/emouej/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/emouej/")
    logger.info("  ā”œā”€ā”€ neutral/")
    logger.info("  ā”œā”€ā”€ happy/")
    logger.info("  ā”œā”€ā”€ sad/")
    logger.info("  └── angry/")


def download_coraa_ser():
    """
    Print download instructions for the CORAA-SER dataset.

    CORAA-SER: Brazilian Portuguese Speech Emotion Recognition subset
    - Part of CORAA corpus (290 hours total)
    - Prosodic annotations available
    - Multiple speakers, spontaneous speech
    - Source: https://github.com/nilc-nlp/CORAA

    Note: CORAA is large. Download only the emotion/prosody subset if possible.
    """
    logger.info("\nšŸ“„ CORAA-SER Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: CORAA - Brazilian Portuguese Speech Corpus")
    logger.info("Total: 290 hours")
    logger.info("Annotations: Prosodic features (intonation, stress, rhythm)")
    logger.info("\nāš ļø Large dataset - manual download recommended:")
    logger.info("1. Visit: https://github.com/nilc-nlp/CORAA")
    logger.info("2. Download emotion/prosody subset if available")
    logger.info("3. Extract to: data/raw/coraa/")
    logger.info("\nšŸ’” Note: CORAA has prosodic annotations but limited emotion labels")
    logger.info("   Use primarily for prosody modeling, not emotion classification")


def prepare_local_dataset(data_dir: Path, emotion_folders: list):
    """
    Prepare a HuggingFace dataset from an emotion-per-folder directory tree.

    Expected structure:
        data_dir/
        ā”œā”€ā”€ emotion1/
        │   ā”œā”€ā”€ audio1.wav
        │   └── audio2.wav
        ā”œā”€ā”€ emotion2/
        │   └── audio3.wav

    Args:
        data_dir: Root directory containing one sub-folder per emotion.
        emotion_folders: Emotion folder names to scan (missing ones are
            skipped with a warning).

    Returns:
        A `datasets.Dataset` with columns ``audio`` (cast to 16 kHz Audio),
        ``emotion`` and ``file_name``; or ``None`` when the root directory
        does not exist or no audio files were found.
    """
    logger.info(f"\nšŸ”„ Preparing dataset from: {data_dir}")

    if not data_dir.exists():
        logger.error(f"āŒ Directory not found: {data_dir}")
        return None

    samples = []
    for emotion in emotion_folders:
        emotion_dir = data_dir / emotion
        if not emotion_dir.exists():
            logger.warning(f"āš ļø Emotion folder not found: {emotion_dir}")
            continue

        # Collect all supported audio files; sorted() makes row order
        # deterministic across runs/filesystems (glob order is not).
        audio_files = sorted(
            audio_file
            for pattern in AUDIO_PATTERNS
            for audio_file in emotion_dir.glob(pattern)
        )
        logger.info(f"  {emotion}: {len(audio_files)} files")

        for audio_file in audio_files:
            samples.append({
                "audio": str(audio_file),
                "emotion": emotion,
                "file_name": audio_file.name,
            })

    if not samples:
        logger.error("āŒ No audio files found")
        return None

    # Heavy third-party deps are imported lazily so the instruction-only
    # code paths of this script work without them installed.
    import pandas as pd
    from datasets import Audio, Dataset

    df = pd.DataFrame(samples)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    logger.info(f"āœ… Dataset created: {len(dataset)} samples")
    return dataset


def main():
    """CLI entry point: show download instructions and/or prepare a local dataset."""
    parser = argparse.ArgumentParser(description="Download PT-BR emotion datasets")
    parser.add_argument("--dataset", type=str,
                        choices=["verbo", "emouej", "coraa", "all"],
                        default="all",
                        help="Dataset to download")
    parser.add_argument("--prepare-local", type=str, default=None,
                        help="Prepare dataset from local directory")
    parser.add_argument("--emotions", type=str, nargs="+",
                        default=["neutral", "happy", "sad", "angry",
                                 "fearful", "disgusted", "surprised"],
                        help="Emotion folders to look for")
    parser.add_argument("--output", type=str, default="data/prepared/",
                        help="Output directory for prepared datasets")

    args = parser.parse_args()

    logger.info("\n" + "=" * 60)
    logger.info("Portuguese BR Emotion Dataset Downloader")
    logger.info("=" * 60)

    # If preparing local dataset, do that and exit early.
    if args.prepare_local:
        data_dir = Path(args.prepare_local)
        dataset = prepare_local_dataset(data_dir, args.emotions)
        if dataset is not None:
            output_path = Path(args.output) / f"{data_dir.name}_prepared"
            output_path.mkdir(parents=True, exist_ok=True)
            dataset.save_to_disk(str(output_path))
            logger.info(f"āœ… Saved to: {output_path}")
        return

    # Otherwise just show the manual-download instructions.
    if args.dataset in ["verbo", "all"]:
        download_verbo()
    if args.dataset in ["emouej", "all"]:
        download_emouej()
    if args.dataset in ["coraa", "all"]:
        download_coraa_ser()

    logger.info("\n" + "=" * 60)
    logger.info("šŸ“ Next Steps:")
    logger.info("=" * 60)
    logger.info("1. Download datasets manually from sources above")
    logger.info("2. Extract to data/raw/ directory")
    logger.info("3. Run this script with --prepare-local to format for training:")
    logger.info("\n   python scripts/data/download_ptbr_datasets.py \\")
    logger.info("     --prepare-local data/raw/verbo/ \\")
    logger.info("     --emotions neutral happy sad angry fearful disgusted surprised")
    logger.info("\n4. Use prepared dataset for fine-tuning:")
    logger.info("\n   python scripts/training/finetune_emotion2vec.py")


if __name__ == "__main__":
    main()