| | """ |
| | Download and prepare Portuguese BR emotion datasets (VERBO + emoUERJ). |
| | |
| | This script helps download and format the datasets for fine-tuning. |
| | """ |
| |
|
| | import logging |
| | import argparse |
| | from pathlib import Path |
| | import json |
| | import requests |
| | from datasets import Dataset, Audio |
| | import pandas as pd |
| |
|
# Configure root logging once at import time; use a module-level logger
# (stdlib convention) for all output in this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
| |
|
| |
|
def download_verbo():
    """
    Print download instructions for the VERBO dataset.

    VERBO: Brazilian Portuguese emotional speech corpus
    - 1,167 samples
    - 7 emotions: neutral, happy, sad, angry, fearful, disgusted, surprised
    - Paper: "VERBO: A Corpus for Emotion Recognition in Brazilian Portuguese"
    - Source: http://www02.smt.ufrj.br/~verbo/

    Note: VERBO requires manual download/authorization, so this function only
    logs the retrieval steps and the expected on-disk layout; it downloads
    nothing itself and returns ``None``.
    """
    logger.info("\n📥 VERBO Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: VERBO - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 1,167")
    logger.info("Emotions: 7 (neutral, happy, sad, angry, fearful, disgusted, surprised)")
    logger.info("\n⚠️ Manual download required:")
    logger.info("1. Visit: http://www02.smt.ufrj.br/~verbo/")
    logger.info("2. Request access to the dataset")
    logger.info("3. Download and extract to: data/raw/verbo/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/verbo/")
    logger.info("  ├── neutral/")
    logger.info("  ├── happy/")
    logger.info("  ├── sad/")
    logger.info("  ├── angry/")
    logger.info("  ├── fearful/")
    logger.info("  ├── disgusted/")
    logger.info("  └── surprised/")
| |
|
| |
|
def download_emouej():
    """
    Print download instructions for the emoUERJ dataset.

    emoUERJ: Brazilian Portuguese emotional speech dataset
    - 377 samples
    - 4 emotions: neutral, happy, sad, angry
    - Paper: "emoUERJ: A Deep Learning-Based Emotion Classifier for Brazilian Portuguese"
    - Source: UERJ (State University of Rio de Janeiro)

    Note: emoUERJ requires manual download/authorization, so this function
    only logs the retrieval steps and expected layout; it returns ``None``.
    """
    logger.info("\n📥 emoUERJ Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: emoUERJ - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 377")
    logger.info("Emotions: 4 (neutral, happy, sad, angry)")
    logger.info("\n⚠️ Manual download required:")
    logger.info("1. Contact UERJ researchers or check university repository")
    logger.info("2. Download and extract to: data/raw/emouej/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/emouej/")
    logger.info("  ├── neutral/")
    logger.info("  ├── happy/")
    logger.info("  ├── sad/")
    logger.info("  └── angry/")
| |
|
| |
|
def download_coraa_ser():
    """
    Print download instructions for the CORAA-SER dataset.

    CORAA-SER: Brazilian Portuguese Speech Emotion Recognition subset
    - Part of CORAA corpus (290 hours total)
    - Prosodic annotations available
    - Multiple speakers, spontaneous speech
    - Source: https://github.com/nilc-nlp/CORAA

    Note: CORAA is large; only the emotion/prosody subset is recommended.
    This function only logs instructions and returns ``None``.
    """
    logger.info("\n📥 CORAA-SER Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: CORAA - Brazilian Portuguese Speech Corpus")
    logger.info("Total: 290 hours")
    logger.info("Annotations: Prosodic features (intonation, stress, rhythm)")
    logger.info("\n⚠️ Large dataset - manual download recommended:")
    logger.info("1. Visit: https://github.com/nilc-nlp/CORAA")
    logger.info("2. Download emotion/prosody subset if available")
    logger.info("3. Extract to: data/raw/coraa/")
    logger.info("\n💡 Note: CORAA has prosodic annotations but limited emotion labels")
    logger.info("   Use primarily for prosody modeling, not emotion classification")
| |
|
| |
|
def prepare_local_dataset(data_dir: Path, emotion_folders: list):
    """
    Prepare a HuggingFace ``Dataset`` from an emotion-folder structure.

    Expected structure::

        data_dir/
        ├── emotion1/
        │   ├── audio1.wav
        │   └── audio2.wav
        └── emotion2/
            └── audio3.wav

    Args:
        data_dir: Root directory containing one sub-folder per emotion.
        emotion_folders: Emotion folder names to look for under ``data_dir``.
            Missing folders are skipped with a warning.

    Returns:
        A ``datasets.Dataset`` with columns ``audio`` (cast to 16 kHz),
        ``emotion`` and ``file_name``; ``None`` if the directory does not
        exist or no audio files were found.
    """
    logger.info(f"\n📁 Preparing dataset from: {data_dir}")

    if not data_dir.exists():
        logger.error(f"❌ Directory not found: {data_dir}")
        return None

    samples = []
    for emotion in emotion_folders:
        emotion_dir = data_dir / emotion
        if not emotion_dir.exists():
            logger.warning(f"⚠️ Emotion folder not found: {emotion_dir}")
            continue

        # Sort for a deterministic sample order across runs/filesystems
        # (glob order is not guaranteed).
        audio_files = sorted(
            list(emotion_dir.glob("*.wav"))
            + list(emotion_dir.glob("*.mp3"))
            + list(emotion_dir.glob("*.flac"))
        )

        logger.info(f"  {emotion}: {len(audio_files)} files")

        for audio_file in audio_files:
            samples.append({
                "audio": str(audio_file),
                "emotion": emotion,
                "file_name": audio_file.name,
            })

    if not samples:
        logger.error("❌ No audio files found")
        return None

    df = pd.DataFrame(samples)
    dataset = Dataset.from_pandas(df)
    # Decode audio lazily at 16 kHz — the sampling rate emotion models
    # in this project expect (presumably; confirm against the trainer).
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    logger.info(f"✅ Dataset created: {len(dataset)} samples")
    return dataset
| |
|
| |
|
def main():
    """
    CLI entry point.

    Two modes:
    - ``--prepare-local DIR``: build and save a HuggingFace dataset from a
      local emotion-folder directory, then exit.
    - otherwise: print manual-download instructions for the dataset(s)
      selected by ``--dataset`` (default: all).
    """
    parser = argparse.ArgumentParser(description="Download PT-BR emotion datasets")
    parser.add_argument("--dataset", type=str, choices=["verbo", "emouej", "coraa", "all"],
                        default="all", help="Dataset to download")
    parser.add_argument("--prepare-local", type=str, default=None,
                        help="Prepare dataset from local directory")
    parser.add_argument("--emotions", type=str, nargs="+",
                        default=["neutral", "happy", "sad", "angry", "fearful", "disgusted", "surprised"],
                        help="Emotion folders to look for")
    parser.add_argument("--output", type=str, default="data/prepared/",
                        help="Output directory for prepared datasets")

    args = parser.parse_args()

    logger.info("\n" + "=" * 60)
    logger.info("Portuguese BR Emotion Dataset Downloader")
    logger.info("=" * 60)

    # Local-preparation mode short-circuits the download instructions.
    if args.prepare_local:
        data_dir = Path(args.prepare_local)
        dataset = prepare_local_dataset(data_dir, args.emotions)

        if dataset is not None:
            output_path = Path(args.output) / f"{data_dir.name}_prepared"
            output_path.mkdir(parents=True, exist_ok=True)
            dataset.save_to_disk(str(output_path))
            logger.info(f"✅ Saved to: {output_path}")

        return

    if args.dataset in ["verbo", "all"]:
        download_verbo()

    if args.dataset in ["emouej", "all"]:
        download_emouej()

    if args.dataset in ["coraa", "all"]:
        download_coraa_ser()

    logger.info("\n" + "=" * 60)
    logger.info("📋 Next Steps:")
    logger.info("=" * 60)
    logger.info("1. Download datasets manually from sources above")
    logger.info("2. Extract to data/raw/ directory")
    logger.info("3. Run this script with --prepare-local to format for training:")
    logger.info("\n  python scripts/data/download_ptbr_datasets.py \\")
    logger.info("    --prepare-local data/raw/verbo/ \\")
    logger.info("    --emotions neutral happy sad angry fearful disgusted surprised")
    logger.info("\n4. Use prepared dataset for fine-tuning:")
    logger.info("\n  python scripts/training/finetune_emotion2vec.py")
| |
|
| |
|
# Script entry point guard: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
| |
|