""" Create synthetic audio samples for testing fine-tuning and annotation. This script generates synthetic audio samples with different characteristics to simulate emotional speech for testing purposes before real datasets are available. """ import numpy as np import soundfile as sf from pathlib import Path import logging from typing import Dict, List import librosa logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class SyntheticAudioGenerator: """Generate synthetic audio samples with emotion-like characteristics.""" def __init__(self, sample_rate: int = 16000): self.sample_rate = sample_rate def generate_base_tone(self, duration: float, frequency: float) -> np.ndarray: """Generate a base tone with given frequency.""" t = np.linspace(0, duration, int(duration * self.sample_rate)) tone = np.sin(2 * np.pi * frequency * t) return tone def add_harmonics(self, tone: np.ndarray, frequencies: List[float], amplitudes: List[float]) -> np.ndarray: """Add harmonic frequencies to simulate voice complexity.""" duration = len(tone) / self.sample_rate t = np.linspace(0, duration, len(tone)) for freq, amp in zip(frequencies, amplitudes): harmonic = amp * np.sin(2 * np.pi * freq * t) tone = tone + harmonic return tone def apply_envelope(self, audio: np.ndarray, attack: float = 0.1, decay: float = 0.1, sustain: float = 0.7, release: float = 0.2) -> np.ndarray: """Apply ADSR envelope to audio.""" n_samples = len(audio) envelope = np.ones(n_samples) # Attack attack_samples = int(attack * n_samples) envelope[:attack_samples] = np.linspace(0, 1, attack_samples) # Decay decay_samples = int(decay * n_samples) decay_end = attack_samples + decay_samples envelope[attack_samples:decay_end] = np.linspace(1, sustain, decay_samples) # Sustain (already at sustain level) sustain_end = n_samples - int(release * n_samples) envelope[decay_end:sustain_end] = sustain # Release envelope[sustain_end:] = np.linspace(sustain, 0, n_samples - sustain_end) return audio * envelope def generate_neutral(self, duration: float = 3.0) -> np.ndarray: """ Generate neutral emotion audio. Characteristics: Medium pitch, steady rhythm, minimal variation. """ # Base frequency: medium pitch (male: ~120Hz, female: ~220Hz) base_freq = 150.0 tone = self.generate_base_tone(duration, base_freq) # Add subtle harmonics harmonics = [base_freq * 2, base_freq * 3, base_freq * 4] amplitudes = [0.3, 0.15, 0.08] tone = self.add_harmonics(tone, harmonics, amplitudes) # Steady envelope tone = self.apply_envelope(tone, attack=0.1, decay=0.05, sustain=0.8, release=0.15) # Normalize tone = tone / np.max(np.abs(tone)) * 0.7 return tone.astype(np.float32) def generate_happy(self, duration: float = 3.0) -> np.ndarray: """ Generate happy emotion audio. Characteristics: Higher pitch, faster rhythm, more energy. """ # Higher pitch base_freq = 200.0 tone = self.generate_base_tone(duration, base_freq) # More pronounced harmonics harmonics = [base_freq * 2, base_freq * 3, base_freq * 4, base_freq * 5] amplitudes = [0.4, 0.25, 0.15, 0.1] tone = self.add_harmonics(tone, harmonics, amplitudes) # Add vibrato (pitch modulation) t = np.linspace(0, duration, len(tone)) vibrato = 1 + 0.02 * np.sin(2 * np.pi * 5 * t) # 5Hz vibrato tone = tone * vibrato # Energetic envelope tone = self.apply_envelope(tone, attack=0.05, decay=0.05, sustain=0.9, release=0.1) # Higher energy tone = tone / np.max(np.abs(tone)) * 0.85 return tone.astype(np.float32) def generate_sad(self, duration: float = 3.0) -> np.ndarray: """ Generate sad emotion audio. Characteristics: Lower pitch, slower rhythm, less energy. """ # Lower pitch base_freq = 100.0 tone = self.generate_base_tone(duration, base_freq) # Fewer harmonics (less bright) harmonics = [base_freq * 2, base_freq * 3] amplitudes = [0.25, 0.12] tone = self.add_harmonics(tone, harmonics, amplitudes) # Add tremolo (amplitude modulation) t = np.linspace(0, duration, len(tone)) tremolo = 1 - 0.05 * np.sin(2 * np.pi * 3 * t) # 3Hz tremolo tone = tone * tremolo # Slower envelope tone = self.apply_envelope(tone, attack=0.15, decay=0.1, sustain=0.6, release=0.25) # Lower energy tone = tone / np.max(np.abs(tone)) * 0.6 return tone.astype(np.float32) def generate_angry(self, duration: float = 3.0) -> np.ndarray: """ Generate angry emotion audio. Characteristics: Variable pitch, harsh harmonics, high energy. """ # Medium-high pitch with variations base_freq = 180.0 tone = self.generate_base_tone(duration, base_freq) # Harsh harmonics harmonics = [base_freq * 2, base_freq * 3, base_freq * 4, base_freq * 6] amplitudes = [0.5, 0.3, 0.2, 0.15] tone = self.add_harmonics(tone, harmonics, amplitudes) # Add roughness (noise) noise = np.random.randn(len(tone)) * 0.1 tone = tone + noise # Aggressive envelope tone = self.apply_envelope(tone, attack=0.02, decay=0.05, sustain=0.95, release=0.08) # High energy tone = tone / np.max(np.abs(tone)) * 0.9 return tone.astype(np.float32) def generate_fearful(self, duration: float = 3.0) -> np.ndarray: """ Generate fearful emotion audio. Characteristics: Variable pitch, trembling, high frequency. """ # Higher pitch with instability base_freq = 220.0 tone = self.generate_base_tone(duration, base_freq) # Unstable harmonics harmonics = [base_freq * 2, base_freq * 3, base_freq * 5] amplitudes = [0.35, 0.2, 0.15] tone = self.add_harmonics(tone, harmonics, amplitudes) # Add trembling (fast amplitude modulation) t = np.linspace(0, duration, len(tone)) trembling = 1 - 0.08 * np.sin(2 * np.pi * 8 * t) # 8Hz trembling tone = tone * trembling # Unstable envelope tone = self.apply_envelope(tone, attack=0.08, decay=0.12, sustain=0.7, release=0.15) tone = tone / np.max(np.abs(tone)) * 0.75 return tone.astype(np.float32) def generate_disgusted(self, duration: float = 3.0) -> np.ndarray: """ Generate disgusted emotion audio. Characteristics: Lower pitch, nasal quality, reduced energy. """ # Lower-medium pitch base_freq = 130.0 tone = self.generate_base_tone(duration, base_freq) # Nasal harmonics (odd harmonics emphasized) harmonics = [base_freq * 3, base_freq * 5, base_freq * 7] amplitudes = [0.4, 0.25, 0.15] tone = self.add_harmonics(tone, harmonics, amplitudes) # Add slight roughness noise = np.random.randn(len(tone)) * 0.05 tone = tone + noise # Reduced energy envelope tone = self.apply_envelope(tone, attack=0.12, decay=0.1, sustain=0.65, release=0.2) tone = tone / np.max(np.abs(tone)) * 0.65 return tone.astype(np.float32) def generate_surprised(self, duration: float = 3.0) -> np.ndarray: """ Generate surprised emotion audio. Characteristics: Sudden onset, high pitch, short duration tendency. """ # High pitch base_freq = 250.0 tone = self.generate_base_tone(duration, base_freq) # Bright harmonics harmonics = [base_freq * 2, base_freq * 3, base_freq * 4] amplitudes = [0.45, 0.3, 0.2] tone = self.add_harmonics(tone, harmonics, amplitudes) # Very fast attack envelope tone = self.apply_envelope(tone, attack=0.01, decay=0.15, sustain=0.8, release=0.12) tone = tone / np.max(np.abs(tone)) * 0.8 return tone.astype(np.float32) def create_test_dataset(output_dir: Path, samples_per_emotion: int = 10): """ Create a synthetic test dataset with multiple samples per emotion. Args: output_dir: Directory to save audio files samples_per_emotion: Number of samples to generate per emotion """ logger.info("šŸŽµ Creating synthetic test dataset...") logger.info(f"Output: {output_dir}") logger.info(f"Samples per emotion: {samples_per_emotion}") output_dir.mkdir(parents=True, exist_ok=True) generator = SyntheticAudioGenerator(sample_rate=16000) emotions = { "neutral": generator.generate_neutral, "happy": generator.generate_happy, "sad": generator.generate_sad, "angry": generator.generate_angry, "fearful": generator.generate_fearful, "disgusted": generator.generate_disgusted, "surprised": generator.generate_surprised } total_files = 0 for emotion, generate_fn in emotions.items(): emotion_dir = output_dir / emotion emotion_dir.mkdir(exist_ok=True) logger.info(f"\n Generating {emotion}...") for i in range(samples_per_emotion): # Vary duration slightly duration = 2.5 + np.random.rand() * 1.0 # 2.5 to 3.5 seconds audio = generate_fn(duration) filename = emotion_dir / f"{emotion}_{i:03d}.wav" sf.write(filename, audio, 16000) total_files += 1 logger.info(f" āœ“ {samples_per_emotion} files created") logger.info(f"\nāœ… Total: {total_files} synthetic audio files created") logger.info(f"šŸ“ Location: {output_dir}") # Create metadata file metadata = { "dataset_name": "synthetic_emotions_test", "total_samples": total_files, "samples_per_emotion": samples_per_emotion, "emotions": list(emotions.keys()), "sample_rate": 16000, "description": "Synthetic audio samples for testing emotion recognition" } import json with open(output_dir / "metadata.json", "w") as f: json.dump(metadata, f, indent=2) logger.info(f"šŸ“„ Metadata saved to: {output_dir / 'metadata.json'}") return output_dir def main(): import argparse parser = argparse.ArgumentParser(description="Create synthetic test audio data") parser.add_argument("--output", type=str, default="data/raw/synthetic/", help="Output directory") parser.add_argument("--samples", type=int, default=10, help="Samples per emotion (default: 10)") args = parser.parse_args() output_dir = Path(args.output) create_test_dataset(output_dir, args.samples) logger.info("\n" + "="*60) logger.info("Next steps:") logger.info("="*60) logger.info("\n1. Prepare dataset for training:") logger.info(f"\n python scripts/data/download_ptbr_datasets.py \\") logger.info(f" --prepare-local {output_dir}") logger.info("\n2. Fine-tune with synthetic data:") logger.info("\n python scripts/training/finetune_emotion2vec.py \\") logger.info(" --dataset data/prepared/synthetic_prepared \\") logger.info(" --epochs 5 \\") logger.info(" --device cpu") logger.info("\nšŸ’” Note: This is synthetic data for testing only.") logger.info(" Use real datasets (VERBO, emoUERJ) for production fine-tuning.") if __name__ == "__main__": main()