#!/usr/bin/env python3 """RAVDESS 영어 감정 음성 데이터셋 준비 스크립트. data/archive.zip을 압축 해제하고, 전화 품질 전처리를 적용하여 emotion2vec 영어 평가용 manifest.csv를 생성한다. Usage: python scripts/prepare_ravdess.py python scripts/prepare_ravdess.py --skip-phone # 전화 전처리 생략 """ from __future__ import annotations import argparse import csv import logging import sys import zipfile from pathlib import Path import librosa import numpy as np import soundfile as sf PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from src.common.phone_simulator import CompandingType, PhoneSimulator logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) logger = logging.getLogger("prepare_ravdess") # RAVDESS emotion code → project 7-class taxonomy RAVDESS_EMOTION_MAP = { 1: "neutral", # neutral 2: "neutral", # calm → neutral (프로젝트 taxonomy에 calm 없음) 3: "joy", # happy → joy 4: "sadness", # sad → sadness 5: "anger", # angry → anger 6: "fear", # fearful → fear 7: "disgust", # disgust 8: "surprise", # surprised → surprise } RAVDESS_EMOTION_NAME = { 1: "neutral", 2: "calm", 3: "happy", 4: "sad", 5: "angry", 6: "fearful", 7: "disgust", 8: "surprised", } ARCHIVE_PATH = PROJECT_ROOT / "data" / "archive.zip" OUTPUT_DIR = PROJECT_ROOT / "data" / "ravdess" def parse_ravdess_filename(filename: str) -> dict | None: """RAVDESS 파일명에서 메타데이터 추출. Format: Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.wav Example: 03-01-05-02-01-01-12.wav """ stem = Path(filename).stem parts = stem.split("-") if len(parts) != 7: return None emotion_code = int(parts[2]) return { "modality": int(parts[0]), "vocal_channel": int(parts[1]), "emotion_code": emotion_code, "emotion_raw": RAVDESS_EMOTION_NAME.get(emotion_code, "unknown"), "emotion": RAVDESS_EMOTION_MAP.get(emotion_code, "neutral"), "intensity": int(parts[3]), # 1=normal, 2=strong "statement": int(parts[4]), # 1="Kids...", 2="Dogs..." "repetition": int(parts[5]), "actor_id": int(parts[6]), } def extract_archive(archive_path: Path, output_dir: Path) -> list[Path]: """archive.zip 압축 해제 → clean/ 디렉토리.""" clean_dir = output_dir / "clean" if clean_dir.exists() and any(clean_dir.rglob("*.wav")): wavs = sorted(clean_dir.rglob("*.wav")) logger.info(f"이미 압축 해제됨: {len(wavs)}개 WAV in {clean_dir}") return wavs clean_dir.mkdir(parents=True, exist_ok=True) logger.info(f"압축 해제 중: {archive_path} → {clean_dir}") with zipfile.ZipFile(archive_path, "r") as zf: wav_members = [m for m in zf.namelist() if m.endswith(".wav")] for i, member in enumerate(wav_members, 1): # Actor_NN/filename.wav → clean/Actor_NN/filename.wav target = clean_dir / member target.parent.mkdir(parents=True, exist_ok=True) with zf.open(member) as src, open(target, "wb") as dst: dst.write(src.read()) if i % 500 == 0: logger.info(f" [{i}/{len(wav_members)}] 압축 해제 중...") wavs = sorted(clean_dir.rglob("*.wav")) logger.info(f"압축 해제 완료: {len(wavs)}개 WAV") return wavs def apply_phone_simulation(clean_wavs: list[Path], output_dir: Path) -> dict[str, Path]: """clean WAV → phone 품질 변환. {clean_path_str: phone_path} 반환.""" phone_dir = output_dir / "phone" simulator = PhoneSimulator(companding=CompandingType.ULAW) # 영어 = 북미 μ-law mapping = {} total = len(clean_wavs) for i, wav_path in enumerate(clean_wavs, 1): # clean/Actor_NN/file.wav → phone/Actor_NN/file.wav relative = wav_path.relative_to(output_dir / "clean") phone_path = phone_dir / relative phone_path.parent.mkdir(parents=True, exist_ok=True) if phone_path.exists(): mapping[str(wav_path)] = phone_path continue try: audio, sr = librosa.load(str(wav_path), sr=None, mono=True) processed, new_sr = simulator.process(audio, sr) sf.write(str(phone_path), processed, new_sr, subtype="PCM_16") mapping[str(wav_path)] = phone_path except Exception as e: logger.warning(f"전화 변환 실패 [{wav_path.name}]: {e}") if i % 500 == 0: logger.info(f" [{i}/{total}] 전화 품질 변환 중...") logger.info(f"전화 품질 변환 완료: {len(mapping)}/{total}") return mapping def build_manifest( clean_wavs: list[Path], phone_mapping: dict[str, Path] | None, output_dir: Path, ) -> Path: """manifest.csv 생성.""" manifest_path = output_dir / "manifest.csv" rows = [] for wav_path in clean_wavs: meta = parse_ravdess_filename(wav_path.name) if meta is None: logger.warning(f"파일명 파싱 실패: {wav_path.name}") continue phone_path = "" if phone_mapping and str(wav_path) in phone_mapping: phone_path = str(phone_mapping[str(wav_path)]) rows.append({ "clean_path": str(wav_path), "phone_path": phone_path, "emotion": meta["emotion"], "emotion_raw": meta["emotion_raw"], "actor_id": meta["actor_id"], "intensity": meta["intensity"], "statement": meta["statement"], "repetition": meta["repetition"], }) # 감정별 통계 출력 from collections import Counter emotion_counts = Counter(r["emotion"] for r in rows) logger.info("감정 분포:") for emotion, count in sorted(emotion_counts.items()): logger.info(f" {emotion}: {count}") with open(manifest_path, "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=[ "clean_path", "phone_path", "emotion", "emotion_raw", "actor_id", "intensity", "statement", "repetition", ]) writer.writeheader() writer.writerows(rows) logger.info(f"manifest 저장: {manifest_path} ({len(rows)}행)") return manifest_path def main(): parser = argparse.ArgumentParser(description="RAVDESS 영어 감정 데이터 준비") parser.add_argument("--skip-phone", action="store_true", help="전화 품질 전처리 생략") args = parser.parse_args() if not ARCHIVE_PATH.exists(): logger.error(f"archive.zip을 찾을 수 없습니다: {ARCHIVE_PATH}") sys.exit(1) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # 1. 압축 해제 clean_wavs = extract_archive(ARCHIVE_PATH, OUTPUT_DIR) # 2. 전화 품질 전처리 phone_mapping = None if not args.skip_phone: phone_mapping = apply_phone_simulation(clean_wavs, OUTPUT_DIR) else: logger.info("전화 품질 전처리 생략 (--skip-phone)") # 3. manifest.csv 생성 build_manifest(clean_wavs, phone_mapping, OUTPUT_DIR) logger.info("완료!") if __name__ == "__main__": main()