ustwo-api / scripts /prepare_ravdess.py
asdfasdfqrqwer's picture
Deploy from GitHub 2026-04-23T03:56:31Z
c857b85
Raw
History Blame Contribute Delete
7.33 kB
#!/usr/bin/env python3
"""Prepare the RAVDESS English emotional-speech dataset.

Extracts data/archive.zip, applies telephone-quality preprocessing, and
generates the manifest.csv used for emotion2vec English evaluation.

Usage:
    python scripts/prepare_ravdess.py
    python scripts/prepare_ravdess.py --skip-phone  # skip the phone preprocessing
"""
from __future__ import annotations
import argparse
import csv
import logging
import sys
import zipfile
from pathlib import Path
import librosa
import numpy as np
import soundfile as sf
# Repository root: two directory levels above this file (the script is run as
# scripts/prepare_ravdess.py, so this is the parent of scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
# Put the repository root first on sys.path so that the project-local `src`
# package imported on the next line resolves when this file is run directly.
# This must stay before the `from src...` import.
sys.path.insert(0, str(PROJECT_ROOT))
from src.common.phone_simulator import CompandingType, PhoneSimulator
# Root logging configuration: INFO and above, formatted as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Named logger shared by every function in this script.
logger = logging.getLogger("prepare_ravdess")
# Original RAVDESS label for each emotion code.
RAVDESS_EMOTION_NAME = {
    1: "neutral",
    2: "calm",
    3: "happy",
    4: "sad",
    5: "angry",
    6: "fearful",
    7: "disgust",
    8: "surprised",
}

# RAVDESS emotion code -> label in the project's 7-class taxonomy.
RAVDESS_EMOTION_MAP = {
    1: "neutral",
    2: "neutral",   # calm is folded into neutral: the project taxonomy has no "calm"
    3: "joy",       # happy
    4: "sadness",   # sad
    5: "anger",     # angry
    6: "fear",      # fearful
    7: "disgust",
    8: "surprise",  # surprised
}
# Input archive and output root; both live under <PROJECT_ROOT>/data.
ARCHIVE_PATH = PROJECT_ROOT.joinpath("data", "archive.zip")
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "ravdess")
def parse_ravdess_filename(filename: str) -> dict | None:
    """Extract metadata from a RAVDESS file name.

    RAVDESS names consist of seven dash-separated integer fields::

        Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.wav

    Example: ``03-01-05-02-01-01-12.wav``

    Args:
        filename: File name or path; only the stem of its last component is
            parsed.

    Returns:
        A dict with the integer fields ``modality``, ``vocal_channel``,
        ``emotion_code``, ``intensity``, ``statement``, ``repetition`` and
        ``actor_id``, plus ``emotion_raw`` (label from
        ``RAVDESS_EMOTION_NAME``, ``"unknown"`` for an unmapped code) and
        ``emotion`` (label from ``RAVDESS_EMOTION_MAP``, ``"neutral"`` for an
        unmapped code). Returns ``None`` when the stem does not have exactly
        seven fields or when any field is not an integer.
    """
    fields = Path(filename).stem.split("-")
    if len(fields) != 7:
        return None

    try:
        (
            modality,
            vocal_channel,
            emotion_code,
            intensity,
            statement,
            repetition,
            actor_id,
        ) = (int(field) for field in fields)
    except ValueError:
        # A stray non-RAVDESS file with six dashes must be reported as
        # "unparseable" like any other bad name, not abort the whole run.
        return None

    return {
        "modality": modality,
        "vocal_channel": vocal_channel,
        "emotion_code": emotion_code,
        "emotion_raw": RAVDESS_EMOTION_NAME.get(emotion_code, "unknown"),
        "emotion": RAVDESS_EMOTION_MAP.get(emotion_code, "neutral"),
        "intensity": intensity,    # 1=normal, 2=strong
        "statement": statement,    # 1="Kids...", 2="Dogs..."
        "repetition": repetition,
        "actor_id": actor_id,
    }
def extract_archive(archive_path: Path, output_dir: Path) -> list[Path]:
    """Extract the ``.wav`` members of a zip archive into ``output_dir/clean``.

    If ``output_dir/clean`` already contains at least one ``.wav`` file the
    archive is not opened and the existing files are returned as-is.
    NOTE(review): a previously interrupted extraction is therefore treated as
    complete; delete ``clean/`` to force a fresh extraction.

    Args:
        archive_path: Zip archive to read.
        output_dir: Dataset root; files are written below ``output_dir/clean``
            keeping each member's path inside the archive.

    Returns:
        Sorted list of every ``.wav`` path found under ``output_dir/clean``.

    Raises:
        zipfile.BadZipFile / OSError: propagated from reading the archive or
            writing the extracted files.
    """
    import shutil  # function-scope import keeps this block self-contained

    clean_dir = output_dir / "clean"
    if clean_dir.exists():
        wavs = sorted(clean_dir.rglob("*.wav"))
        if wavs:
            logger.info(f"이미 μ••μΆ• ν•΄μ œλ¨: {len(wavs)}개 WAV in {clean_dir}")
            return wavs

    clean_dir.mkdir(parents=True, exist_ok=True)
    clean_root = clean_dir.resolve()
    logger.info(f"μ••μΆ• ν•΄μ œ 쀑: {archive_path} β†’ {clean_dir}")

    with zipfile.ZipFile(archive_path, "r") as zf:
        wav_members = [m for m in zf.namelist() if m.endswith(".wav")]
        for i, member in enumerate(wav_members, 1):
            # Actor_NN/filename.wav -> clean/Actor_NN/filename.wav
            target = clean_dir / member
            # Member names come from the archive and are not trustworthy: an
            # absolute name or one containing ".." would otherwise be written
            # outside clean/ (the "Zip Slip" problem).
            if clean_root not in target.resolve().parents:
                logger.warning(f"Skipping archive member outside {clean_dir}: {member}")
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(member) as src, open(target, "wb") as dst:
                # Stream in chunks instead of loading the whole member in memory.
                shutil.copyfileobj(src, dst)
            if i % 500 == 0:
                logger.info(f" [{i}/{len(wav_members)}] μ••μΆ• ν•΄μ œ 쀑...")

    wavs = sorted(clean_dir.rglob("*.wav"))
    logger.info(f"μ••μΆ• ν•΄μ œ μ™„λ£Œ: {len(wavs)}개 WAV")
    return wavs
def apply_phone_simulation(clean_wavs: list[Path], output_dir: Path) -> dict[str, Path]:
    """Convert clean WAV files to phone quality under ``output_dir/phone``.

    Each input is loaded as mono at its native sample rate, passed through
    ``PhoneSimulator.process`` (configured for mu-law companding) and written
    as 16-bit PCM at the sample rate the simulator returns. The directory
    layout below ``output_dir/clean`` is mirrored below ``output_dir/phone``.
    Outputs that already exist are reused without reprocessing; a file whose
    conversion fails is logged and left out of the result.

    Args:
        clean_wavs: Paths of the clean files; each must be located under
            ``output_dir/clean``.
        output_dir: Dataset root containing ``clean/``; ``phone/`` is created
            next to it.

    Returns:
        Mapping ``{str(clean_path): phone_path}`` for every file that has a
        phone-quality counterpart.

    Raises:
        ValueError: if a path in ``clean_wavs`` is not under
            ``output_dir/clean`` (raised by ``Path.relative_to``).
    """
    clean_dir = output_dir / "clean"
    phone_dir = output_dir / "phone"
    # English speech -> North American telephony -> mu-law companding.
    simulator = PhoneSimulator(companding=CompandingType.ULAW)
    mapping: dict[str, Path] = {}
    total = len(clean_wavs)

    for i, wav_path in enumerate(clean_wavs, 1):
        # clean/Actor_NN/file.wav -> phone/Actor_NN/file.wav
        phone_path = phone_dir / wav_path.relative_to(clean_dir)
        phone_path.parent.mkdir(parents=True, exist_ok=True)

        if phone_path.exists():
            mapping[str(wav_path)] = phone_path
            continue

        # Write to a sibling temp file and rename only on success, so a failed
        # or interrupted write can never leave a truncated file at phone_path
        # that the exists() shortcut above would accept on the next run.
        # The temp name keeps the original suffix so the audio format is still
        # inferred from the extension.
        tmp_path = phone_path.with_name(f"{phone_path.stem}.tmp{phone_path.suffix}")
        try:
            audio, sr = librosa.load(str(wav_path), sr=None, mono=True)
            processed, new_sr = simulator.process(audio, sr)
            sf.write(str(tmp_path), processed, new_sr, subtype="PCM_16")
            tmp_path.replace(phone_path)
            mapping[str(wav_path)] = phone_path
        except Exception as e:
            # Deliberate best-effort: one bad file must not stop the batch.
            logger.warning(f"μ „ν™” λ³€ν™˜ μ‹€νŒ¨ [{wav_path.name}]: {e}")
        finally:
            # After a successful rename the temp file no longer exists.
            if tmp_path.exists():
                tmp_path.unlink()

        if i % 500 == 0:
            logger.info(f" [{i}/{total}] μ „ν™” ν’ˆμ§ˆ λ³€ν™˜ 쀑...")

    logger.info(f"μ „ν™” ν’ˆμ§ˆ λ³€ν™˜ μ™„λ£Œ: {len(mapping)}/{total}")
    return mapping
def build_manifest(
    clean_wavs: list[Path],
    phone_mapping: dict[str, Path] | None,
    output_dir: Path,
) -> Path:
    """Write ``output_dir/manifest.csv`` describing every parseable clean file.

    One row is produced per file whose name ``parse_ravdess_filename`` can
    parse; other files are logged and skipped. The per-emotion row counts are
    logged before the file is written.

    Args:
        clean_wavs: Paths of the clean WAV files.
        phone_mapping: ``{str(clean_path): phone_path}`` as returned by
            ``apply_phone_simulation``, or ``None`` when phone preprocessing
            was skipped. Files without an entry get an empty ``phone_path``.
        output_dir: Directory in which ``manifest.csv`` is created
            (an existing manifest is overwritten).

    Returns:
        Path of the written manifest.
    """
    from collections import Counter

    # Single source of truth for the column order of the CSV.
    meta_columns = (
        "emotion", "emotion_raw", "actor_id",
        "intensity", "statement", "repetition",
    )
    fieldnames = ["clean_path", "phone_path", *meta_columns]

    manifest_path = output_dir / "manifest.csv"
    phone_lookup = phone_mapping or {}

    rows = []
    for wav_path in clean_wavs:
        meta = parse_ravdess_filename(wav_path.name)
        if meta is None:
            logger.warning(f"파일λͺ… νŒŒμ‹± μ‹€νŒ¨: {wav_path.name}")
            continue

        clean_path = str(wav_path)
        phone_path = phone_lookup.get(clean_path)
        row = {
            "clean_path": clean_path,
            "phone_path": "" if phone_path is None else str(phone_path),
        }
        row.update((column, meta[column]) for column in meta_columns)
        rows.append(row)

    # Log per-emotion statistics.
    emotion_counts = Counter(r["emotion"] for r in rows)
    logger.info("감정 뢄포:")
    for emotion, count in sorted(emotion_counts.items()):
        logger.info(f" {emotion}: {count}")

    # Explicit UTF-8 so the manifest does not depend on the platform locale
    # and paths with non-ASCII characters can always be written.
    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    logger.info(f"manifest μ €μž₯: {manifest_path} ({len(rows)}ν–‰)")
    return manifest_path
def main():
    """Run the pipeline: extract the archive, optionally convert, write the manifest.

    Accepts one command-line flag, ``--skip-phone``, which disables the
    phone-quality conversion step. Exits with status 1 when the input archive
    (``ARCHIVE_PATH``) does not exist.
    """
    cli = argparse.ArgumentParser(description="RAVDESS μ˜μ–΄ 감정 데이터 μ€€λΉ„")
    cli.add_argument("--skip-phone", action="store_true", help="μ „ν™” ν’ˆμ§ˆ μ „μ²˜λ¦¬ μƒλž΅")
    options = cli.parse_args()

    if not ARCHIVE_PATH.exists():
        logger.error(f"archive.zip을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {ARCHIVE_PATH}")
        sys.exit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Step 1: extract the archive.
    extracted = extract_archive(ARCHIVE_PATH, OUTPUT_DIR)

    # Step 2: phone-quality preprocessing (optional).
    if args_skip := options.skip_phone:
        logger.info("μ „ν™” ν’ˆμ§ˆ μ „μ²˜λ¦¬ μƒλž΅ (--skip-phone)")
    converted = None if args_skip else apply_phone_simulation(extracted, OUTPUT_DIR)

    # Step 3: write manifest.csv.
    build_manifest(extracted, converted, OUTPUT_DIR)
    logger.info("μ™„λ£Œ!")


if __name__ == "__main__":
    main()