"""RAVDESS English emotional speech dataset preparation script.

Extracts data/archive.zip, applies phone-quality preprocessing, and
generates manifest.csv for emotion2vec English evaluation.

Usage:
    python scripts/prepare_ravdess.py
    python scripts/prepare_ravdess.py --skip-phone  # skip phone preprocessing
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import csv |
| import logging |
| import sys |
| import zipfile |
| from pathlib import Path |
|
|
| import librosa |
| import numpy as np |
| import soundfile as sf |
|
|
# Resolve to an absolute path first: with a relative ``__file__`` (for
# example when the script is started from inside its own directory on older
# interpreters) ``.parent.parent`` would collapse to "." and point at the
# wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the project root at the front of the import path so the ``src``
# package can be imported when this file is run as a script.
sys.path.insert(0, str(PROJECT_ROOT))
|
|
| from src.common.phone_simulator import CompandingType, PhoneSimulator |
|
|
# Root logging configuration for the script: INFO and above, each record
# prefixed with a timestamp and its level name.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Script-wide logger used by every function in this file.
logger = logging.getLogger("prepare_ravdess")
|
|
| |
# RAVDESS emotion code (third field of the file name) -> emotion label
# written to the "emotion" column of the manifest. Codes 1 and 2 both map
# to "neutral".
RAVDESS_EMOTION_MAP = dict(
    zip(
        range(1, 9),
        (
            "neutral",
            "neutral",
            "joy",
            "sadness",
            "anger",
            "fear",
            "disgust",
            "surprise",
        ),
    )
)
|
|
# RAVDESS emotion code -> label reported as "emotion_raw"; unlike the
# mapping used for the "emotion" column, code 2 keeps its own name here.
RAVDESS_EMOTION_NAME = dict(
    enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
)
|
|
# Input archive and output root, both located under <project root>/data.
ARCHIVE_PATH = PROJECT_ROOT.joinpath("data", "archive.zip")
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "ravdess")
|
|
|
|
def parse_ravdess_filename(filename: str) -> dict | None:
    """Extract metadata from a RAVDESS file name.

    Format: Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.wav
    Example: 03-01-05-02-01-01-12.wav

    Args:
        filename: File name or path; only the stem (the final component
            without its extension) is parsed.

    Returns:
        A dict with the keys ``modality``, ``vocal_channel``,
        ``emotion_code``, ``emotion_raw``, ``emotion``, ``intensity``,
        ``statement``, ``repetition`` and ``actor_id``, or ``None`` when the
        stem is not exactly seven ``-``-separated integer fields.
    """
    parts = Path(filename).stem.split("-")
    if len(parts) != 7:
        return None

    try:
        fields = [int(part) for part in parts]
    except ValueError:
        # A seven-part name with a non-numeric field is not a RAVDESS file;
        # report it like any other unparseable name instead of raising.
        return None

    (
        modality,
        vocal_channel,
        emotion_code,
        intensity,
        statement,
        repetition,
        actor_id,
    ) = fields

    return {
        "modality": modality,
        "vocal_channel": vocal_channel,
        "emotion_code": emotion_code,
        # Codes missing from the lookup tables fall back to "unknown" for the
        # raw label and "neutral" for the manifest label.
        "emotion_raw": RAVDESS_EMOTION_NAME.get(emotion_code, "unknown"),
        "emotion": RAVDESS_EMOTION_MAP.get(emotion_code, "neutral"),
        "intensity": intensity,
        "statement": statement,
        "repetition": repetition,
        "actor_id": actor_id,
    }
|
|
|
|
def extract_archive(archive_path: Path, output_dir: Path) -> list[Path]:
    """Extract the ``.wav`` members of *archive_path* into ``output_dir/clean``.

    If ``output_dir/clean`` already contains at least one ``.wav`` file the
    archive is not opened and the existing files are returned instead.

    Args:
        archive_path: Zip archive to read.
        output_dir: Directory under which the ``clean`` directory is created.

    Returns:
        Sorted list of every ``.wav`` path found under ``output_dir/clean``.
    """
    clean_dir = output_dir / "clean"

    # Any existing WAV counts as "already extracted"; a partially extracted
    # directory is therefore not completed on a rerun.
    if clean_dir.exists() and any(clean_dir.rglob("*.wav")):
        wavs = sorted(clean_dir.rglob("*.wav"))
        logger.info(f"μ΄λ―Έ μμΆ ν΄μ λ¨: {len(wavs)}κ° WAV in {clean_dir}")
        return wavs

    clean_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"μμΆ ν΄μ μ€: {archive_path} β {clean_dir}")

    clean_root = clean_dir.resolve()

    with zipfile.ZipFile(archive_path, "r") as zf:
        wav_members = [m for m in zf.namelist() if m.endswith(".wav")]
        for i, member in enumerate(wav_members, 1):
            # Preserve the archive's internal directory layout under clean/.
            target = clean_dir / member

            # Member names come from the archive and may contain ".." or be
            # absolute; never write outside the extraction directory.
            if clean_root not in target.resolve().parents:
                logger.warning("Skipping unsafe archive member: %s", member)
                continue

            target.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(member) as src, open(target, "wb") as dst:
                dst.write(src.read())
            if i % 500 == 0:
                logger.info(f" [{i}/{len(wav_members)}] μμΆ ν΄μ μ€...")

    wavs = sorted(clean_dir.rglob("*.wav"))
    logger.info(f"μμΆ ν΄μ μλ£: {len(wavs)}κ° WAV")
    return wavs
|
|
|
|
def apply_phone_simulation(clean_wavs: list[Path], output_dir: Path) -> dict[str, Path]:
    """Convert clean WAVs to phone quality and map each input to its output.

    Each file under ``output_dir/clean`` is written to the same relative
    location under ``output_dir/phone``. Outputs that already exist are
    reused without reprocessing. A file whose conversion fails is logged and
    left out of the result.

    Args:
        clean_wavs: Paths of the clean WAV files; each must be located under
            ``output_dir/clean``.
        output_dir: Directory containing ``clean`` and receiving ``phone``.

    Returns:
        ``{str(clean_path): phone_path}`` for every reused or converted file.

    Raises:
        ValueError: If a path in *clean_wavs* is not under ``output_dir/clean``.
    """
    clean_dir = output_dir / "clean"
    phone_dir = output_dir / "phone"
    simulator = PhoneSimulator(companding=CompandingType.ULAW)

    mapping = {}
    total = len(clean_wavs)

    for i, wav_path in enumerate(clean_wavs, 1):
        # Mirror the clean/ directory layout under phone/.
        phone_path = phone_dir / wav_path.relative_to(clean_dir)
        phone_path.parent.mkdir(parents=True, exist_ok=True)

        # Reuse the result of an earlier run.
        if phone_path.exists():
            mapping[str(wav_path)] = phone_path
            continue

        try:
            # sr=None asks librosa not to resample on load.
            audio, sr = librosa.load(str(wav_path), sr=None, mono=True)
            processed, new_sr = simulator.process(audio, sr)
            sf.write(str(phone_path), processed, new_sr, subtype="PCM_16")
        except Exception as e:
            logger.warning(f"μ ν λ³ν μ€ν¨ [{wav_path.name}]: {e}")
            # A failed write can leave a truncated file behind; remove it so
            # the existence check above does not mistake it for a finished
            # conversion on the next run. Cleanup itself is best-effort.
            try:
                if phone_path.exists():
                    phone_path.unlink()
            except OSError:
                pass
        else:
            mapping[str(wav_path)] = phone_path

        if i % 500 == 0:
            logger.info(f" [{i}/{total}] μ ν νμ§ λ³ν μ€...")

    logger.info(f"μ ν νμ§ λ³ν μλ£: {len(mapping)}/{total}")
    return mapping
|
|
|
|
def build_manifest(
    clean_wavs: list[Path],
    phone_mapping: dict[str, Path] | None,
    output_dir: Path,
) -> Path:
    """Write ``manifest.csv`` with one row per parseable clean WAV.

    Files whose names cannot be parsed as RAVDESS names are logged and
    skipped. The per-emotion row counts are logged before the file is written.

    Args:
        clean_wavs: Paths of the clean WAV files to list.
        phone_mapping: ``{str(clean_path): phone_path}`` lookup, or ``None``
            when phone simulation was skipped; files without an entry get an
            empty ``phone_path`` column.
        output_dir: Directory in which ``manifest.csv`` is created.

    Returns:
        Path of the written manifest.
    """
    from collections import Counter

    manifest_path = output_dir / "manifest.csv"
    fieldnames = [
        "clean_path", "phone_path", "emotion", "emotion_raw",
        "actor_id", "intensity", "statement", "repetition",
    ]
    rows = []

    for wav_path in clean_wavs:
        meta = parse_ravdess_filename(wav_path.name)
        if meta is None:
            logger.warning(f"νμΌλͺ νμ± μ€ν¨: {wav_path.name}")
            continue

        clean_key = str(wav_path)
        phone_path = ""
        if phone_mapping and clean_key in phone_mapping:
            phone_path = str(phone_mapping[clean_key])

        rows.append({
            "clean_path": clean_key,
            "phone_path": phone_path,
            "emotion": meta["emotion"],
            "emotion_raw": meta["emotion_raw"],
            "actor_id": meta["actor_id"],
            "intensity": meta["intensity"],
            "statement": meta["statement"],
            "repetition": meta["repetition"],
        })

    # Log the label distribution.
    emotion_counts = Counter(r["emotion"] for r in rows)
    logger.info("κ°μ λΆν¬:")
    for emotion, count in sorted(emotion_counts.items()):
        logger.info(f" {emotion}: {count}")

    # Explicit UTF-8 so the manifest does not depend on the platform's
    # default encoding; newline="" is what the csv module expects.
    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    logger.info(f"manifest μ μ₯: {manifest_path} ({len(rows)}ν)")
    return manifest_path
|
|
|
|
def main():
    """Run the preparation pipeline: extract, optionally simulate, write manifest.

    Exits with status 1 when the input archive does not exist. The
    ``--skip-phone`` flag disables the phone-quality conversion step, in
    which case the manifest is built without a phone mapping.
    """
    cli = argparse.ArgumentParser(description="RAVDESS μμ΄ κ°μ λ°μ΄ν° μ€λΉ")
    cli.add_argument("--skip-phone", action="store_true", help="μ ν νμ§ μ μ²λ¦¬ μλ΅")
    options = cli.parse_args()

    if not ARCHIVE_PATH.exists():
        logger.error(f"archive.zipμ μ°Ύμ μ μμ΅λλ€: {ARCHIVE_PATH}")
        sys.exit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Step 1: extract the archive (or reuse an earlier extraction).
    clean_wavs = extract_archive(ARCHIVE_PATH, OUTPUT_DIR)

    # Step 2: phone-quality conversion, unless disabled on the command line.
    if options.skip_phone:
        logger.info("μ ν νμ§ μ μ²λ¦¬ μλ΅ (--skip-phone)")
        phone_mapping = None
    else:
        phone_mapping = apply_phone_simulation(clean_wavs, OUTPUT_DIR)

    # Step 3: write the manifest.
    build_manifest(clean_wavs, phone_mapping, OUTPUT_DIR)

    logger.info("μλ£!")
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|