# Source: Hugging Face Spaces — AswinMathew/anime-gen-api (Space status: Sleeping)
# File size: 26,006 Bytes; revision 7190fd0

"""One-time setup: download AnimeVox dataset, extract reference clips, encode voice states.

Usage:
    python scripts/setup_animevox.py

This script:
1. Downloads the AnimeVox dataset from HuggingFace (11,020 clips, 19 characters)
2. For each character, selects the best 15-20s reference clip
3. Exports reference WAV files to data/animevox_voices/{character}/reference.wav
4. Pre-encodes Pocket TTS voice states (.safetensors) for instant loading
5. Saves data/animevox_catalog.json with tagged character metadata
"""
import json
import os
import sys
import time
from pathlib import Path

import numpy as np

# Put the project root (the directory above this script's folder) at the
# front of the import path so project modules resolve when run as a script.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Trait metadata for well-known anime characters, used to tag catalog entries.
# A character missing from this table is auto-tagged with generic defaults.
def _profile(gender, age_category, vocal_traits, pitch_range):
    """Build one trait record (a fresh dict and list on every call)."""
    return {
        "gender": gender,
        "age_category": age_category,
        "vocal_traits": list(vocal_traits),
        "pitch_range": pitch_range,
    }


# One row per character: (name aliases, gender, age category, vocal traits, pitch range).
# Every alias in a row receives its own, independent copy of the record.
_CHARACTER_ROWS = (
    # --- Male characters ---
    (("Naruto Uzumaki", "Naruto"), "male", "teen", ("energetic", "brash", "determined"), "mid"),
    (("Sasuke Uchiha", "Sasuke"), "male", "teen", ("cold", "intense", "brooding"), "mid-low"),
    (("Ichigo Kurosaki", "Ichigo"), "male", "teen", ("intense", "brash", "determined"), "mid-low"),
    (("Eren Yeager", "Eren"), "male", "teen", ("passionate", "angry", "intense"), "mid"),
    (("Tanjiro Kamado", "Tanjiro"), "male", "teen", ("gentle", "determined", "earnest"), "mid"),
    (("Light Yagami", "Light"), "male", "young_adult", ("cold", "calculating", "confident"), "mid-low"),
    (("Goku",), "male", "adult", ("cheerful", "energetic", "simple"), "mid"),
    (("Vegeta",), "male", "adult", ("proud", "intense", "arrogant"), "mid-low"),
    (("Levi Ackerman", "Levi"), "male", "adult", ("cold", "calm", "authoritative"), "low"),
    (("L", "L Lawliet"), "male", "young_adult", ("calm", "quirky", "soft"), "mid"),
    (("Izuku Midoriya", "Deku"), "male", "teen", ("nervous", "earnest", "determined"), "mid-high"),
    (("Kirito",), "male", "teen", ("calm", "determined", "gentle"), "mid"),
    (("Kaneki Ken", "Kaneki"), "male", "young_adult", ("soft", "melancholic", "intense"), "mid"),
    (("Gon Freecss", "Gon"), "male", "child", ("cheerful", "energetic", "innocent"), "high"),
    (("Killua Zoldyck", "Killua"), "male", "child", ("cool", "playful", "intense"), "mid-high"),
    (("Edward Elric",), "male", "teen", ("brash", "passionate", "determined"), "mid"),
    (("Lelouch", "Lelouch Lamperouge"), "male", "young_adult", ("commanding", "calculating", "dramatic"), "mid-low"),
    (("Spike Spiegel", "Spike"), "male", "adult", ("cool", "laid-back", "witty"), "mid-low"),
    (("Luffy", "Monkey D. Luffy"), "male", "teen", ("cheerful", "energetic", "carefree"), "mid-high"),
    (("Bakugo", "Katsuki Bakugo"), "male", "teen", ("aggressive", "loud", "angry"), "mid"),
    (("Todoroki", "Shoto Todoroki"), "male", "teen", ("calm", "cold", "reserved"), "mid-low"),
    # --- Female characters ---
    (("Frieren",), "female", "adult", ("calm", "ethereal", "soft"), "mid"),
    (("Mikasa Ackerman", "Mikasa"), "female", "teen", ("calm", "determined", "quiet"), "mid-low"),
    (("Nezuko Kamado", "Nezuko"), "female", "teen", ("gentle", "soft", "innocent"), "mid-high"),
    (("Asuna", "Asuna Yuuki"), "female", "teen", ("gentle", "determined", "warm"), "mid"),
    (("Sakura Haruno", "Sakura"), "female", "teen", ("energetic", "emotional", "determined"), "mid-high"),
    (("Hinata Hyuga", "Hinata"), "female", "teen", ("shy", "soft", "gentle"), "mid-high"),
    (("Rukia Kuchiki", "Rukia"), "female", "young_adult", ("assertive", "serious", "caring"), "mid"),
    (("Ochako Uraraka", "Ochako"), "female", "teen", ("cheerful", "energetic", "warm"), "mid-high"),
    (("Misa Amane", "Misa"), "female", "young_adult", ("bubbly", "dramatic", "cute"), "high"),
    (("Winry Rockbell",), "female", "teen", ("fiery", "caring", "determined"), "mid"),
    (("Fern",), "female", "teen", ("serious", "diligent", "quiet"), "mid"),
    # Himmel is tagged male; he sits here only to keep the table's original order.
    (("Himmel",), "male", "young_adult", ("heroic", "warm", "confident"), "mid"),
    # --- AnimeVox dataset characters (all female) ---
    (("Rin Tohsaka",), "female", "teen", ("assertive", "confident", "sharp"), "mid"),
    (("Marin Kitagawa",), "female", "teen", ("cheerful", "bubbly", "energetic"), "mid-high"),
    (("Emilia",), "female", "young_adult", ("gentle", "kind", "earnest"), "mid-high"),
    (("Kurisu Makise",), "female", "young_adult", ("sharp", "intellectual", "tsundere"), "mid"),
    (("Megumin",), "female", "teen", ("dramatic", "energetic", "theatrical"), "mid-high"),
    (("Momo Ayase",), "female", "teen", ("brash", "energetic", "bold"), "mid"),
    (("Mai Sakurajima",), "female", "teen", ("calm", "cool", "witty"), "mid"),
    (("Madoka Kaname",), "female", "teen", ("gentle", "innocent", "soft"), "high"),
    (("Rem",), "female", "teen", ("devoted", "gentle", "intense"), "mid-high"),
    (("Saber",), "female", "young_adult", ("noble", "commanding", "determined"), "mid"),
    (("Homura Akemi",), "female", "teen", ("cold", "mysterious", "intense"), "mid-low"),
    (("Makima",), "female", "adult", ("cold", "calm", "manipulative"), "mid-low"),
    (("Shiro",), "female", "child", ("calm", "monotone", "intellectual"), "mid"),
    (("Lucy",), "female", "young_adult", ("cool", "reserved", "melancholic"), "mid"),
    (("Power",), "female", "young_adult", ("loud", "brash", "chaotic"), "mid-high"),
)

# Expand the alias table into a flat name -> traits mapping.
KNOWN_CHARACTERS = {
    alias: _profile(gender, age_category, vocal_traits, pitch_range)
    for aliases, gender, age_category, vocal_traits, pitch_range in _CHARACTER_ROWS
    for alias in aliases
}


def _sanitize_name(name: str) -> str:
    """Convert character name to filesystem-safe ID."""
    return name.lower().replace(" ", "_").replace(".", "").replace("'", "")


def _get_traits(character_name: str) -> dict:
    """Return trait metadata for a character, or generic defaults if unknown.

    Lookup order: the name exactly as given, the name with surrounding and
    repeated whitespace collapsed, then the first word of the name alone.

    Args:
        character_name: Character name as it appears in the dataset. Empty,
            whitespace-only, or ``None`` values are treated as unknown.

    Returns:
        A new dict with keys ``gender``, ``age_category``, ``vocal_traits``
        and ``pitch_range``. An unknown character gets ``gender == "unknown"``
        so it can be flagged for manual tagging.
    """
    # split() with no argument drops all whitespace, so an empty or
    # whitespace-only name yields no parts instead of raising on parts[0].
    name_parts = (character_name or "").split()

    candidates = [character_name]
    if name_parts:
        candidates.append(" ".join(name_parts))  # whitespace-normalized full name
        candidates.append(name_parts[0])         # first-name fallback

    for candidate in candidates:
        traits = KNOWN_CHARACTERS.get(candidate)
        if traits is not None:
            # Hand back a copy so callers cannot mutate the shared table.
            return {**traits, "vocal_traits": list(traits["vocal_traits"])}

    # Default — will need manual correction
    return {
        "gender": "unknown",
        "age_category": "young_adult",
        "vocal_traits": ["neutral"],
        "pitch_range": "mid",
    }


def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Decode an in-memory audio file into float32 samples and its sample rate.

    The bytes are wrapped in a file-like buffer and handed to ``soundfile``;
    the decoded array is converted to ``float32`` before being returned.
    """
    import io
    import soundfile as sf

    buffer = io.BytesIO(audio_bytes)
    waveform, sample_rate = sf.read(buffer)
    return waveform.astype(np.float32), sample_rate


def _score_clip(duration: float, rms: float) -> float:
    """Rate a clip's suitability as a voice reference (higher is better).

    Clips of 15-20s score 100, 10-25s score 80, 5-30s score 60; anything else
    scores ``max(0, 40 - |duration - 15|)``. Clips whose RMS level is below
    0.01 lose 20 points, which can push the score below zero.
    """
    if 15 <= duration <= 20:
        score = 100
    elif 10 <= duration <= 25:
        score = 80
    elif 5 <= duration <= 30:
        score = 60
    else:
        score = max(0, 40 - abs(duration - 15))

    # Penalize quiet clips
    if rms < 0.01:
        score -= 20
    return score


def main():
    """Run the AnimeVox setup end to end.

    Steps:
      1. Download the dataset's parquet shards and collect every clip row.
      2. Group clip indices by character name.
      3. Pick one reference clip per character and write it as int16 WAV.
      4. Encode a Pocket TTS voice state per reference, or — when voice
         cloning is unavailable — assign a built-in preset voice instead.
      5. Write the character catalog JSON and report untagged characters.

    Exits with status 1 if a required package is missing. If a catalog
    already exists the user is asked before anything is redone.
    """
    print("=" * 60)
    print("AnimeVox Setup — Voice Reference Library")
    print("=" * 60)

    try:
        from huggingface_hub import hf_hub_download
        import pyarrow.parquet as pq
    except ImportError:
        print("\nERROR: 'huggingface-hub' or 'pyarrow' not installed.")
        print("Run: pip install huggingface-hub pyarrow")
        sys.exit(1)

    try:
        # Availability check only; the module is used by _decode_audio.
        import soundfile as sf  # noqa: F401
    except ImportError:
        print("\nERROR: 'soundfile' not installed.")
        print("Run: pip install soundfile")
        sys.exit(1)

    try:
        import scipy.io.wavfile
    except ImportError:
        print("\nERROR: 'scipy' not installed.")
        print("Run: pip install scipy")
        sys.exit(1)

    voices_dir = PROJECT_ROOT / "data" / "animevox_voices"
    catalog_path = PROJECT_ROOT / "data" / "animevox_catalog.json"

    # Check if already set up
    if catalog_path.exists():
        with open(catalog_path, encoding="utf-8") as f:
            existing = json.load(f)
        print(f"\nCatalog already exists with {len(existing)} characters.")
        try:
            resp = input("Re-run setup? (y/N): ").strip().lower()
        except EOFError:
            # stdin is closed (non-interactive run): take the default answer "N".
            resp = ""
        if resp != "y":
            print("Skipped.")
            return

    # Step 1: Download parquet files directly (avoids torchcodec dependency)
    PARQUET_FILES = [f"data/train-{i:05d}-of-00007.parquet" for i in range(7)]
    print(f"\n[1/5] Downloading AnimeVox parquet files ({len(PARQUET_FILES)} shards)...")
    t0 = time.time()

    # One dict per clip: character_name, anime, audio_bytes, transcription.
    all_rows = []
    for shard in PARQUET_FILES:
        print(f"       Downloading {shard}...", end=" ", flush=True)
        path = hf_hub_download("taresh18/AnimeVox", shard, repo_type="dataset")
        table = pq.read_table(path, columns=["audio", "character_name", "anime", "transcription"])
        # Look each column up once per shard rather than once per row.
        audio_col = table.column("audio")
        name_col = table.column("character_name")
        anime_col = table.column("anime")
        text_col = table.column("transcription")
        for i in range(len(table)):
            audio_entry = audio_col[i].as_py()
            all_rows.append({
                "character_name": name_col[i].as_py(),
                "anime": anime_col[i].as_py(),
                "audio_bytes": audio_entry["bytes"],
                "transcription": text_col[i].as_py(),
            })
        print(f"{len(table)} rows")

    print(f"       Total: {len(all_rows)} clips in {time.time() - t0:.1f}s")

    # Step 2: Discover characters
    print("\n[2/5] Discovering characters...")
    char_clips: dict[str, list[int]] = {}
    char_anime: dict[str, str] = {}
    for idx, row in enumerate(all_rows):
        name = row["character_name"]
        if name not in char_clips:
            char_clips[name] = []
            # The anime of the first clip seen is recorded for the character.
            char_anime[name] = row["anime"]
        char_clips[name].append(idx)

    print(f"       Found {len(char_clips)} characters:")
    for name, clips in sorted(char_clips.items(), key=lambda x: -len(x[1])):
        print(f"         {name} ({char_anime[name]}): {len(clips)} clips")

    # Step 3: Select best reference clip per character
    print("\n[3/5] Selecting best reference clips (10-20s, clean audio)...")
    voices_dir.mkdir(parents=True, exist_ok=True)

    import random
    selected_refs = {}
    for name, clip_indices in char_clips.items():
        char_id = _sanitize_name(name)
        char_dir = voices_dir / char_id
        char_dir.mkdir(parents=True, exist_ok=True)
        ref_path = char_dir / "reference.wav"

        # Sample up to 80 clips to find a good reference
        sample_indices = clip_indices if len(clip_indices) <= 80 else random.sample(clip_indices, 80)

        best_idx = None
        best_duration = 0.0
        best_score = -1
        best_audio = None  # (samples, sample_rate) of the current best clip
        decode_failures = 0
        last_error = None

        print(f"  {name}: scanning {len(sample_indices)}/{len(clip_indices)} clips...", end=" ", flush=True)

        for idx in sample_indices:
            try:
                samples, sr = _decode_audio(all_rows[idx]["audio_bytes"])
                duration = len(samples) / sr
                rms = np.sqrt(np.mean(samples ** 2))
            except Exception as e:
                # Best-effort scan: skip clips that cannot be decoded, but
                # keep count so the failures are reported rather than hidden.
                decode_failures += 1
                last_error = e
                continue

            score = _score_clip(duration, rms)

            # Higher score wins; on a tie the longer clip is preferred.
            if score > best_score or (score == best_score and duration > best_duration):
                best_score = score
                best_duration = duration
                best_idx = idx
                best_audio = (samples, sr)

            # A top-tier clip cannot be beaten on score, so stop scanning.
            if best_score >= 100:
                break

        failure_note = ""
        if decode_failures:
            failure_note = f" [{decode_failures} clips failed to decode; last error: {last_error}]"

        if best_idx is not None:
            samples, sr = best_audio

            # Write as int16 WAV
            samples_int16 = np.clip(samples * 32767, -32768, 32767).astype(np.int16)
            scipy.io.wavfile.write(str(ref_path), sr, samples_int16)

            selected_refs[name] = {
                "path": str(ref_path),
                "duration": len(samples) / sr,
                "transcript": all_rows[best_idx].get("transcription", ""),
            }
            print(f"{len(samples)/sr:.1f}s saved (score={best_score}){failure_note}")
        else:
            print(f"NO SUITABLE CLIP FOUND{failure_note}")

    # Free memory — parquet data no longer needed
    del all_rows

    # Step 4: Encode voice states with Pocket TTS
    print("\n[4/5] Encoding voice states with Pocket TTS...")
    print("       (Loading model — ~2GB RAM, first run downloads weights)")

    try:
        from pocket_tts import TTSModel
        from pocket_tts.models.tts_model import export_model_state as _export_state
    except ImportError:
        print("\nERROR: 'pocket-tts' not installed.")
        print("Run: pip install pocket-tts")
        sys.exit(1)

    t0 = time.time()
    tts_model = TTSModel.load_model()
    print(f"       Model loaded in {time.time() - t0:.1f}s")

    # Check if voice cloning is available by testing with a reference
    voice_cloning_available = False
    if selected_refs:
        first_ref = next(iter(selected_refs.values()))
        try:
            tts_model.get_state_for_audio_prompt(first_ref["path"])
            voice_cloning_available = True
            print("       Voice cloning: AVAILABLE")
        except Exception as e:
            if "voice cloning" in str(e).lower():
                print("       Voice cloning: NOT AVAILABLE (gated model)")
                print("       To enable: accept terms at https://huggingface.co/kyutai/pocket-tts")
                print("                  then run: huggingface-cli login")
                print("       Using PRESET voices as fallback...")
            else:
                print(f"       Voice cloning test failed: {e}")

    # Encode voice states
    encoded_count = 0
    if voice_cloning_available:
        for name, ref_info in selected_refs.items():
            char_id = _sanitize_name(name)
            char_dir = voices_dir / char_id
            state_path = char_dir / "voice_state.safetensors"

            try:
                t0 = time.time()
                voice_state = tts_model.get_state_for_audio_prompt(ref_info["path"])
                _export_state(voice_state, str(state_path))

                elapsed = time.time() - t0
                print(f"  {name}: voice state encoded in {elapsed:.1f}s")
                encoded_count += 1
            except Exception as e:
                print(f"  {name}: FAILED — {e}")
    else:
        # Fallback: assign preset voice names (no .safetensors export needed)
        # Pocket TTS non-cloning model supports these built-in voices:
        PRESET_VOICES_FEMALE = ["alba", "fantine", "cosette", "eponine", "azelma"]
        PRESET_VOICES_MALE = ["marius", "javert", "jean"]
        print("\n       Assigning preset voices as fallback...")

        # Each gender rotates through its own preset list with its own
        # counter, so assignments do not depend on how many presets verified.
        female_idx = 0
        male_idx = 0
        for name in sorted(selected_refs.keys()):
            char_id = _sanitize_name(name)
            char_dir = voices_dir / char_id
            traits = _get_traits(name)
            gender = traits.get("gender", "female")

            if gender == "male":
                preset = PRESET_VOICES_MALE[male_idx % len(PRESET_VOICES_MALE)]
                male_idx += 1
            else:
                # Any non-male gender (including "unknown") gets a female preset.
                preset = PRESET_VOICES_FEMALE[female_idx % len(PRESET_VOICES_FEMALE)]
                female_idx += 1

            # Write preset name to a marker file (no .safetensors needed)
            preset_path = char_dir / "preset_voice.txt"
            with open(preset_path, "w", encoding="utf-8") as f:
                f.write(preset)

            # Verify it works
            try:
                tts_model.get_state_for_audio_prompt(preset)
                print(f"  {name}: preset '{preset}' verified OK")
                encoded_count += 1
            except Exception as e:
                print(f"  {name}: preset '{preset}' FAILED — {e}")

    print(f"\n       {encoded_count}/{len(selected_refs)} voice states encoded")

    # Step 5: Build catalog
    print("\n[5/5] Building voice catalog...")
    catalog = []
    for name in sorted(char_clips.keys()):
        char_id = _sanitize_name(name)
        char_dir = voices_dir / char_id
        ref_path = char_dir / "reference.wav"
        state_path = char_dir / "voice_state.safetensors"
        preset_path = char_dir / "preset_voice.txt"

        traits = _get_traits(name)

        # Determine voice source: .safetensors (cloned) or preset name.
        # A cloned voice state on disk takes precedence over a preset marker.
        voice_state = None
        preset_voice = None
        if state_path.exists():
            voice_state = str(state_path)
        elif preset_path.exists():
            preset_voice = preset_path.read_text().strip()

        entry = {
            "id": char_id,
            "display_name": name,
            "source_anime": char_anime.get(name, "Unknown"),
            "gender": traits["gender"],
            "age_category": traits["age_category"],
            "vocal_traits": traits["vocal_traits"],
            "pitch_range": traits["pitch_range"],
            "clip_count": len(char_clips[name]),
            "reference_path": str(ref_path) if ref_path.exists() else None,
            "voice_state_path": voice_state,
            "preset_voice": preset_voice,
        }
        catalog.append(entry)

        if voice_state:
            status = "ready (cloned)"
        elif preset_voice:
            status = f"ready (preset: {preset_voice})"
        elif ref_path.exists():
            status = "ref only"
        else:
            status = "MISSING"
        traits_flag = "" if traits["gender"] != "unknown" else " [NEEDS MANUAL TAGGING]"
        print(f"  {name}: {status}{traits_flag}")

    with open(catalog_path, "w", encoding="utf-8") as f:
        json.dump(catalog, f, indent=2)

    print(f"\nCatalog saved to {catalog_path}")
    print(f"Total: {len(catalog)} characters, {encoded_count} with voice states")

    # Check for untagged characters
    untagged = [e for e in catalog if e["gender"] == "unknown"]
    if untagged:
        print(f"\nWARNING: {len(untagged)} characters need manual trait tagging:")
        for e in untagged:
            print(f"  - {e['display_name']} ({e['source_anime']})")
        print(f"\nEdit {catalog_path} to add gender/age_category/vocal_traits for these characters.")

    print("\nSetup complete!")


if __name__ == "__main__":
    main()