"""One-time setup: download AnimeVox dataset, extract reference clips, encode voice states. Usage: python scripts/setup_animevox.py This script: 1. Downloads the AnimeVox dataset from HuggingFace (11,020 clips, 19 characters) 2. For each character, selects the best 15-20s reference clip 3. Exports reference WAV files to data/animevox_voices/{character}/reference.wav 4. Pre-encodes Pocket TTS voice states (.safetensors) for instant loading 5. Saves data/animevox_catalog.json with tagged character metadata """ import json import os import sys import time from pathlib import Path import numpy as np # Add project root to path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT)) # Known anime character metadata for trait tagging. # Characters not in this map get auto-tagged with defaults. KNOWN_CHARACTERS = { # --- Male characters --- "Naruto Uzumaki": { "gender": "male", "age_category": "teen", "vocal_traits": ["energetic", "brash", "determined"], "pitch_range": "mid", }, "Naruto": { "gender": "male", "age_category": "teen", "vocal_traits": ["energetic", "brash", "determined"], "pitch_range": "mid", }, "Sasuke Uchiha": { "gender": "male", "age_category": "teen", "vocal_traits": ["cold", "intense", "brooding"], "pitch_range": "mid-low", }, "Sasuke": { "gender": "male", "age_category": "teen", "vocal_traits": ["cold", "intense", "brooding"], "pitch_range": "mid-low", }, "Ichigo Kurosaki": { "gender": "male", "age_category": "teen", "vocal_traits": ["intense", "brash", "determined"], "pitch_range": "mid-low", }, "Ichigo": { "gender": "male", "age_category": "teen", "vocal_traits": ["intense", "brash", "determined"], "pitch_range": "mid-low", }, "Eren Yeager": { "gender": "male", "age_category": "teen", "vocal_traits": ["passionate", "angry", "intense"], "pitch_range": "mid", }, "Eren": { "gender": "male", "age_category": "teen", "vocal_traits": ["passionate", "angry", "intense"], "pitch_range": "mid", }, "Tanjiro Kamado": { "gender": "male", "age_category": "teen", "vocal_traits": ["gentle", "determined", "earnest"], "pitch_range": "mid", }, "Tanjiro": { "gender": "male", "age_category": "teen", "vocal_traits": ["gentle", "determined", "earnest"], "pitch_range": "mid", }, "Light Yagami": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["cold", "calculating", "confident"], "pitch_range": "mid-low", }, "Light": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["cold", "calculating", "confident"], "pitch_range": "mid-low", }, "Goku": { "gender": "male", "age_category": "adult", "vocal_traits": ["cheerful", "energetic", "simple"], "pitch_range": "mid", }, "Vegeta": { "gender": "male", "age_category": "adult", "vocal_traits": ["proud", "intense", "arrogant"], "pitch_range": "mid-low", }, "Levi Ackerman": { "gender": "male", "age_category": "adult", "vocal_traits": ["cold", "calm", "authoritative"], "pitch_range": "low", }, "Levi": { "gender": "male", "age_category": "adult", "vocal_traits": ["cold", "calm", "authoritative"], "pitch_range": "low", }, "L": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["calm", "quirky", "soft"], "pitch_range": "mid", }, "L Lawliet": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["calm", "quirky", "soft"], "pitch_range": "mid", }, "Izuku Midoriya": { "gender": "male", "age_category": "teen", "vocal_traits": ["nervous", "earnest", "determined"], "pitch_range": "mid-high", }, "Deku": { "gender": "male", "age_category": "teen", "vocal_traits": ["nervous", "earnest", "determined"], "pitch_range": "mid-high", }, "Kirito": { "gender": "male", "age_category": "teen", "vocal_traits": ["calm", "determined", "gentle"], "pitch_range": "mid", }, "Kaneki Ken": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["soft", "melancholic", "intense"], "pitch_range": "mid", }, "Kaneki": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["soft", "melancholic", "intense"], "pitch_range": "mid", }, "Gon Freecss": { "gender": "male", "age_category": "child", "vocal_traits": ["cheerful", "energetic", "innocent"], "pitch_range": "high", }, "Gon": { "gender": "male", "age_category": "child", "vocal_traits": ["cheerful", "energetic", "innocent"], "pitch_range": "high", }, "Killua Zoldyck": { "gender": "male", "age_category": "child", "vocal_traits": ["cool", "playful", "intense"], "pitch_range": "mid-high", }, "Killua": { "gender": "male", "age_category": "child", "vocal_traits": ["cool", "playful", "intense"], "pitch_range": "mid-high", }, "Edward Elric": { "gender": "male", "age_category": "teen", "vocal_traits": ["brash", "passionate", "determined"], "pitch_range": "mid", }, "Lelouch": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["commanding", "calculating", "dramatic"], "pitch_range": "mid-low", }, "Lelouch Lamperouge": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["commanding", "calculating", "dramatic"], "pitch_range": "mid-low", }, "Spike Spiegel": { "gender": "male", "age_category": "adult", "vocal_traits": ["cool", "laid-back", "witty"], "pitch_range": "mid-low", }, "Spike": { "gender": "male", "age_category": "adult", "vocal_traits": ["cool", "laid-back", "witty"], "pitch_range": "mid-low", }, "Luffy": { "gender": "male", "age_category": "teen", "vocal_traits": ["cheerful", "energetic", "carefree"], "pitch_range": "mid-high", }, "Monkey D. Luffy": { "gender": "male", "age_category": "teen", "vocal_traits": ["cheerful", "energetic", "carefree"], "pitch_range": "mid-high", }, "Bakugo": { "gender": "male", "age_category": "teen", "vocal_traits": ["aggressive", "loud", "angry"], "pitch_range": "mid", }, "Katsuki Bakugo": { "gender": "male", "age_category": "teen", "vocal_traits": ["aggressive", "loud", "angry"], "pitch_range": "mid", }, "Todoroki": { "gender": "male", "age_category": "teen", "vocal_traits": ["calm", "cold", "reserved"], "pitch_range": "mid-low", }, "Shoto Todoroki": { "gender": "male", "age_category": "teen", "vocal_traits": ["calm", "cold", "reserved"], "pitch_range": "mid-low", }, # --- Female characters --- "Frieren": { "gender": "female", "age_category": "adult", "vocal_traits": ["calm", "ethereal", "soft"], "pitch_range": "mid", }, "Mikasa Ackerman": { "gender": "female", "age_category": "teen", "vocal_traits": ["calm", "determined", "quiet"], "pitch_range": "mid-low", }, "Mikasa": { "gender": "female", "age_category": "teen", "vocal_traits": ["calm", "determined", "quiet"], "pitch_range": "mid-low", }, "Nezuko Kamado": { "gender": "female", "age_category": "teen", "vocal_traits": ["gentle", "soft", "innocent"], "pitch_range": "mid-high", }, "Nezuko": { "gender": "female", "age_category": "teen", "vocal_traits": ["gentle", "soft", "innocent"], "pitch_range": "mid-high", }, "Asuna": { "gender": "female", "age_category": "teen", "vocal_traits": ["gentle", "determined", "warm"], "pitch_range": "mid", }, "Asuna Yuuki": { "gender": "female", "age_category": "teen", "vocal_traits": ["gentle", "determined", "warm"], "pitch_range": "mid", }, "Sakura Haruno": { "gender": "female", "age_category": "teen", "vocal_traits": ["energetic", "emotional", "determined"], "pitch_range": "mid-high", }, "Sakura": { "gender": "female", "age_category": "teen", "vocal_traits": ["energetic", "emotional", "determined"], "pitch_range": "mid-high", }, "Hinata Hyuga": { "gender": "female", "age_category": "teen", "vocal_traits": ["shy", "soft", "gentle"], "pitch_range": "mid-high", }, "Hinata": { "gender": "female", "age_category": "teen", "vocal_traits": ["shy", "soft", "gentle"], "pitch_range": "mid-high", }, "Rukia Kuchiki": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["assertive", "serious", "caring"], "pitch_range": "mid", }, "Rukia": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["assertive", "serious", "caring"], "pitch_range": "mid", }, "Ochako Uraraka": { "gender": "female", "age_category": "teen", "vocal_traits": ["cheerful", "energetic", "warm"], "pitch_range": "mid-high", }, "Ochako": { "gender": "female", "age_category": "teen", "vocal_traits": ["cheerful", "energetic", "warm"], "pitch_range": "mid-high", }, "Misa Amane": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["bubbly", "dramatic", "cute"], "pitch_range": "high", }, "Misa": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["bubbly", "dramatic", "cute"], "pitch_range": "high", }, "Winry Rockbell": { "gender": "female", "age_category": "teen", "vocal_traits": ["fiery", "caring", "determined"], "pitch_range": "mid", }, "Fern": { "gender": "female", "age_category": "teen", "vocal_traits": ["serious", "diligent", "quiet"], "pitch_range": "mid", }, "Himmel": { "gender": "male", "age_category": "young_adult", "vocal_traits": ["heroic", "warm", "confident"], "pitch_range": "mid", }, # --- AnimeVox dataset characters (all female) --- "Rin Tohsaka": { "gender": "female", "age_category": "teen", "vocal_traits": ["assertive", "confident", "sharp"], "pitch_range": "mid", }, "Marin Kitagawa": { "gender": "female", "age_category": "teen", "vocal_traits": ["cheerful", "bubbly", "energetic"], "pitch_range": "mid-high", }, "Emilia": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["gentle", "kind", "earnest"], "pitch_range": "mid-high", }, "Kurisu Makise": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["sharp", "intellectual", "tsundere"], "pitch_range": "mid", }, "Megumin": { "gender": "female", "age_category": "teen", "vocal_traits": ["dramatic", "energetic", "theatrical"], "pitch_range": "mid-high", }, "Momo Ayase": { "gender": "female", "age_category": "teen", "vocal_traits": ["brash", "energetic", "bold"], "pitch_range": "mid", }, "Mai Sakurajima": { "gender": "female", "age_category": "teen", "vocal_traits": ["calm", "cool", "witty"], "pitch_range": "mid", }, "Madoka Kaname": { "gender": "female", "age_category": "teen", "vocal_traits": ["gentle", "innocent", "soft"], "pitch_range": "high", }, "Rem": { "gender": "female", "age_category": "teen", "vocal_traits": ["devoted", "gentle", "intense"], "pitch_range": "mid-high", }, "Saber": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["noble", "commanding", "determined"], "pitch_range": "mid", }, "Homura Akemi": { "gender": "female", "age_category": "teen", "vocal_traits": ["cold", "mysterious", "intense"], "pitch_range": "mid-low", }, "Makima": { "gender": "female", "age_category": "adult", "vocal_traits": ["cold", "calm", "manipulative"], "pitch_range": "mid-low", }, "Shiro": { "gender": "female", "age_category": "child", "vocal_traits": ["calm", "monotone", "intellectual"], "pitch_range": "mid", }, "Lucy": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["cool", "reserved", "melancholic"], "pitch_range": "mid", }, "Power": { "gender": "female", "age_category": "young_adult", "vocal_traits": ["loud", "brash", "chaotic"], "pitch_range": "mid-high", }, } def _sanitize_name(name: str) -> str: """Convert character name to filesystem-safe ID.""" return name.lower().replace(" ", "_").replace(".", "").replace("'", "") def _get_traits(character_name: str) -> dict: """Look up traits from known characters, or return defaults.""" if character_name in KNOWN_CHARACTERS: return KNOWN_CHARACTERS[character_name] # Try partial match (first name) first_name = character_name.split()[0] if " " in character_name else character_name if first_name in KNOWN_CHARACTERS: return KNOWN_CHARACTERS[first_name] # Default — will need manual correction return { "gender": "unknown", "age_category": "young_adult", "vocal_traits": ["neutral"], "pitch_range": "mid", } def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]: """Decode audio bytes (WAV/FLAC/etc) to numpy array + sample rate.""" import soundfile as sf import io data, sr = sf.read(io.BytesIO(audio_bytes)) return data.astype(np.float32), sr def main(): print("=" * 60) print("AnimeVox Setup — Voice Reference Library") print("=" * 60) try: from huggingface_hub import hf_hub_download import pyarrow.parquet as pq except ImportError: print("\nERROR: 'huggingface-hub' or 'pyarrow' not installed.") print("Run: pip install huggingface-hub pyarrow") sys.exit(1) try: import soundfile as sf except ImportError: print("\nERROR: 'soundfile' not installed.") print("Run: pip install soundfile") sys.exit(1) try: import scipy.io.wavfile except ImportError: print("\nERROR: 'scipy' not installed.") print("Run: pip install scipy") sys.exit(1) voices_dir = PROJECT_ROOT / "data" / "animevox_voices" catalog_path = PROJECT_ROOT / "data" / "animevox_catalog.json" # Check if already set up if catalog_path.exists(): with open(catalog_path) as f: existing = json.load(f) print(f"\nCatalog already exists with {len(existing)} characters.") resp = input("Re-run setup? (y/N): ").strip().lower() if resp != "y": print("Skipped.") return # Step 1: Download parquet files directly (avoids torchcodec dependency) PARQUET_FILES = [f"data/train-{i:05d}-of-00007.parquet" for i in range(7)] print(f"\n[1/5] Downloading AnimeVox parquet files ({len(PARQUET_FILES)} shards)...") t0 = time.time() all_rows = [] # list of (char_name, anime, audio_bytes, transcription) for shard in PARQUET_FILES: print(f" Downloading {shard}...", end=" ", flush=True) path = hf_hub_download("taresh18/AnimeVox", shard, repo_type="dataset") table = pq.read_table(path, columns=["audio", "character_name", "anime", "transcription"]) for i in range(len(table)): audio_entry = table.column("audio")[i].as_py() all_rows.append({ "character_name": table.column("character_name")[i].as_py(), "anime": table.column("anime")[i].as_py(), "audio_bytes": audio_entry["bytes"], "transcription": table.column("transcription")[i].as_py(), }) print(f"{len(table)} rows") print(f" Total: {len(all_rows)} clips in {time.time() - t0:.1f}s") # Step 2: Discover characters print("\n[2/5] Discovering characters...") char_clips: dict[str, list[int]] = {} char_anime: dict[str, str] = {} for idx, row in enumerate(all_rows): name = row["character_name"] if name not in char_clips: char_clips[name] = [] char_anime[name] = row["anime"] char_clips[name].append(idx) print(f" Found {len(char_clips)} characters:") for name, clips in sorted(char_clips.items(), key=lambda x: -len(x[1])): print(f" {name} ({char_anime[name]}): {len(clips)} clips") # Step 3: Select best reference clip per character print("\n[3/5] Selecting best reference clips (10-20s, clean audio)...") voices_dir.mkdir(parents=True, exist_ok=True) import random selected_refs = {} for name, clip_indices in char_clips.items(): char_id = _sanitize_name(name) char_dir = voices_dir / char_id char_dir.mkdir(parents=True, exist_ok=True) ref_path = char_dir / "reference.wav" # Sample up to 80 clips to find a good reference sample_indices = clip_indices if len(clip_indices) <= 80 else random.sample(clip_indices, 80) best_idx = None best_duration = 0.0 best_score = -1 print(f" {name}: scanning {len(sample_indices)}/{len(clip_indices)} clips...", end=" ", flush=True) for idx in sample_indices: try: audio_bytes = all_rows[idx]["audio_bytes"] samples, sr = _decode_audio(audio_bytes) duration = len(samples) / sr # Score: prefer 10-20s clips, penalize very short or very long if 15 <= duration <= 20: score = 100 elif 10 <= duration <= 25: score = 80 elif 5 <= duration <= 30: score = 60 else: score = max(0, 40 - abs(duration - 15)) # Penalize quiet clips rms = np.sqrt(np.mean(samples ** 2)) if rms < 0.01: score -= 20 if score > best_score or (score == best_score and duration > best_duration): best_score = score best_duration = duration best_idx = idx if best_score >= 100: break except Exception as e: continue if best_idx is not None: samples, sr = _decode_audio(all_rows[best_idx]["audio_bytes"]) # Write as int16 WAV samples_int16 = np.clip(samples * 32767, -32768, 32767).astype(np.int16) scipy.io.wavfile.write(str(ref_path), sr, samples_int16) selected_refs[name] = { "path": str(ref_path), "duration": len(samples) / sr, "transcript": all_rows[best_idx].get("transcription", ""), } print(f"{len(samples)/sr:.1f}s saved (score={best_score})") else: print("NO SUITABLE CLIP FOUND") # Free memory — parquet data no longer needed del all_rows # Step 4: Encode voice states with Pocket TTS print("\n[4/5] Encoding voice states with Pocket TTS...") print(" (Loading model — ~2GB RAM, first run downloads weights)") try: from pocket_tts import TTSModel from pocket_tts.models.tts_model import export_model_state as _export_state except ImportError: print("\nERROR: 'pocket-tts' not installed.") print("Run: pip install pocket-tts") sys.exit(1) t0 = time.time() tts_model = TTSModel.load_model() print(f" Model loaded in {time.time() - t0:.1f}s") # Check if voice cloning is available by testing with a reference voice_cloning_available = False if selected_refs: first_ref = next(iter(selected_refs.values())) try: test_state = tts_model.get_state_for_audio_prompt(first_ref["path"]) voice_cloning_available = True print(" Voice cloning: AVAILABLE") except Exception as e: if "voice cloning" in str(e).lower(): print(" Voice cloning: NOT AVAILABLE (gated model)") print(" To enable: accept terms at https://huggingface.co/kyutai/pocket-tts") print(" then run: huggingface-cli login") print(" Using PRESET voices as fallback...") else: print(f" Voice cloning test failed: {e}") # Encode voice states encoded_count = 0 if voice_cloning_available: for name, ref_info in selected_refs.items(): char_id = _sanitize_name(name) char_dir = voices_dir / char_id state_path = char_dir / "voice_state.safetensors" try: t0 = time.time() voice_state = tts_model.get_state_for_audio_prompt(ref_info["path"]) _export_state(voice_state, str(state_path)) elapsed = time.time() - t0 print(f" {name}: voice state encoded in {elapsed:.1f}s") encoded_count += 1 except Exception as e: print(f" {name}: FAILED — {e}") else: # Fallback: assign preset voice names (no .safetensors export needed) # Pocket TTS non-cloning model supports these built-in voices: PRESET_VOICES_FEMALE = ["alba", "fantine", "cosette", "eponine", "azelma"] PRESET_VOICES_MALE = ["marius", "javert", "jean"] print("\n Assigning preset voices as fallback...") female_idx = 0 for name in sorted(selected_refs.keys()): char_id = _sanitize_name(name) char_dir = voices_dir / char_id traits = _get_traits(name) gender = traits.get("gender", "female") if gender == "male": preset = PRESET_VOICES_MALE[encoded_count % len(PRESET_VOICES_MALE)] else: preset = PRESET_VOICES_FEMALE[female_idx % len(PRESET_VOICES_FEMALE)] female_idx += 1 # Write preset name to a marker file (no .safetensors needed) preset_path = char_dir / "preset_voice.txt" with open(preset_path, "w") as f: f.write(preset) # Verify it works try: voice_state = tts_model.get_state_for_audio_prompt(preset) print(f" {name}: preset '{preset}' verified OK") encoded_count += 1 except Exception as e: print(f" {name}: preset '{preset}' FAILED — {e}") print(f"\n {encoded_count}/{len(selected_refs)} voice states encoded") # Step 5: Build catalog print("\n[5/5] Building voice catalog...") catalog = [] for name in sorted(char_clips.keys()): char_id = _sanitize_name(name) char_dir = voices_dir / char_id ref_path = char_dir / "reference.wav" state_path = char_dir / "voice_state.safetensors" preset_path = char_dir / "preset_voice.txt" traits = _get_traits(name) # Determine voice source: .safetensors (cloned) or preset name voice_state = None preset_voice = None if state_path.exists(): voice_state = str(state_path) elif preset_path.exists(): preset_voice = preset_path.read_text().strip() entry = { "id": char_id, "display_name": name, "source_anime": char_anime.get(name, "Unknown"), "gender": traits["gender"], "age_category": traits["age_category"], "vocal_traits": traits["vocal_traits"], "pitch_range": traits["pitch_range"], "clip_count": len(char_clips[name]), "reference_path": str(ref_path) if ref_path.exists() else None, "voice_state_path": voice_state, "preset_voice": preset_voice, } catalog.append(entry) if voice_state: status = "ready (cloned)" elif preset_voice: status = f"ready (preset: {preset_voice})" elif ref_path.exists(): status = "ref only" else: status = "MISSING" traits_flag = "" if traits["gender"] != "unknown" else " [NEEDS MANUAL TAGGING]" print(f" {name}: {status}{traits_flag}") with open(catalog_path, "w") as f: json.dump(catalog, f, indent=2) print(f"\nCatalog saved to {catalog_path}") print(f"Total: {len(catalog)} characters, {encoded_count} with voice states") # Check for untagged characters untagged = [e for e in catalog if e["gender"] == "unknown"] if untagged: print(f"\nWARNING: {len(untagged)} characters need manual trait tagging:") for e in untagged: print(f" - {e['display_name']} ({e['source_anime']})") print(f"\nEdit {catalog_path} to add gender/age_category/vocal_traits for these characters.") print("\nSetup complete!") if __name__ == "__main__": main()