Spaces:
Sleeping
Sleeping
"""One-time setup: download the AnimeVox dataset, extract reference clips, encode voice states.

Usage:
    python scripts/setup_animevox.py

This script:
1. Downloads the AnimeVox dataset from HuggingFace (11,020 clips, 19 characters)
2. For each character, selects the best 15-20s reference clip
3. Exports reference WAV files to data/animevox_voices/{character}/reference.wav
4. Pre-encodes Pocket TTS voice states (.safetensors) for instant loading
5. Saves data/animevox_catalog.json with tagged character metadata
"""
| import json | |
| import os | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import numpy as np | |
# Put the project root (the directory above this script's folder) at the
# front of the import path.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))
# Known anime character metadata for trait tagging.
# Characters not in this map get auto-tagged with defaults.
#
# Each row is (names, gender, age_category, vocal_traits, pitch_range).
# Every name in a row (full name, short name, nickname) is registered as its
# own key with an equal-valued profile, in the order listed here.
_CHARACTER_PROFILES = (
    # --- Male characters ---
    (("Naruto Uzumaki", "Naruto"), "male", "teen",
     ("energetic", "brash", "determined"), "mid"),
    (("Sasuke Uchiha", "Sasuke"), "male", "teen",
     ("cold", "intense", "brooding"), "mid-low"),
    (("Ichigo Kurosaki", "Ichigo"), "male", "teen",
     ("intense", "brash", "determined"), "mid-low"),
    (("Eren Yeager", "Eren"), "male", "teen",
     ("passionate", "angry", "intense"), "mid"),
    (("Tanjiro Kamado", "Tanjiro"), "male", "teen",
     ("gentle", "determined", "earnest"), "mid"),
    (("Light Yagami", "Light"), "male", "young_adult",
     ("cold", "calculating", "confident"), "mid-low"),
    (("Goku",), "male", "adult",
     ("cheerful", "energetic", "simple"), "mid"),
    (("Vegeta",), "male", "adult",
     ("proud", "intense", "arrogant"), "mid-low"),
    (("Levi Ackerman", "Levi"), "male", "adult",
     ("cold", "calm", "authoritative"), "low"),
    (("L", "L Lawliet"), "male", "young_adult",
     ("calm", "quirky", "soft"), "mid"),
    (("Izuku Midoriya", "Deku"), "male", "teen",
     ("nervous", "earnest", "determined"), "mid-high"),
    (("Kirito",), "male", "teen",
     ("calm", "determined", "gentle"), "mid"),
    (("Kaneki Ken", "Kaneki"), "male", "young_adult",
     ("soft", "melancholic", "intense"), "mid"),
    (("Gon Freecss", "Gon"), "male", "child",
     ("cheerful", "energetic", "innocent"), "high"),
    (("Killua Zoldyck", "Killua"), "male", "child",
     ("cool", "playful", "intense"), "mid-high"),
    (("Edward Elric",), "male", "teen",
     ("brash", "passionate", "determined"), "mid"),
    (("Lelouch", "Lelouch Lamperouge"), "male", "young_adult",
     ("commanding", "calculating", "dramatic"), "mid-low"),
    (("Spike Spiegel", "Spike"), "male", "adult",
     ("cool", "laid-back", "witty"), "mid-low"),
    (("Luffy", "Monkey D. Luffy"), "male", "teen",
     ("cheerful", "energetic", "carefree"), "mid-high"),
    (("Bakugo", "Katsuki Bakugo"), "male", "teen",
     ("aggressive", "loud", "angry"), "mid"),
    (("Todoroki", "Shoto Todoroki"), "male", "teen",
     ("calm", "cold", "reserved"), "mid-low"),
    # --- Female characters ---
    (("Frieren",), "female", "adult",
     ("calm", "ethereal", "soft"), "mid"),
    (("Mikasa Ackerman", "Mikasa"), "female", "teen",
     ("calm", "determined", "quiet"), "mid-low"),
    (("Nezuko Kamado", "Nezuko"), "female", "teen",
     ("gentle", "soft", "innocent"), "mid-high"),
    (("Asuna", "Asuna Yuuki"), "female", "teen",
     ("gentle", "determined", "warm"), "mid"),
    (("Sakura Haruno", "Sakura"), "female", "teen",
     ("energetic", "emotional", "determined"), "mid-high"),
    (("Hinata Hyuga", "Hinata"), "female", "teen",
     ("shy", "soft", "gentle"), "mid-high"),
    (("Rukia Kuchiki", "Rukia"), "female", "young_adult",
     ("assertive", "serious", "caring"), "mid"),
    (("Ochako Uraraka", "Ochako"), "female", "teen",
     ("cheerful", "energetic", "warm"), "mid-high"),
    (("Misa Amane", "Misa"), "female", "young_adult",
     ("bubbly", "dramatic", "cute"), "high"),
    (("Winry Rockbell",), "female", "teen",
     ("fiery", "caring", "determined"), "mid"),
    (("Fern",), "female", "teen",
     ("serious", "diligent", "quiet"), "mid"),
    # Himmel is tagged male; the row stays here to keep the key order stable.
    (("Himmel",), "male", "young_adult",
     ("heroic", "warm", "confident"), "mid"),
    # --- AnimeVox dataset characters (all female) ---
    (("Rin Tohsaka",), "female", "teen",
     ("assertive", "confident", "sharp"), "mid"),
    (("Marin Kitagawa",), "female", "teen",
     ("cheerful", "bubbly", "energetic"), "mid-high"),
    (("Emilia",), "female", "young_adult",
     ("gentle", "kind", "earnest"), "mid-high"),
    (("Kurisu Makise",), "female", "young_adult",
     ("sharp", "intellectual", "tsundere"), "mid"),
    (("Megumin",), "female", "teen",
     ("dramatic", "energetic", "theatrical"), "mid-high"),
    (("Momo Ayase",), "female", "teen",
     ("brash", "energetic", "bold"), "mid"),
    (("Mai Sakurajima",), "female", "teen",
     ("calm", "cool", "witty"), "mid"),
    (("Madoka Kaname",), "female", "teen",
     ("gentle", "innocent", "soft"), "high"),
    (("Rem",), "female", "teen",
     ("devoted", "gentle", "intense"), "mid-high"),
    (("Saber",), "female", "young_adult",
     ("noble", "commanding", "determined"), "mid"),
    (("Homura Akemi",), "female", "teen",
     ("cold", "mysterious", "intense"), "mid-low"),
    (("Makima",), "female", "adult",
     ("cold", "calm", "manipulative"), "mid-low"),
    (("Shiro",), "female", "child",
     ("calm", "monotone", "intellectual"), "mid"),
    (("Lucy",), "female", "young_adult",
     ("cool", "reserved", "melancholic"), "mid"),
    (("Power",), "female", "young_adult",
     ("loud", "brash", "chaotic"), "mid-high"),
)

# Expand the table into name -> profile. A fresh dict and a fresh list are
# built per name so no two keys share mutable state.
KNOWN_CHARACTERS = {
    alias: {
        "gender": gender,
        "age_category": age_category,
        "vocal_traits": list(vocal_traits),
        "pitch_range": pitch_range,
    }
    for aliases, gender, age_category, vocal_traits, pitch_range in _CHARACTER_PROFILES
    for alias in aliases
}
def _sanitize_name(name: str) -> str:
    """Derive a directory-friendly identifier from a character name.

    The name is lower-cased, each space becomes an underscore, and periods
    and apostrophes are removed. Every other character passes through as-is.
    """
    # Single translation pass: " " -> "_", drop "." and "'".
    replacements = str.maketrans(" ", "_", ".'")
    lowered = name.lower()
    return lowered.translate(replacements)
def _get_traits(character_name: str) -> dict:
    """Look up trait metadata for a character, falling back to defaults.

    Lookup order:
      1. exact match of ``character_name`` in ``KNOWN_CHARACTERS``;
      2. match on the first whitespace-separated word of the name;
      3. a generic default profile whose ``gender`` is ``"unknown"``.

    Args:
        character_name: Character name to look up.

    Returns:
        A dict with the keys ``gender``, ``age_category``, ``vocal_traits``
        and ``pitch_range``. For a known character this is the entry stored
        in ``KNOWN_CHARACTERS`` itself (not a copy), so treat it as
        read-only. The default profile is a fresh dict on every call.
    """
    exact = KNOWN_CHARACTERS.get(character_name)
    if exact is not None:
        return exact

    # Try partial match (first name). str.split() returns [] for an empty or
    # whitespace-only name, so guard before indexing: such names get the
    # default profile instead of raising IndexError.
    words = character_name.split()
    if words:
        by_first_name = KNOWN_CHARACTERS.get(words[0])
        if by_first_name is not None:
            return by_first_name

    # Default — will need manual correction
    return {
        "gender": "unknown",
        "age_category": "young_adult",
        "vocal_traits": ["neutral"],
        "pitch_range": "mid",
    }
def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Decode an in-memory audio file into float32 samples.

    Args:
        audio_bytes: Raw bytes of an encoded audio file (WAV/FLAC/etc.) in a
            format ``soundfile`` can read.

    Returns:
        ``(samples, sample_rate)``: the array returned by ``soundfile.read``
        cast to ``np.float32``, and the sample rate it reported.
    """
    # Imported inside the function rather than at module level.
    import io
    import soundfile as sf

    stream = io.BytesIO(audio_bytes)
    decoded, sample_rate = sf.read(stream)
    samples = decoded.astype(np.float32)
    return samples, sample_rate
def _score_reference_clip(samples: np.ndarray, sample_rate: int) -> tuple[float, float]:
    """Rate how suitable a decoded clip is as a voice reference.

    Args:
        samples: Decoded audio samples; ``len(samples)`` is used as the
            frame count.
        sample_rate: Samples per second.

    Returns:
        ``(score, duration_seconds)``. Clips of 15-20s score highest (100),
        then 10-25s (80), then 5-30s (60); anything else scores lower the
        further it is from 15s. Quiet clips (RMS below 0.01) lose 20 points.
    """
    duration = len(samples) / sample_rate
    if 15 <= duration <= 20:
        score = 100
    elif 10 <= duration <= 25:
        score = 80
    elif 5 <= duration <= 30:
        score = 60
    else:
        score = max(0, 40 - abs(duration - 15))
    # Penalize quiet clips
    rms = np.sqrt(np.mean(samples ** 2))
    if rms < 0.01:
        score -= 20
    return score, duration


def main():
    """Run the one-time AnimeVox setup pipeline.

    Steps (mirrored by the ``[n/5]`` progress output):
      1. Download the dataset's parquet shards and load every row into memory.
      2. Group clip indices by character name.
      3. Pick one reference clip per character and write it as a 16-bit WAV.
      4. Encode a Pocket TTS voice state per character, or, if the cloning
         check fails, assign and verify a built-in preset voice instead.
      5. Write the catalog JSON and list characters that still need tagging.

    Exits with status 1 when a required package cannot be imported. If a
    catalog already exists, asks for confirmation before re-running.
    """
    print("=" * 60)
    print("AnimeVox Setup — Voice Reference Library")
    print("=" * 60)

    # Heavy / optional dependencies are imported here so that a missing
    # package produces an install hint instead of a traceback.
    try:
        from huggingface_hub import hf_hub_download
        import pyarrow.parquet as pq
    except ImportError:
        print("\nERROR: 'huggingface-hub' or 'pyarrow' not installed.")
        print("Run: pip install huggingface-hub pyarrow")
        sys.exit(1)
    try:
        # Only checked for availability here; _decode_audio imports it itself.
        import soundfile  # noqa: F401
    except ImportError:
        print("\nERROR: 'soundfile' not installed.")
        print("Run: pip install soundfile")
        sys.exit(1)
    try:
        import scipy.io.wavfile
    except ImportError:
        print("\nERROR: 'scipy' not installed.")
        print("Run: pip install scipy")
        sys.exit(1)

    voices_dir = PROJECT_ROOT / "data" / "animevox_voices"
    catalog_path = PROJECT_ROOT / "data" / "animevox_catalog.json"

    # Check if already set up
    if catalog_path.exists():
        with open(catalog_path, encoding="utf-8") as f:
            existing = json.load(f)
        print(f"\nCatalog already exists with {len(existing)} characters.")
        resp = input("Re-run setup? (y/N): ").strip().lower()
        if resp != "y":
            print("Skipped.")
            return

    # Step 1: Download parquet files directly (avoids torchcodec dependency)
    parquet_files = [f"data/train-{i:05d}-of-00007.parquet" for i in range(7)]
    print(f"\n[1/5] Downloading AnimeVox parquet files ({len(parquet_files)} shards)...")
    t0 = time.time()
    all_rows = []  # one dict per clip: character_name, anime, audio_bytes, transcription
    for shard in parquet_files:
        print(f" Downloading {shard}...", end=" ", flush=True)
        path = hf_hub_download("taresh18/AnimeVox", shard, repo_type="dataset")
        table = pq.read_table(path, columns=["audio", "character_name", "anime", "transcription"])
        # Convert each column once and walk them in lockstep instead of
        # indexing every column for every row.
        columns = (
            table.column(col).to_pylist()
            for col in ("character_name", "anime", "audio", "transcription")
        )
        for character_name, anime, audio_entry, transcription in zip(*columns):
            all_rows.append({
                "character_name": character_name,
                "anime": anime,
                "audio_bytes": audio_entry["bytes"],
                "transcription": transcription,
            })
        print(f"{len(table)} rows")
    print(f" Total: {len(all_rows)} clips in {time.time() - t0:.1f}s")

    # Step 2: Discover characters
    print("\n[2/5] Discovering characters...")
    char_clips: dict[str, list[int]] = {}
    char_anime: dict[str, str] = {}
    for idx, row in enumerate(all_rows):
        name = row["character_name"]
        if name not in char_clips:
            char_clips[name] = []
            # The anime of a character's first clip is recorded as its source.
            char_anime[name] = row["anime"]
        char_clips[name].append(idx)
    print(f" Found {len(char_clips)} characters:")
    for name, clips in sorted(char_clips.items(), key=lambda x: -len(x[1])):
        print(f" {name} ({char_anime[name]}): {len(clips)} clips")

    # Step 3: Select best reference clip per character
    print("\n[3/5] Selecting best reference clips (10-20s, clean audio)...")
    voices_dir.mkdir(parents=True, exist_ok=True)
    import random
    selected_refs = {}
    for name, clip_indices in char_clips.items():
        char_id = _sanitize_name(name)
        char_dir = voices_dir / char_id
        char_dir.mkdir(parents=True, exist_ok=True)
        ref_path = char_dir / "reference.wav"

        # Sample up to 80 clips to find a good reference
        sample_indices = clip_indices if len(clip_indices) <= 80 else random.sample(clip_indices, 80)
        best_idx = None
        best_clip = None  # (samples, sample_rate) of the current best candidate
        best_duration = 0.0
        best_score = -1
        failed_clips = 0
        print(f" {name}: scanning {len(sample_indices)}/{len(clip_indices)} clips...", end=" ", flush=True)
        for idx in sample_indices:
            try:
                samples, sr = _decode_audio(all_rows[idx]["audio_bytes"])
                score, duration = _score_reference_clip(samples, sr)
            except Exception:
                # Best effort: one unreadable clip must not abort the scan,
                # but count it so the failure is visible in the output.
                failed_clips += 1
                continue
            # Higher score wins; on a tie the longer clip wins.
            if score > best_score or (score == best_score and duration > best_duration):
                best_score = score
                best_duration = duration
                best_idx = idx
                best_clip = (samples, sr)
                if best_score >= 100:
                    # Top score reached; no need to look further.
                    break

        if best_idx is not None:
            # Reuse the already-decoded samples rather than decoding again.
            samples, sr = best_clip
            # Write as int16 WAV
            samples_int16 = np.clip(samples * 32767, -32768, 32767).astype(np.int16)
            scipy.io.wavfile.write(str(ref_path), sr, samples_int16)
            selected_refs[name] = {
                "path": str(ref_path),
                "duration": len(samples) / sr,
                "transcript": all_rows[best_idx].get("transcription", ""),
            }
            print(f"{len(samples)/sr:.1f}s saved (score={best_score})")
        else:
            print("NO SUITABLE CLIP FOUND")
        if failed_clips:
            print(f" {name}: {failed_clips} clip(s) could not be decoded or scored and were skipped")

    # Free memory — parquet data no longer needed
    del all_rows

    # Step 4: Encode voice states with Pocket TTS
    print("\n[4/5] Encoding voice states with Pocket TTS...")
    print(" (Loading model — ~2GB RAM, first run downloads weights)")
    try:
        from pocket_tts import TTSModel
        from pocket_tts.models.tts_model import export_model_state as _export_state
    except ImportError:
        print("\nERROR: 'pocket-tts' not installed.")
        print("Run: pip install pocket-tts")
        sys.exit(1)
    t0 = time.time()
    tts_model = TTSModel.load_model()
    print(f" Model loaded in {time.time() - t0:.1f}s")

    # Check if voice cloning is available by testing with a reference
    voice_cloning_available = False
    if selected_refs:
        first_ref = next(iter(selected_refs.values()))
        try:
            # Only success/failure matters here; the state is re-encoded below.
            tts_model.get_state_for_audio_prompt(first_ref["path"])
            voice_cloning_available = True
            print(" Voice cloning: AVAILABLE")
        except Exception as e:
            if "voice cloning" in str(e).lower():
                print(" Voice cloning: NOT AVAILABLE (gated model)")
                print(" To enable: accept terms at https://huggingface.co/kyutai/pocket-tts")
                print(" then run: huggingface-cli login")
                print(" Using PRESET voices as fallback...")
            else:
                print(f" Voice cloning test failed: {e}")

    # Encode voice states
    encoded_count = 0
    if voice_cloning_available:
        for name, ref_info in selected_refs.items():
            char_id = _sanitize_name(name)
            char_dir = voices_dir / char_id
            state_path = char_dir / "voice_state.safetensors"
            try:
                t0 = time.time()
                voice_state = tts_model.get_state_for_audio_prompt(ref_info["path"])
                _export_state(voice_state, str(state_path))
                elapsed = time.time() - t0
                print(f" {name}: voice state encoded in {elapsed:.1f}s")
                encoded_count += 1
            except Exception as e:
                print(f" {name}: FAILED — {e}")
    else:
        # Fallback: assign preset voice names (no .safetensors export needed)
        # Pocket TTS non-cloning model supports these built-in voices:
        preset_voices_female = ["alba", "fantine", "cosette", "eponine", "azelma"]
        preset_voices_male = ["marius", "javert", "jean"]
        print("\n Assigning preset voices as fallback...")
        # Each pool rotates on its own counter so assignment depends only on
        # the (sorted) character order, not on how many presets verified OK.
        female_idx = 0
        male_idx = 0
        for name in sorted(selected_refs.keys()):
            char_id = _sanitize_name(name)
            char_dir = voices_dir / char_id
            traits = _get_traits(name)
            gender = traits.get("gender", "female")
            if gender == "male":
                preset = preset_voices_male[male_idx % len(preset_voices_male)]
                male_idx += 1
            else:
                # Any non-male gender (including "unknown") uses the female pool.
                preset = preset_voices_female[female_idx % len(preset_voices_female)]
                female_idx += 1
            # Write preset name to a marker file (no .safetensors needed)
            preset_path = char_dir / "preset_voice.txt"
            with open(preset_path, "w", encoding="utf-8") as f:
                f.write(preset)
            # Verify it works
            try:
                tts_model.get_state_for_audio_prompt(preset)
                print(f" {name}: preset '{preset}' verified OK")
                encoded_count += 1
            except Exception as e:
                print(f" {name}: preset '{preset}' FAILED — {e}")
    print(f"\n {encoded_count}/{len(selected_refs)} voice states encoded")

    # Step 5: Build catalog
    print("\n[5/5] Building voice catalog...")
    catalog = []
    for name in sorted(char_clips.keys()):
        char_id = _sanitize_name(name)
        char_dir = voices_dir / char_id
        ref_path = char_dir / "reference.wav"
        state_path = char_dir / "voice_state.safetensors"
        preset_path = char_dir / "preset_voice.txt"
        traits = _get_traits(name)

        # Determine voice source: .safetensors (cloned) or preset name.
        # This checks what is on disk, so files left by an earlier run count.
        voice_state_path = None
        preset_voice = None
        if state_path.exists():
            voice_state_path = str(state_path)
        elif preset_path.exists():
            preset_voice = preset_path.read_text(encoding="utf-8").strip()

        entry = {
            "id": char_id,
            "display_name": name,
            "source_anime": char_anime.get(name, "Unknown"),
            "gender": traits["gender"],
            "age_category": traits["age_category"],
            "vocal_traits": traits["vocal_traits"],
            "pitch_range": traits["pitch_range"],
            "clip_count": len(char_clips[name]),
            "reference_path": str(ref_path) if ref_path.exists() else None,
            "voice_state_path": voice_state_path,
            "preset_voice": preset_voice,
        }
        catalog.append(entry)

        if voice_state_path:
            status = "ready (cloned)"
        elif preset_voice:
            status = f"ready (preset: {preset_voice})"
        elif ref_path.exists():
            status = "ref only"
        else:
            status = "MISSING"
        traits_flag = "" if traits["gender"] != "unknown" else " [NEEDS MANUAL TAGGING]"
        print(f" {name}: {status}{traits_flag}")

    with open(catalog_path, "w", encoding="utf-8") as f:
        json.dump(catalog, f, indent=2)
    print(f"\nCatalog saved to {catalog_path}")
    print(f"Total: {len(catalog)} characters, {encoded_count} with voice states")

    # Check for untagged characters
    untagged = [e for e in catalog if e["gender"] == "unknown"]
    if untagged:
        print(f"\nWARNING: {len(untagged)} characters need manual trait tagging:")
        for e in untagged:
            print(f" - {e['display_name']} ({e['source_anime']})")
        print(f"\nEdit {catalog_path} to add gender/age_category/vocal_traits for these characters.")
    print("\nSetup complete!")


if __name__ == "__main__":
    main()