# Source: anime-gen-api/scripts/setup_animevox.py
# (Hugging Face file-viewer header: uploaded by AswinMathew with
# "Upload folder using huggingface_hub", commit 7190fd0, verified.)
"""One-time setup: download AnimeVox dataset, extract reference clips, encode voice states.
Usage:
python scripts/setup_animevox.py
This script:
1. Downloads the AnimeVox dataset from HuggingFace (11,020 clips, 19 characters)
2. For each character, selects one reference clip, preferring loud-enough clips of 15-20s
3. Exports reference WAV files to data/animevox_voices/{character}/reference.wav
4. Pre-encodes Pocket TTS voice states (.safetensors) for instant loading, or
   assigns a built-in preset voice when voice cloning is unavailable
5. Saves data/animevox_catalog.json with tagged character metadata
"""
import json
import os
import sys
import time
from pathlib import Path
import numpy as np
# Resolve the project root as the parent of the directory that contains this
# script, and put it first on sys.path (presumably so project-local packages
# resolve when the file is run directly — no such import is visible here).
# PROJECT_ROOT also anchors the data/ output paths used by main().
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
# Known anime character metadata for trait tagging.
# Keys are character display names; many characters are listed twice (full
# name and short alias) with identical values. Every value carries the same
# four fields: "gender", "age_category", "vocal_traits" (list of str) and
# "pitch_range".
# Characters not in this map get auto-tagged with defaults (see _get_traits).
KNOWN_CHARACTERS = {
# --- Male characters ---
"Naruto Uzumaki": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["energetic", "brash", "determined"],
"pitch_range": "mid",
},
"Naruto": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["energetic", "brash", "determined"],
"pitch_range": "mid",
},
"Sasuke Uchiha": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["cold", "intense", "brooding"],
"pitch_range": "mid-low",
},
"Sasuke": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["cold", "intense", "brooding"],
"pitch_range": "mid-low",
},
"Ichigo Kurosaki": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["intense", "brash", "determined"],
"pitch_range": "mid-low",
},
"Ichigo": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["intense", "brash", "determined"],
"pitch_range": "mid-low",
},
"Eren Yeager": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["passionate", "angry", "intense"],
"pitch_range": "mid",
},
"Eren": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["passionate", "angry", "intense"],
"pitch_range": "mid",
},
"Tanjiro Kamado": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["gentle", "determined", "earnest"],
"pitch_range": "mid",
},
"Tanjiro": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["gentle", "determined", "earnest"],
"pitch_range": "mid",
},
"Light Yagami": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["cold", "calculating", "confident"],
"pitch_range": "mid-low",
},
"Light": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["cold", "calculating", "confident"],
"pitch_range": "mid-low",
},
"Goku": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["cheerful", "energetic", "simple"],
"pitch_range": "mid",
},
"Vegeta": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["proud", "intense", "arrogant"],
"pitch_range": "mid-low",
},
"Levi Ackerman": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["cold", "calm", "authoritative"],
"pitch_range": "low",
},
"Levi": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["cold", "calm", "authoritative"],
"pitch_range": "low",
},
"L": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["calm", "quirky", "soft"],
"pitch_range": "mid",
},
"L Lawliet": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["calm", "quirky", "soft"],
"pitch_range": "mid",
},
"Izuku Midoriya": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["nervous", "earnest", "determined"],
"pitch_range": "mid-high",
},
"Deku": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["nervous", "earnest", "determined"],
"pitch_range": "mid-high",
},
"Kirito": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["calm", "determined", "gentle"],
"pitch_range": "mid",
},
"Kaneki Ken": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["soft", "melancholic", "intense"],
"pitch_range": "mid",
},
"Kaneki": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["soft", "melancholic", "intense"],
"pitch_range": "mid",
},
"Gon Freecss": {
"gender": "male", "age_category": "child",
"vocal_traits": ["cheerful", "energetic", "innocent"],
"pitch_range": "high",
},
"Gon": {
"gender": "male", "age_category": "child",
"vocal_traits": ["cheerful", "energetic", "innocent"],
"pitch_range": "high",
},
"Killua Zoldyck": {
"gender": "male", "age_category": "child",
"vocal_traits": ["cool", "playful", "intense"],
"pitch_range": "mid-high",
},
"Killua": {
"gender": "male", "age_category": "child",
"vocal_traits": ["cool", "playful", "intense"],
"pitch_range": "mid-high",
},
"Edward Elric": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["brash", "passionate", "determined"],
"pitch_range": "mid",
},
"Lelouch": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["commanding", "calculating", "dramatic"],
"pitch_range": "mid-low",
},
"Lelouch Lamperouge": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["commanding", "calculating", "dramatic"],
"pitch_range": "mid-low",
},
"Spike Spiegel": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["cool", "laid-back", "witty"],
"pitch_range": "mid-low",
},
"Spike": {
"gender": "male", "age_category": "adult",
"vocal_traits": ["cool", "laid-back", "witty"],
"pitch_range": "mid-low",
},
"Luffy": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["cheerful", "energetic", "carefree"],
"pitch_range": "mid-high",
},
"Monkey D. Luffy": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["cheerful", "energetic", "carefree"],
"pitch_range": "mid-high",
},
"Bakugo": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["aggressive", "loud", "angry"],
"pitch_range": "mid",
},
"Katsuki Bakugo": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["aggressive", "loud", "angry"],
"pitch_range": "mid",
},
"Todoroki": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["calm", "cold", "reserved"],
"pitch_range": "mid-low",
},
"Shoto Todoroki": {
"gender": "male", "age_category": "teen",
"vocal_traits": ["calm", "cold", "reserved"],
"pitch_range": "mid-low",
},
# --- Female characters ---
"Frieren": {
"gender": "female", "age_category": "adult",
"vocal_traits": ["calm", "ethereal", "soft"],
"pitch_range": "mid",
},
"Mikasa Ackerman": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["calm", "determined", "quiet"],
"pitch_range": "mid-low",
},
"Mikasa": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["calm", "determined", "quiet"],
"pitch_range": "mid-low",
},
"Nezuko Kamado": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["gentle", "soft", "innocent"],
"pitch_range": "mid-high",
},
"Nezuko": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["gentle", "soft", "innocent"],
"pitch_range": "mid-high",
},
"Asuna": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["gentle", "determined", "warm"],
"pitch_range": "mid",
},
"Asuna Yuuki": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["gentle", "determined", "warm"],
"pitch_range": "mid",
},
"Sakura Haruno": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["energetic", "emotional", "determined"],
"pitch_range": "mid-high",
},
"Sakura": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["energetic", "emotional", "determined"],
"pitch_range": "mid-high",
},
"Hinata Hyuga": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["shy", "soft", "gentle"],
"pitch_range": "mid-high",
},
"Hinata": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["shy", "soft", "gentle"],
"pitch_range": "mid-high",
},
"Rukia Kuchiki": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["assertive", "serious", "caring"],
"pitch_range": "mid",
},
"Rukia": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["assertive", "serious", "caring"],
"pitch_range": "mid",
},
"Ochako Uraraka": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["cheerful", "energetic", "warm"],
"pitch_range": "mid-high",
},
"Ochako": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["cheerful", "energetic", "warm"],
"pitch_range": "mid-high",
},
"Misa Amane": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["bubbly", "dramatic", "cute"],
"pitch_range": "high",
},
"Misa": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["bubbly", "dramatic", "cute"],
"pitch_range": "high",
},
"Winry Rockbell": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["fiery", "caring", "determined"],
"pitch_range": "mid",
},
"Fern": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["serious", "diligent", "quiet"],
"pitch_range": "mid",
},
# NOTE: the next entry is tagged male even though it sits under the
# "Female characters" section header.
"Himmel": {
"gender": "male", "age_category": "young_adult",
"vocal_traits": ["heroic", "warm", "confident"],
"pitch_range": "mid",
},
# --- AnimeVox dataset characters (all female) ---
"Rin Tohsaka": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["assertive", "confident", "sharp"],
"pitch_range": "mid",
},
"Marin Kitagawa": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["cheerful", "bubbly", "energetic"],
"pitch_range": "mid-high",
},
"Emilia": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["gentle", "kind", "earnest"],
"pitch_range": "mid-high",
},
"Kurisu Makise": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["sharp", "intellectual", "tsundere"],
"pitch_range": "mid",
},
"Megumin": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["dramatic", "energetic", "theatrical"],
"pitch_range": "mid-high",
},
"Momo Ayase": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["brash", "energetic", "bold"],
"pitch_range": "mid",
},
"Mai Sakurajima": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["calm", "cool", "witty"],
"pitch_range": "mid",
},
"Madoka Kaname": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["gentle", "innocent", "soft"],
"pitch_range": "high",
},
"Rem": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["devoted", "gentle", "intense"],
"pitch_range": "mid-high",
},
"Saber": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["noble", "commanding", "determined"],
"pitch_range": "mid",
},
"Homura Akemi": {
"gender": "female", "age_category": "teen",
"vocal_traits": ["cold", "mysterious", "intense"],
"pitch_range": "mid-low",
},
"Makima": {
"gender": "female", "age_category": "adult",
"vocal_traits": ["cold", "calm", "manipulative"],
"pitch_range": "mid-low",
},
"Shiro": {
"gender": "female", "age_category": "child",
"vocal_traits": ["calm", "monotone", "intellectual"],
"pitch_range": "mid",
},
"Lucy": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["cool", "reserved", "melancholic"],
"pitch_range": "mid",
},
"Power": {
"gender": "female", "age_category": "young_adult",
"vocal_traits": ["loud", "brash", "chaotic"],
"pitch_range": "mid-high",
},
}
def _sanitize_name(name: str) -> str:
"""Convert character name to filesystem-safe ID."""
return name.lower().replace(" ", "_").replace(".", "").replace("'", "")
def _get_traits(character_name: str) -> dict:
    """Look up trait metadata for a character, falling back to defaults.

    Lookup order:
      1. ``character_name`` exactly as given, in ``KNOWN_CHARACTERS``;
      2. if the name contains a space, its first whitespace-separated word;
      3. a generic placeholder whose ``gender`` is ``"unknown"``.

    Args:
        character_name: Character display name.

    Returns:
        A dict with the keys ``gender``, ``age_category``, ``vocal_traits``
        and ``pitch_range``. For a known character this is the entry stored
        in ``KNOWN_CHARACTERS`` itself, not a copy, so treat it as read-only.
        The placeholder is a fresh dict on every call.
    """
    if character_name in KNOWN_CHARACTERS:
        return KNOWN_CHARACTERS[character_name]

    # Try partial match (first name). A whitespace-only name contains a space
    # but splits into an empty list, so check before indexing instead of
    # letting IndexError escape.
    if " " in character_name:
        words = character_name.split()
        if words and words[0] in KNOWN_CHARACTERS:
            return KNOWN_CHARACTERS[words[0]]

    # Default — will need manual correction
    return {
        "gender": "unknown",
        "age_category": "young_adult",
        "vocal_traits": ["neutral"],
        "pitch_range": "mid",
    }
def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Turn an encoded audio blob into float32 samples plus its sample rate.

    The bytes are wrapped in an in-memory file object and passed to
    ``soundfile.read``; the array it returns is cast to ``float32``.

    Args:
        audio_bytes: Encoded audio file contents (WAV/FLAC/etc).

    Returns:
        ``(samples, sample_rate)`` exactly as produced by ``soundfile.read``,
        with the samples converted to ``np.float32``.
    """
    import io
    import soundfile

    buffer = io.BytesIO(audio_bytes)
    decoded, sample_rate = soundfile.read(buffer)
    as_float32 = decoded.astype(np.float32)
    return as_float32, sample_rate
def _confirm_rerun(catalog_path: Path) -> bool:
    """Decide whether setup should run, asking the user if a catalog exists.

    Returns True when there is no catalog, when the existing catalog cannot
    be read, or when the user answers "y". Returns False (after printing
    "Skipped.") for any other answer, including no stdin at all.
    """
    if not catalog_path.exists():
        return True
    try:
        with open(catalog_path, encoding="utf-8") as f:
            existing = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError. A damaged catalog should not
        # prevent regenerating it.
        print(f"\nExisting catalog could not be read ({exc}); re-running setup.")
        return True
    print(f"\nCatalog already exists with {len(existing)} characters.")
    try:
        resp = input("Re-run setup? (y/N): ").strip().lower()
    except EOFError:
        # No stdin attached: fall back to the prompt's default answer, "N".
        print()
        resp = ""
    if resp != "y":
        print("Skipped.")
        return False
    return True


def _load_dataset_rows(hf_hub_download, pq) -> list:
    """Download every AnimeVox parquet shard and flatten it into row dicts.

    Args:
        hf_hub_download: ``huggingface_hub.hf_hub_download``.
        pq: the ``pyarrow.parquet`` module.

    Returns:
        One dict per clip with the keys ``character_name``, ``anime``,
        ``audio_bytes`` and ``transcription``.
    """
    # Download parquet files directly (avoids torchcodec dependency)
    shards = [f"data/train-{i:05d}-of-00007.parquet" for i in range(7)]
    columns = ["audio", "character_name", "anime", "transcription"]
    print(f"\n[1/5] Downloading AnimeVox parquet files ({len(shards)} shards)...")
    t0 = time.time()
    all_rows = []
    for shard in shards:
        print(f" Downloading {shard}...", end=" ", flush=True)
        path = hf_hub_download("taresh18/AnimeVox", shard, repo_type="dataset")
        table = pq.read_table(path, columns=columns)
        # Convert each column once instead of indexing it row by row.
        audio, names, animes, texts = (table.column(c).to_pylist() for c in columns)
        all_rows.extend(
            {
                "character_name": name,
                "anime": anime,
                "audio_bytes": audio_entry["bytes"],
                "transcription": text,
            }
            for audio_entry, name, anime, text in zip(audio, names, animes, texts)
        )
        print(f"{len(table)} rows")
    print(f" Total: {len(all_rows)} clips in {time.time() - t0:.1f}s")
    return all_rows


def _score_clip(samples: np.ndarray, sr: int) -> tuple:
    """Return ``(score, duration_seconds)`` for one decoded clip.

    Duration bands: 15-20s scores 100, 10-25s scores 80, 5-30s scores 60,
    anything else ``max(0, 40 - |duration - 15|)``. Clips whose RMS level is
    below 0.01 lose a further 20 points.
    """
    duration = len(samples) / sr
    if 15 <= duration <= 20:
        score = 100
    elif 10 <= duration <= 25:
        score = 80
    elif 5 <= duration <= 30:
        score = 60
    else:
        score = max(0, 40 - abs(duration - 15))
    # Penalize quiet clips
    if np.sqrt(np.mean(samples ** 2)) < 0.01:
        score -= 20
    return score, duration


def _select_references(all_rows: list, char_clips: dict, voices_dir: Path) -> dict:
    """Pick and export one reference clip per character.

    For each character up to 80 clips are scored (a random sample when there
    are more); the highest score wins, ties going to the longer clip, and the
    scan stops early at a score of 100. The winner is written as a 16-bit WAV
    to ``voices_dir/<id>/reference.wav``.

    Returns:
        Mapping of character name to ``{"path", "duration", "transcript"}``
        for every character that got a reference clip.
    """
    import random
    from scipy.io import wavfile

    print("\n[3/5] Selecting best reference clips (10-20s, clean audio)...")
    voices_dir.mkdir(parents=True, exist_ok=True)
    selected_refs = {}
    for name, clip_indices in char_clips.items():
        char_dir = voices_dir / _sanitize_name(name)
        char_dir.mkdir(parents=True, exist_ok=True)
        ref_path = char_dir / "reference.wav"
        # Sample up to 80 clips to find a good reference
        sample_indices = clip_indices if len(clip_indices) <= 80 else random.sample(clip_indices, 80)
        best_idx = None
        best_duration = 0.0
        best_score = -1
        skipped = 0
        print(f" {name}: scanning {len(sample_indices)}/{len(clip_indices)} clips...", end=" ", flush=True)
        for idx in sample_indices:
            try:
                samples, sr = _decode_audio(all_rows[idx]["audio_bytes"])
                if len(samples) == 0:
                    # An empty clip has no defined loudness and is useless
                    # as a voice reference.
                    raise ValueError("clip contains no samples")
                score, duration = _score_clip(samples, sr)
            except Exception:
                # Best effort: one bad clip must not abort the scan, but the
                # number of skipped clips is reported below.
                skipped += 1
                continue
            if score > best_score or (score == best_score and duration > best_duration):
                best_score, best_duration, best_idx = score, duration, idx
            if best_score >= 100:
                break
        note = f" [{skipped} unreadable clip(s) skipped]" if skipped else ""
        if best_idx is None:
            print(f"NO SUITABLE CLIP FOUND{note}")
            continue
        samples, sr = _decode_audio(all_rows[best_idx]["audio_bytes"])
        # Write as int16 WAV
        samples_int16 = np.clip(samples * 32767, -32768, 32767).astype(np.int16)
        wavfile.write(str(ref_path), sr, samples_int16)
        selected_refs[name] = {
            "path": str(ref_path),
            "duration": len(samples) / sr,
            "transcript": all_rows[best_idx].get("transcription", ""),
        }
        print(f"{len(samples)/sr:.1f}s saved (score={best_score}){note}")
    return selected_refs


def _voice_cloning_available(tts_model, selected_refs: dict) -> bool:
    """Probe whether the model can build a voice state from a reference clip.

    The first selected reference is used as the probe; with no references at
    all the answer is False.
    """
    if not selected_refs:
        return False
    first_ref = next(iter(selected_refs.values()))
    try:
        # Only success or failure matters; the returned state is discarded.
        tts_model.get_state_for_audio_prompt(first_ref["path"])
    except Exception as e:
        if "voice cloning" in str(e).lower():
            print(" Voice cloning: NOT AVAILABLE (gated model)")
            print(" To enable: accept terms at https://huggingface.co/kyutai/pocket-tts")
            print(" then run: huggingface-cli login")
            print(" Using PRESET voices as fallback...")
        else:
            print(f" Voice cloning test failed: {e}")
        return False
    print(" Voice cloning: AVAILABLE")
    return True


def _encode_cloned_states(tts_model, export_state, selected_refs: dict, voices_dir: Path) -> int:
    """Encode and export a voice state for every reference clip.

    Each state is written to ``voices_dir/<id>/voice_state.safetensors``.
    Failures are printed and skipped.

    Returns:
        Number of voice states exported successfully.
    """
    encoded_count = 0
    for name, ref_info in selected_refs.items():
        state_path = voices_dir / _sanitize_name(name) / "voice_state.safetensors"
        try:
            t0 = time.time()
            voice_state = tts_model.get_state_for_audio_prompt(ref_info["path"])
            export_state(voice_state, str(state_path))
            elapsed = time.time() - t0
        except Exception as e:
            print(f" {name}: FAILED — {e}")
        else:
            print(f" {name}: voice state encoded in {elapsed:.1f}s")
            encoded_count += 1
    return encoded_count


def _assign_preset_voices(tts_model, selected_refs: dict, voices_dir: Path) -> int:
    """Assign a built-in preset voice to every character (no-cloning fallback).

    Presets are handed out round-robin in sorted name order, with separate
    rotations for male characters and for everyone else. The chosen preset
    name is written to ``voices_dir/<id>/preset_voice.txt`` and then checked
    by asking the model for its state.

    Returns:
        Number of presets that passed verification.
    """
    # Fallback: assign preset voice names (no .safetensors export needed)
    # Pocket TTS non-cloning model supports these built-in voices:
    preset_voices_female = ["alba", "fantine", "cosette", "eponine", "azelma"]
    preset_voices_male = ["marius", "javert", "jean"]
    print("\n Assigning preset voices as fallback...")
    verified_count = 0
    female_idx = 0
    male_idx = 0
    for name in sorted(selected_refs):
        char_dir = voices_dir / _sanitize_name(name)
        gender = _get_traits(name).get("gender", "female")
        if gender == "male":
            # Dedicated counter: indexing by the number of verified voices
            # made the male rotation depend on unrelated characters.
            preset = preset_voices_male[male_idx % len(preset_voices_male)]
            male_idx += 1
        else:
            preset = preset_voices_female[female_idx % len(preset_voices_female)]
            female_idx += 1
        # Write preset name to a marker file (no .safetensors needed). The
        # marker is written before verification and stays even if it fails.
        (char_dir / "preset_voice.txt").write_text(preset, encoding="utf-8")
        # Verify it works
        try:
            tts_model.get_state_for_audio_prompt(preset)
        except Exception as e:
            print(f" {name}: preset '{preset}' FAILED — {e}")
        else:
            print(f" {name}: preset '{preset}' verified OK")
            verified_count += 1
    return verified_count


def _build_catalog(char_clips: dict, char_anime: dict, voices_dir: Path) -> list:
    """Assemble one catalog entry per character and print its status.

    The voice source is decided from the files on disk: an exported
    ``voice_state.safetensors`` wins over a ``preset_voice.txt`` marker.
    """
    catalog = []
    for name in sorted(char_clips):
        char_id = _sanitize_name(name)
        char_dir = voices_dir / char_id
        ref_path = char_dir / "reference.wav"
        state_path = char_dir / "voice_state.safetensors"
        preset_path = char_dir / "preset_voice.txt"
        traits = _get_traits(name)
        # Determine voice source: .safetensors (cloned) or preset name
        voice_state = None
        preset_voice = None
        if state_path.exists():
            voice_state = str(state_path)
        elif preset_path.exists():
            preset_voice = preset_path.read_text(encoding="utf-8").strip()
        has_reference = ref_path.exists()
        catalog.append({
            "id": char_id,
            "display_name": name,
            "source_anime": char_anime.get(name, "Unknown"),
            "gender": traits["gender"],
            "age_category": traits["age_category"],
            "vocal_traits": traits["vocal_traits"],
            "pitch_range": traits["pitch_range"],
            "clip_count": len(char_clips[name]),
            "reference_path": str(ref_path) if has_reference else None,
            "voice_state_path": voice_state,
            "preset_voice": preset_voice,
        })
        if voice_state:
            status = "ready (cloned)"
        elif preset_voice:
            status = f"ready (preset: {preset_voice})"
        elif has_reference:
            status = "ref only"
        else:
            status = "MISSING"
        traits_flag = "" if traits["gender"] != "unknown" else " [NEEDS MANUAL TAGGING]"
        print(f" {name}: {status}{traits_flag}")
    return catalog


def main():
    """Run the five-step AnimeVox setup and write the voice catalog.

    Steps: download the dataset shards, group clips by character, export one
    reference WAV per character, encode Pocket TTS voice states (or assign
    preset voices when cloning is unavailable), and save the catalog JSON
    under ``PROJECT_ROOT/data``. Exits with status 1 when a required package
    is missing; returns early when the user declines to re-run.
    """
    print("=" * 60)
    print("AnimeVox Setup — Voice Reference Library")
    print("=" * 60)
    try:
        from huggingface_hub import hf_hub_download
        import pyarrow.parquet as pq
    except ImportError:
        print("\nERROR: 'huggingface-hub' or 'pyarrow' not installed.")
        print("Run: pip install huggingface-hub pyarrow")
        sys.exit(1)
    try:
        import soundfile  # noqa: F401  (availability check only)
    except ImportError:
        print("\nERROR: 'soundfile' not installed.")
        print("Run: pip install soundfile")
        sys.exit(1)
    try:
        import scipy.io.wavfile  # noqa: F401  (availability check only)
    except ImportError:
        print("\nERROR: 'scipy' not installed.")
        print("Run: pip install scipy")
        sys.exit(1)

    voices_dir = PROJECT_ROOT / "data" / "animevox_voices"
    catalog_path = PROJECT_ROOT / "data" / "animevox_catalog.json"

    # Check if already set up
    if not _confirm_rerun(catalog_path):
        return

    # Step 1: Download parquet files
    all_rows = _load_dataset_rows(hf_hub_download, pq)

    # Step 2: Discover characters
    print("\n[2/5] Discovering characters...")
    char_clips: dict[str, list[int]] = {}
    char_anime: dict[str, str] = {}
    for idx, row in enumerate(all_rows):
        name = row["character_name"]
        # The anime recorded for a character is the one on its first clip.
        char_anime.setdefault(name, row["anime"])
        char_clips.setdefault(name, []).append(idx)
    print(f" Found {len(char_clips)} characters:")
    for name, clips in sorted(char_clips.items(), key=lambda x: -len(x[1])):
        print(f" {name} ({char_anime[name]}): {len(clips)} clips")

    # Step 3: Select best reference clip per character
    selected_refs = _select_references(all_rows, char_clips, voices_dir)
    # Free memory — parquet data no longer needed
    del all_rows

    # Step 4: Encode voice states with Pocket TTS
    print("\n[4/5] Encoding voice states with Pocket TTS...")
    print(" (Loading model — ~2GB RAM, first run downloads weights)")
    try:
        from pocket_tts import TTSModel
        from pocket_tts.models.tts_model import export_model_state as _export_state
    except ImportError:
        print("\nERROR: 'pocket-tts' not installed.")
        print("Run: pip install pocket-tts")
        sys.exit(1)
    t0 = time.time()
    tts_model = TTSModel.load_model()
    print(f" Model loaded in {time.time() - t0:.1f}s")

    # Check if voice cloning is available by testing with a reference
    if _voice_cloning_available(tts_model, selected_refs):
        encoded_count = _encode_cloned_states(tts_model, _export_state, selected_refs, voices_dir)
    else:
        encoded_count = _assign_preset_voices(tts_model, selected_refs, voices_dir)
    print(f"\n {encoded_count}/{len(selected_refs)} voice states encoded")

    # Step 5: Build catalog
    print("\n[5/5] Building voice catalog...")
    catalog = _build_catalog(char_clips, char_anime, voices_dir)
    with open(catalog_path, "w", encoding="utf-8") as f:
        json.dump(catalog, f, indent=2)
    print(f"\nCatalog saved to {catalog_path}")
    print(f"Total: {len(catalog)} characters, {encoded_count} with voice states")

    # Check for untagged characters
    untagged = [e for e in catalog if e["gender"] == "unknown"]
    if untagged:
        print(f"\nWARNING: {len(untagged)} characters need manual trait tagging:")
        for e in untagged:
            print(f" - {e['display_name']} ({e['source_anime']})")
        print(f"\nEdit {catalog_path} to add gender/age_category/vocal_traits for these characters.")
    print("\nSetup complete!")


if __name__ == "__main__":
    main()