#!/bin/bash
# Data preparation pipeline for Learnable-Speech training.
# Runs three steps: (1) FSQ token extraction, (2) DAC-VAE latent
# extraction, (3) data-list creation.
# Edit DATASET_ROOT and OUTPUT_DIR below before running.
set -euo pipefail   # abort on errors, unset vars, and pipeline failures

echo "=== Learnable-Speech Data Preparation Pipeline ==="

# Configuration
DATASET_ROOT="/path/to/your/dataset"   # Change this to your dataset path
OUTPUT_DIR="/path/to/processed/data"   # Change this to your output path

# Create output directories (quoted so paths containing spaces work)
mkdir -p "$OUTPUT_DIR"/{fsq,dac_latents,lists}
echo "Step 1: Extract FSQ tokens using S3Tokenizer..."
# Fail fast if the tokenizer source tree is missing.
cd speech/tools/S3Tokenizer || { echo "error: speech/tools/S3Tokenizer not found" >&2; exit 1; }
pip install .

# Extract FSQ tokens (25Hz). Distributed over 4 local GPUs via torchrun.
# $(command -v ...) resolves the installed s3tokenizer entry point
# (replaces deprecated backticks + `which`); paths are quoted.
torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
    "$(command -v s3tokenizer)" \
    --root_path "$DATASET_ROOT" \
    --model speech_tokenizer_v2_25hz \
    --device "cuda" \
    --batch_size 64 \
    --file_list ../../../files_test.txt \
    --skip_existing
echo "Step 2: Extract DAC-VAE latents..."
cd ../../../dac-vae || { echo "error: dac-vae directory not found" >&2; exit 1; }

# Download DAC-VAE checkpoint. Skip if already present so reruns are
# idempotent; abort loudly if the download fails (otherwise extraction
# would run against a truncated/empty checkpoint file).
if [ ! -f checkpoint.pt ]; then
    wget -O checkpoint.pt "https://github.com/primepake/learnable-speech/releases/download/dac-vae/dac_vae_checkpoint.pt" \
        || { echo "error: checkpoint download failed" >&2; rm -f checkpoint.pt; exit 1; }
fi

# Extract DAC latents
python extract_dac_latents.py \
    --checkpoint checkpoint.pt \
    --config configs/config.yml \
    --root_path "$DATASET_ROOT" \
    --output_dir "$OUTPUT_DIR/dac_latents"
echo "Step 3: Create data lists..."
# Back into the speech tree (relative to dac-vae from Step 2).
cd ../speech || { echo "error: speech directory not found" >&2; exit 1; }
python tools/create_data_list.py \
    --src_dir "$OUTPUT_DIR" \
    --output_dir "$OUTPUT_DIR/lists"
# Print a summary of the artifacts a successful run leaves behind.
# Quoted here-doc delimiter keeps the text literal (no expansion).
cat <<'EOF'
Data preparation completed!
Your dataset should now have:
 - Original audio files (.wav)
 - Text transcriptions (.txt)
 - FSQ tokens (*_fsq.pt)
 - DAC latents (*_latent.pt)
 - Data list files
EOF