Spaces:
Running
Running
File size: 4,243 Bytes
15389e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/bin/bash
# Positional arguments:
#   $1  DATA_SETTING        dataset preset name (e.g. dummy)
#   $2  MODE_TYPE           cot or icl_cot
#   $3  TOKENIZER_MODEL     value forwarded to --tokenizer-model
#   $4  AUDIO_PROMPT_MODES  optional; whitespace-separated list given as ONE
#                           quoted argument, iterated over in icl_cot mode
DATA_SETTING=$1
MODE_TYPE=$2
TOKENIZER_MODEL=$3

# Split $4 on whitespace into an array. `read -a` is used instead of an
# unquoted ($4) so that words such as '*' are not glob-expanded. With -d ''
# read consumes the whole string (newlines included) and returns non-zero at
# end of input even though the array was filled, hence the `|| true`.
AUDIO_PROMPT_MODES=()
if [[ -n "$4" ]]; then
  read -r -d '' -a AUDIO_PROMPT_MODES <<< "$4" || true
fi
# Fall back to the default list when $4 is missing or contains only whitespace.
if (( ${#AUDIO_PROMPT_MODES[@]} == 0 )); then
  AUDIO_PROMPT_MODES=('dual' 'inst' 'vocal' 'mixture')
fi

# All three leading arguments are required: TOKENIZER_MODEL is passed to
# --tokenizer-model in both modes, so an empty value cannot produce a valid
# command line.
if [[ -z "$DATA_SETTING" || -z "$MODE_TYPE" || -z "$TOKENIZER_MODEL" ]]; then
  {
    echo "Usage: $0 <setting> <mode_type> <tokenizer_model> [audio_prompt_modes]"
    echo " <setting>: e.g., dummy"
    echo " <mode_type>: cot or icl_cot"
    echo " <tokenizer_model>: value passed to --tokenizer-model"
    echo " [audio_prompt_modes]: quoted, space-separated list used in icl_cot mode (default: 'dual inst vocal mixture')"
  } >&2
  exit 1
fi
# Resolve the dataset preset selected by DATA_SETTING into the variables the
# rest of the script consumes. Any preset other than "dummy" is rejected.
case "$DATA_SETTING" in
  dummy)
    DATA_ROOT="example"
    NAME_PREFIX="dummy.msa.xcodec_16k"
    CODEC_TYPE="xcodec"
    INSTRUCTION="Generate music from the given lyrics segment by segment."
    ORDER="textfirst"
    DROPOUT="0.0"
    # NOTE(review): not referenced anywhere else in the visible script.
    KEEP_SEQUENTIAL_SAMPLES="true"
    QUANTIZER_BEGIN_IDX="0"
    NUM_QUANTIZERS="1"
    ;;
  *)
    echo "Invalid setting: $DATA_SETTING"
    exit 1
    ;;
esac

# Input JSONL path, relative to DATA_ROOT.
JSONL_NAME="jsonl/${NAME_PREFIX}.jsonl"
# Mode-specific settings and execution.
#
# Reads variables set earlier in this script: MODE_TYPE, DATA_SETTING,
# DATA_ROOT, JSONL_NAME, NAME_PREFIX, ORDER, TOKENIZER_MODEL, CODEC_TYPE,
# INSTRUCTION, DROPOUT, QUANTIZER_BEGIN_IDX, NUM_QUANTIZERS and, in icl_cot
# mode, the AUDIO_PROMPT_MODES array.

# Delete $DATA_ROOT/jsonl/${NAME_PREFIX}_*.jsonl. The input file
# jsonl/$NAME_PREFIX.jsonl has no underscore after the prefix, so the pattern
# does not match it. -f keeps rm quiet when nothing matches.
remove_prefixed_jsonl() {
  rm -f -- "$DATA_ROOT"/jsonl/"${NAME_PREFIX}"_*.jsonl
}

# Run the preprocessing script for one output prefix, then clean up.
# Arguments:
#   $1     output prefix passed to --output-prefix
#   $2...  mode-specific flags, appended verbatim after the common ones
# The command is held in an array and executed directly (no eval), so values
# containing spaces or shell metacharacters reach the program as single,
# uninterpreted arguments.
# Exits the whole script with the command's status if preprocessing fails;
# the cleanup below still runs first.
run_preprocess() {
  local output_prefix=$1
  shift
  local -a cmd=(
    python core/preprocess_data_conditional_xcodec_segment.py
    --input "$DATA_ROOT/$JSONL_NAME"
    --output-prefix "$output_prefix"
    --tokenizer-model "$TOKENIZER_MODEL"
    --tokenizer-type MMSentencePieceTokenizer
    --codec-type "$CODEC_TYPE"
    --workers 8
    --partitions 1
    --instruction "$INSTRUCTION"
    --instruction-dropout-rate "$DROPOUT"
    --order "$ORDER"
    --append-eod
    --quantizer-begin "$QUANTIZER_BEGIN_IDX"
    --n-quantizer "$NUM_QUANTIZERS"
    "$@"
  )

  # Log the exact command in shell-quoted form so it can be copy-pasted.
  printf '%q ' "${cmd[@]}"
  printf '\n'
  # Pause kept from the original script; presumably gives the operator time
  # to read the command (or abort) before it starts.
  sleep 5

  local status=0
  "${cmd[@]}" || status=$?

  # Clean up files matching the per-run patterns regardless of the outcome.
  remove_prefixed_jsonl
  rm -f -- "$output_prefix"_*_text_document.bin
  rm -f -- "$output_prefix"_*_text_document.idx

  if (( status != 0 )); then
    echo "Preprocessing command failed with exit status $status." >&2
    exit "$status"
  fi
}

case "$MODE_TYPE" in
  cot)
    echo "Running in 'cot' mode..."
    NAME_SUFFIX=stage_1_token_level_interleave_cot_xcodec
    MMAP_NAME=mmap/${NAME_PREFIX}_${NAME_SUFFIX}_$ORDER

    remove_prefixed_jsonl
    mkdir -p -- "$DATA_ROOT/$MMAP_NAME" || exit 1

    run_preprocess "$DATA_ROOT/$MMAP_NAME" \
      --use-token-level-interleave \
      --keep-sequential-samples \
      --cot
    ;;
  icl_cot)
    echo "Running in 'icl_cot' mode..."
    NAME_SUFFIX=stage_1_token_level_interleave_long_prompt_msa
    # Base name; each audio prompt mode appends "_<mode>" to it.
    MMAP_NAME=mmap/${NAME_PREFIX}_${NAME_SUFFIX}_$ORDER
    PROMPT_LEN=30

    remove_prefixed_jsonl
    mkdir -p -- "$DATA_ROOT/$MMAP_NAME" || exit 1

    for mode in "${AUDIO_PROMPT_MODES[@]}"; do
      echo "Processing mode: $mode"
      MODE_MMAP_NAME=${MMAP_NAME}_${mode}
      mkdir -p -- "$DATA_ROOT/$MODE_MMAP_NAME" || exit 1

      run_preprocess "$DATA_ROOT/$MODE_MMAP_NAME" \
        --cot \
        --use-token-level-interleave \
        --use-audio-icl \
        --audio-prompt-mode "$mode" \
        --audio-prompt-len "$PROMPT_LEN" \
        --keep-sequential-samples
    done
    ;;
  *)
    echo "Invalid mode_type: $MODE_TYPE. Use 'cot' or 'icl_cot'." >&2
    exit 1
    ;;
esac

echo "Preprocessing finished for setting '$DATA_SETTING' and mode_type '$MODE_TYPE'."