File size: 4,243 Bytes
15389e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/bash
#
# Drives core/preprocess_data_conditional_xcodec_segment.py over the JSONL file
# of a dataset preset, writing its output under <DATA_ROOT>/mmap/.
#
# Usage: <script> <setting> <mode_type> <tokenizer_model> [audio_prompt_modes]
#   <setting>             dataset preset name (only "dummy" is defined below)
#   <mode_type>           "cot" or "icl_cot"
#   <tokenizer_model>     value forwarded to --tokenizer-model
#   [audio_prompt_modes]  optional; ONE quoted, whitespace-separated list of
#                         audio prompt modes, iterated over in icl_cot mode.
#                         Defaults to: dual inst vocal mixture

DATA_SETTING=$1
MODE_TYPE=$2
TOKENIZER_MODEL=$3

# Split the optional 4th argument into an array. `read -a` performs the
# whitespace split without the pathname expansion an unquoted ($4) would
# apply (e.g. a literal "*" must not turn into a file listing).
if [[ -n "$4" ]]; then
    read -r -a AUDIO_PROMPT_MODES <<<"$4"
else
    AUDIO_PROMPT_MODES=('dual' 'inst' 'vocal' 'mixture')
fi

# All three leading arguments are mandatory. The tokenizer model is always
# passed as `--tokenizer-model <value>`, so an empty value would leave that
# flag without an argument; reject it here instead.
if [[ -z "$DATA_SETTING" || -z "$MODE_TYPE" || -z "$TOKENIZER_MODEL" ]]; then
    {
        echo "Usage: $0 <setting> <mode_type> <tokenizer_model> [audio_prompt_modes]"
        echo "  <setting>: e.g., dummy"
        echo "  <mode_type>: cot or icl_cot"
        echo "  <tokenizer_model>: path passed to --tokenizer-model"
        echo "  [audio_prompt_modes]: quoted, space-separated list (icl_cot only); default: 'dual inst vocal mixture'"
    } >&2
    exit 1
fi

# Dataset presets, selected by DATA_SETTING.
#
# A preset fills in the variables the rest of the script forwards to the
# preprocessing command:
#   DATA_ROOT            directory that holds the jsonl/ and mmap/ trees
#   NAME_PREFIX          base name of the input JSONL and of the output paths
#   CODEC_TYPE           -> --codec-type
#   INSTRUCTION          -> --instruction
#   ORDER                -> --order (also appended to the mmap output name)
#   DROPOUT              -> --instruction-dropout-rate
#   QUANTIZER_BEGIN_IDX  -> --quantizer-begin
#   NUM_QUANTIZERS       -> --n-quantizer
#   KEEP_SEQUENTIAL_SAMPLES is assigned here but not read by the visible
#   script (the --keep-sequential-samples flag is passed unconditionally).
# An unrecognized preset name aborts the script with exit status 1.
case "$DATA_SETTING" in
    dummy)
        NAME_PREFIX=dummy.msa.xcodec_16k
        DATA_ROOT=example
        CODEC_TYPE=xcodec
        ORDER=textfirst
        INSTRUCTION="Generate music from the given lyrics segment by segment."
        DROPOUT=0.0
        NUM_QUANTIZERS=1
        QUANTIZER_BEGIN_IDX=0
        KEEP_SEQUENTIAL_SAMPLES=true
        ;;
    *)
        echo "Invalid setting: $DATA_SETTING"
        exit 1
        ;;
esac

# Input file location, relative to DATA_ROOT.
JSONL_NAME="jsonl/${NAME_PREFIX}.jsonl"

# Mode-specific settings and execution
#
# Reads variables assigned earlier in this script: MODE_TYPE, DATA_SETTING,
# DATA_ROOT, JSONL_NAME, NAME_PREFIX, TOKENIZER_MODEL, CODEC_TYPE, INSTRUCTION,
# DROPOUT, ORDER, QUANTIZER_BEGIN_IDX, NUM_QUANTIZERS and AUDIO_PROMPT_MODES.
#
# Exit status: 0 when every preprocessing run succeeded; otherwise the status
# of the (last) failing run, or 1 for an unknown MODE_TYPE.

# Run the preprocessing script once and return its exit status.
#   $1    output prefix, relative to DATA_ROOT
#   $2..  extra, mode-specific flags appended to the common ones
# The command is kept as an array and executed directly (no eval), so values
# containing spaces or shell metacharacters reach python as single, literal
# arguments.
run_preprocess() {
    local output_prefix=$1
    shift
    local -a cmd=(
        python core/preprocess_data_conditional_xcodec_segment.py
        --input "$DATA_ROOT/$JSONL_NAME"
        --output-prefix "$DATA_ROOT/$output_prefix"
        --tokenizer-model "$TOKENIZER_MODEL"
        --tokenizer-type MMSentencePieceTokenizer
        --codec-type "$CODEC_TYPE"
        --workers 8
        --partitions 1
        --instruction "$INSTRUCTION"
        --instruction-dropout-rate "$DROPOUT"
        --order "$ORDER"
        --append-eod
        --quantizer-begin "$QUANTIZER_BEGIN_IDX"
        --n-quantizer "$NUM_QUANTIZERS"
        --use-token-level-interleave
        --keep-sequential-samples
        --cot
        "$@"
    )

    # Log the exact command in a shell-reusable form, then pause briefly
    # before launching it (same 5 second delay as before).
    printf '%q ' "${cmd[@]}"
    printf '\n'
    sleep 5
    "${cmd[@]}"
}

# Delete the per-run files matching the patterns below.
#   $1    output prefix, relative to DATA_ROOT
# `rm -f` tolerates patterns that match nothing; `${DATA_ROOT:?}` aborts
# rather than letting the patterns resolve against the filesystem root.
remove_intermediates() {
    local output_prefix=$1
    rm -f -- "${DATA_ROOT:?}"/jsonl/"${NAME_PREFIX}"_*.jsonl
    rm -f -- "${DATA_ROOT:?}/${output_prefix}"_*_text_document.bin
    rm -f -- "${DATA_ROOT:?}/${output_prefix}"_*_text_document.idx
}

# Status of the most recent failed run; stays 0 while everything succeeds.
exit_status=0

if [[ "$MODE_TYPE" == "cot" ]]; then
    echo "Running in 'cot' mode..."
    NAME_SUFFIX=stage_1_token_level_interleave_cot_xcodec
    MMAP_NAME=mmap/${NAME_PREFIX}_${NAME_SUFFIX}_$ORDER

    # Start from a clean slate, then make sure the output directory exists.
    rm -f -- "${DATA_ROOT:?}"/jsonl/"${NAME_PREFIX}"_*.jsonl
    mkdir -p -- "$DATA_ROOT/$MMAP_NAME"

    run_preprocess "$MMAP_NAME" || exit_status=$?
    remove_intermediates "$MMAP_NAME"

elif [[ "$MODE_TYPE" == "icl_cot" ]]; then
    echo "Running in 'icl_cot' mode..."
    NAME_SUFFIX=stage_1_token_level_interleave_long_prompt_msa
    MMAP_NAME=mmap/${NAME_PREFIX}_${NAME_SUFFIX}_$ORDER # Base output name for this mode
    PROMPT_LEN=30

    rm -f -- "${DATA_ROOT:?}"/jsonl/"${NAME_PREFIX}"_*.jsonl
    mkdir -p -- "$DATA_ROOT/$MMAP_NAME" # Ensure base MMAP dir exists

    # One preprocessing run per audio prompt mode. A failing mode is reported
    # and remembered, but the remaining modes are still processed.
    for mode in "${AUDIO_PROMPT_MODES[@]}"; do
        echo "Processing mode: $mode"
        MODE_MMAP_NAME=${MMAP_NAME}_${mode} # Mode-specific output name
        mkdir -p -- "$DATA_ROOT/$MODE_MMAP_NAME"

        run_status=0
        run_preprocess "$MODE_MMAP_NAME" \
            --use-audio-icl \
            --audio-prompt-mode "$mode" \
            --audio-prompt-len "$PROMPT_LEN" || run_status=$?
        if (( run_status != 0 )); then
            echo "Preprocessing failed for audio prompt mode '$mode' (exit status $run_status)." >&2
            exit_status=$run_status
        fi

        # Clean up mode-specific files
        remove_intermediates "$MODE_MMAP_NAME"
    done

else
    echo "Invalid mode_type: $MODE_TYPE. Use 'cot' or 'icl_cot'."
    exit 1
fi

# Do not report success (or exit 0) when a preprocessing run failed.
if (( exit_status != 0 )); then
    echo "Preprocessing failed for setting '$DATA_SETTING' and mode_type '$MODE_TYPE' (exit status $exit_status)." >&2
    exit "$exit_status"
fi

echo "Preprocessing finished for setting '$DATA_SETTING' and mode_type '$MODE_TYPE'."