File size: 9,814 Bytes
d1b63e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
{
    "asr": [
        "<Speech><SpeechHere></Speech> Can you transcribe the speech into a written format?",
        "<Speech><SpeechHere></Speech> Listen to the speech and write down its content.",
        "<Speech><SpeechHere></Speech> What is the content of the speech you heard?",
        "<Speech><SpeechHere></Speech> Please write down the transcription of the speech.",
        "<Speech><SpeechHere></Speech> Please transcribe the speech into a written format.",
        "<Speech><SpeechHere></Speech> Write down the content of the speech you heard.",
        "<Speech><SpeechHere></Speech> Can you write down the transcription of the speech?",
        "<Speech><SpeechHere></Speech> Put the speech into a written format.",
        "<Speech><SpeechHere></Speech> Please help me to transcribe the speech into a written format.",
        "<Speech><SpeechHere></Speech> Recognize the content of the speech you heard.",
        "<Speech><SpeechHere></Speech> Can you recognize what you heard in the speech?",
        "<Speech><SpeechHere></Speech> Recognize the speech and write it down in a written format.",
        "<Speech><SpeechHere></Speech> Listen to the speech and recognize its content."
    ],
    "audiocaption": [
        "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.",
        "<Speech><SpeechHere></Speech> Describe the following audio in a caption.",
        "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.",
        "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?",
        "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?",
        "<Speech><SpeechHere></Speech> What does this audio describe?",
        "<Speech><SpeechHere></Speech> Please describe the audio."
    ],
    "audiocaption_v2": [
        "<Speech><SpeechHere></Speech> Please write down what you hear in the audio."
    ],
    "QA": [
        "<Speech><SpeechHere></Speech> {}"
    ],
    "inference_QA": [
        "<Speech><SpeechHere></Speech> {}"
    ],
    "gender_QA": [
        "<Speech><SpeechHere></Speech> {}"
    ],
    "gender_recognition": [
        "<Speech><SpeechHere></Speech> What is the gender of the speaker?",
        "<Speech><SpeechHere></Speech> Use one word to describe the speaker's gender.",
        "<Speech><SpeechHere></Speech> Describe the speaker's gender.",
        "<Speech><SpeechHere></Speech> Can you accurately identify the gender of the speaker?",
        "<Speech><SpeechHere></Speech> Can you distinguish the gender of the speaker?",
        "<Speech><SpeechHere></Speech> Describe the gender of the person speaking.",
        "<Speech><SpeechHere></Speech> What is the speaker's gender based on the audio?",
        "<Speech><SpeechHere></Speech> Tell me about the gender of the person you hear.",
        "<Speech><SpeechHere></Speech> Is the speaker male or female?"
    ],
    "emotion_recognition": [
        "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.",
        "<Speech><SpeechHere></Speech> Use one word to describe the speaker's emotion."
    ],
    "emotion_recognitions": [
        "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.",
        "<Speech><SpeechHere></Speech> Use one word to describe the speaker's emotion."
    ],
    "music_caption": [
        "<Speech><SpeechHere></Speech> Listen to this music clip and describe the music.",
        "<Speech><SpeechHere></Speech> Please describe the music.",
        "<Speech><SpeechHere></Speech> Provide a description of the music.",
        "<Speech><SpeechHere></Speech> Analyze the music in this clip and offer a description.",
        "<Speech><SpeechHere></Speech> Give me a description of the music in this clip."
    ],
    "lyric_recognize": [
        "<Speech><SpeechHere></Speech> Listen to the music and write down the music's lyric.",
        "<Speech><SpeechHere></Speech> What is the lyric of the music you heard?",
        "<Speech><SpeechHere></Speech> Please write down the lyric of the music.",
        "<Speech><SpeechHere></Speech> Write down the lyric of the music you heard.",
        "<Speech><SpeechHere></Speech> Recognize the lyric of the music you heard.",
        "<Speech><SpeechHere></Speech> Recognize the music and give me the lyric."
    ],
    "speaker_verification": [
        "<Speech><SpeechHere></Speech> Are the two people speaking successively the same person? Answer yes or no.",
        "<Speech><SpeechHere></Speech> Do you only hear the same person talking? Answer yes or no.",
        "<Speech><SpeechHere></Speech> Is only one person speaking in the audio? Answer yes or no."
    ],
    "music_type_classification": [
        "<Speech><SpeechHere></Speech> Describe the music type of the music in one word. Choose from ['reggae', 'disco', 'rock', 'metal', 'blues', 'classical', 'country', 'hiphop', 'jazz', 'pop']"
    ],
    "VocalSound_classification": [
        "<Speech><SpeechHere></Speech> Listen to the audio, and describe the vocal sound type. Choose from [laughter, sighs, coughs, throat clearing, sneezes, sniffs]."
    ],
    "zerospeech_recognition": [
        "<Speech><SpeechHere></Speech> Listen to the audio, and judge whether the audio is mute or not. Answer Yes or No."
    ],
    "speech_caption": [
        "<Speech><SpeechHere></Speech> Listen to the speech, and describe its timbre, speaking speed, and style.",
        "<Speech><SpeechHere></Speech> Listen to this speech and provide its caption (timbre, speaking speed, style, and so on).",
        "<Speech><SpeechHere></Speech> Describe the following speech in a caption (timbre, speaking speed, style, and so on)."
    ],
    "music_analysis": [
        "<Speech><SpeechHere></Speech> Listen carefully to the song. Identify: (1) main genre, (2) vocal gender & timbre, (3) instrumentation, (4) lyric, (5) mood progression.",
        "<Speech><SpeechHere></Speech> Please analyse the audio as a musical piece. Report the singer’s voice type, dominant instruments, tempo/BPM, key or mode, and summarise the lyric content.",    
        "<Speech><SpeechHere></Speech> After listening, give a detailed breakdown: lyric, style (e.g., pop, jazz), vocalist identity clues (gender, age range), production quality, emotional tone.",
        "<Speech><SpeechHere></Speech> Evaluate this music clip. Describe the genre, arrangement (rhythm section, melodic instruments), vocal technique, recording ambience, and provide the lyric summary."
    ],
    "speech_analysis": [
        "<Speech><SpeechHere></Speech> Listen to the speech, and tell us the speech's transcription, speaker information, emotion information, and so on.",
        "<Speech><SpeechHere></Speech> Listen to the spoken audio. Output: full transcription, speaker count & gender, speech rate (fast/medium/slow), emotional tone...",
        "<Speech><SpeechHere></Speech> Analyse this speech clip: transcribe it, classify emotion (e.g., calm, excited), describe prosody features (pauses, emphasis), and estimate speaker gender."
    ],
    "audio_caption": [
        "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.",
        "<Speech><SpeechHere></Speech> Describe the following audio in a caption.",
        "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.",
        "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?",
        "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?",
        "<Speech><SpeechHere></Speech> What does this audio describe?",
        "<Speech><SpeechHere></Speech> Please describe the audio."
    ],
    "audio caption": [
        "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.",
        "<Speech><SpeechHere></Speech> Describe the following audio in a caption.",
        "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.",
        "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?",
        "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?",
        "<Speech><SpeechHere></Speech> What does this audio describe?",
        "<Speech><SpeechHere></Speech> Please describe the audio."
    ],
    "speech_reasoning": [
        "<Speech><SpeechHere></Speech> Provide a multi-level analysis of this speech: Identify basic audio properties, Analyse phonetic structure and prosody, Transcribe the speech and extract key semantics."
    ],
    "sound_reasoning": [
        "<Speech><SpeechHere></Speech> Provide a fine-grained reasoning analysis for this sound: Level 1: Coarse / Surface; Level 2: Event / Acoustic; Level 3: Semantic."
    ],
    "music_reasoning": [
        "<Speech><SpeechHere></Speech> Provide a fine-grained understanding for this music: e.g. Audio quality, Genre family, Main instruments / presence of vocals, Musical / Acoustic, Semantic / Expressive"
    ],
    "music_3level": [
        "<Speech><SpeechHere></Speech> Perform a three-level music analysis: Level1 (Audio quality, Genre, Instruments/Vocals), Level2 (Tempo, Key, Meter, Melody, Timbre, Dynamics), Level3 (Mood, Lyrics, Techniques, Hook, Structure)"
    ],
    "sound_3level": [
        "<Speech><SpeechHere></Speech> Conduct three-level soundscape analysis: Level1 (Scene type, Spectral band), Level2 (Events, Patterns, Acoustics, Localization), Level3 (Sources, Emotion, Interactions)"
    ],
    "speech_3level": [
        "<Speech><SpeechHere></Speech> Conduct three-level speech analysis: Level1 (Audio quality, Speaker gender, Language), Level2 (Prosody, Rhythm, Pronunciation), Level3 (Transcription, Semantics, Emotion)"
    ]
}