{ "asr": [ " Can you transcribe the speech into a written format?", " Listen to the speech and write down its content.", " What is the content of the speech you heard?", " Please write down the transcription of the speech.", " Please transcribe the speech into a written format.", " Write down the content of the speech you heard.", " Can you write down the transcription of the speech?", " Put the speech into a written format.", " Please help me to transcribe the speech into a written format.", " Recognize the content of the speech you heard.", " Can you recognize what you heard in the speech?", " Recognize the speech and write it down in a written format.", " Listen to the speech and recognize its content." ], "audiocaption": [ " Listen to this audio clip and provide its caption.", " Describe the following audio in a caption.", " Based on the sound you hear, create a caption for this audio.", " Can you describe the scene or event depicted in this audio?", " Could you summarise what's happening in this audio?", " What does this audio describe?", " Please describe the audio." ], "audiocaption_v2": [ " Please write down what you hear in the audio." ], "QA": [ " {}" ], "inference_QA": [ " {}" ], "gender_QA": [ " {}" ], "gender_recognition": [ " What is the gender of the speaker?", " Use one word to describe the speaker's gender.", " Describe the speaker's gender.", " Can you accurately identify the gender of the speaker?", " Can you distinguish the gender of the speaker?", " Describe the gender of the person speaking.", " What is the speaker's gender based on the audio?", " Tell me about the gender of the person you hear.", " Is the speaker male or female?" ], "emotion_recognition": [ " Describe the emotion of the speaker in one word.", " Use one word to describe the speaker's emotion." ], "emotion_recognitions": [ " Describe the emotion of the speaker in one word.", " Use one word to describe the speaker's emotion." 
], "music_caption": [ " Listen to this music clip and describe the music.", " Please describe the music.", " Provide a description of the music.", " Analyze the music in this clip and offer a description.", " Give me a description of the music in this clip." ], "lyric_recognize": [ " Listen to the music and write down the music's lyric.", " What is the lyric of the music you heard?", " Please write down the lyric of the music.", " Write down the lyric of the music you heard.", " Recognize the lyric of the music you heard.", " Recognize the music and give me the lyric." ], "speaker_verification": [ " Are the two people speaking successively the same person? Answer yes or no.", " Do you only hear the same person talking? Answer yes or no.", " Is only one person speaking in the audio? Answer yes or no." ], "music_type_classification": [ " Describe the music type of the music in one word. Choose from ['reggae', 'disco', 'rock', 'metal', 'blues', 'classical', 'country', 'hiphop', 'jazz', 'pop']" ], "VocalSound_classification": [ " Listen to the audio, and describe the vocal sound type. Choose from [laughter, sighs, coughs, throat clearing, sneezes, sniffs]." ], "zerospeech_recognition": [ " Listen to the audio and judge whether the audio is mute or not. Answer Yes or No" ], "speech_caption": [ " Listen to the speech and describe its timbre, speaking speed, and style.", " Listen to this speech and provide its caption (timbre, speaking speed, style, and so on).", " Describe the following speech in a caption (timbre, speaking speed, style, and so on)." ], "music_analysis": [ " Listen carefully to the song. Identify: (1) main genre, (2) vocal gender & timbre, (3) instrumentation, (4) lyric, (5) mood progression.", " Please analyse the audio as a musical piece. 
Report the singer’s voice type, dominant instruments, tempo/BPM, key or mode, and summarise the lyric content.", " After listening, give a detailed breakdown: lyric, style (e.g., pop, jazz), vocalist identity clues (gender, age range), production quality, emotional tone.", " Evaluate this music clip. Describe the genre, arrangement (rhythm section, melodic instruments), vocal technique, recording ambience, and provide the lyric summary." ], "speech_analysis": [ " Listen to the speech, and tell us the speech's transcription, speaker information, emotion information, and so on.", " Listen to the spoken audio. Output: full transcription, speaker count & gender, speech rate (fast/medium/slow), emotional tone...", " Analyse this speech clip: transcribe it, classify emotion (e.g., calm, excited), describe prosody features (pauses, emphasis), and estimate speaker gender." ], "audio_caption": [ " Listen to this audio clip and provide its caption.", " Describe the following audio in a caption.", " Based on the sound you hear, create a caption for this audio.", " Can you describe the scene or event depicted in this audio?", " Could you summarise what's happening in this audio?", " What does this audio describe?", " Please describe the audio." ], "audio caption": [ " Listen to this audio clip and provide its caption.", " Describe the following audio in a caption.", " Based on the sound you hear, create a caption for this audio.", " Can you describe the scene or event depicted in this audio?", " Could you summarise what's happening in this audio?", " What does this audio describe?", " Please describe the audio." ], "speech_reasoning": [ " Provide a multi-level analysis of this speech: Identify basic audio properties, Analyse phonetic structure and prosody, Transcribe the speech and extract key semantics." ], "sound_reasoning": [ " Provide a fine-grained reasoning analysis for this sound: Level 1: Coarse / Surface; 
Level 2: Event / Acoustic; Level 3: Semantic" ], "music_reasoning": [ " Provide a fine-grained understanding for this music: e.g. Audio quality, Genre family, Main instruments / presence of vocals, Musical / Acoustic, Semantic / Expressive" ], "music_3level": [ " Perform a three-level music analysis: Level1 (Audio quality, Genre, Instruments/Vocals), Level2 (Tempo, Key, Meter, Melody, Timbre, Dynamics), Level3 (Mood, Lyrics, Techniques, Hook, Structure)" ], "sound_3level": [ " Conduct three-level soundscape analysis: Level1 (Scene type, Spectral band), Level2 (Events, Patterns, Acoustics, Localization), Level3 (Sources, Emotion, Interactions)" ], "speech_3level": [ " Conduct three-level speech analysis: Level1 (Audio quality, Speaker gender, Language), Level2 (Prosody, Rhythm, Pronunciation), Level3 (Transcription, Semantics, Emotion)" ] }