| { | |
| "asr": [ | |
| "<Speech><SpeechHere></Speech> Can you transcribe the speech into a written format?", | |
| "<Speech><SpeechHere></Speech> Listen to the speech and write down its content.", | |
| "<Speech><SpeechHere></Speech> What is the content of the speech you heard?", | |
| "<Speech><SpeechHere></Speech> Please write down the transcription of the speech.", | |
| "<Speech><SpeechHere></Speech> Please transcribe the speech into a written format.", | |
| "<Speech><SpeechHere></Speech> Write down the content of the speech you heard.", | |
| "<Speech><SpeechHere></Speech> Can you write down the transcription of the speech?", | |
| "<Speech><SpeechHere></Speech> Put the speech into a written format.", | |
| "<Speech><SpeechHere></Speech> Please help me to transcribe the speech into a written format.", | |
| "<Speech><SpeechHere></Speech> Recognize the content of the speech you heard.", | |
| "<Speech><SpeechHere></Speech> Can you recognize what you heard in the speech?", | |
| "<Speech><SpeechHere></Speech> Recognize the speech and write it down in a written format.", | |
| "<Speech><SpeechHere></Speech> Listen to the speech and recognize its content." | |
| ], | |
| "audiocaption": [ | |
| "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.", | |
| "<Speech><SpeechHere></Speech> Describe the following audio in a caption.", | |
| "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.", | |
| "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?", | |
| "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?", | |
| "<Speech><SpeechHere></Speech> What does this audio describe?", | |
| "<Speech><SpeechHere></Speech> Please describe the audio." | |
| ], | |
| "audiocaption_v2": [ | |
| "<Speech><SpeechHere></Speech> Please write down what you hear in the audio." | |
| ], | |
| "QA": [ | |
| "<Speech><SpeechHere></Speech> {}" | |
| ], | |
| "inference_QA": [ | |
| "<Speech><SpeechHere></Speech> {}" | |
| ], | |
| "gender_QA": [ | |
| "<Speech><SpeechHere></Speech> {}" | |
| ], | |
| "gender_recognition": [ | |
| "<Speech><SpeechHere></Speech> What is the gender of the speaker?", | |
| "<Speech><SpeechHere></Speech> Use one word to describe the speaker's gender.", | |
| "<Speech><SpeechHere></Speech> Describe the speaker's gender.", | |
| "<Speech><SpeechHere></Speech> Can you accurately identify the gender of the speaker?", | |
| "<Speech><SpeechHere></Speech> Can you distinguish the gender of the speaker?", | |
| "<Speech><SpeechHere></Speech> Describe the gender of the person speaking.", | |
| "<Speech><SpeechHere></Speech> What is the speaker's gender based on the audio?", | |
| "<Speech><SpeechHere></Speech> Tell me about the gender of the person you hear.", | |
| "<Speech><SpeechHere></Speech> Is the speaker male or female?" | |
| ], | |
| "emotion_recognition": [ | |
| "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.", | |
| "<Speech><SpeechHere></Speech> Use one word to describe the speaker's emotion." | |
| ], | |
| "emotion_recognitions": [ | |
| "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.", | |
| "<Speech><SpeechHere></Speech> Use one word to describe the speaker's emotion." | |
| ], | |
| "music_caption": [ | |
| "<Speech><SpeechHere></Speech> Listen to this music clip and describe the music.", | |
| "<Speech><SpeechHere></Speech> Please describe the music.", | |
| "<Speech><SpeechHere></Speech> Provide a description of the music.", | |
| "<Speech><SpeechHere></Speech> Analyze the music in this clip and offer a description.", | |
| "<Speech><SpeechHere></Speech> Give me a description of the music in this clip." | |
| ], | |
| "lyric_recognize": [ | |
| "<Speech><SpeechHere></Speech> Listen to the music and write down the music's lyric.", | |
| "<Speech><SpeechHere></Speech> What is the lyric of the music you heard?", | |
| "<Speech><SpeechHere></Speech> Please write down the lyric of the music.", | |
| "<Speech><SpeechHere></Speech> Write down the lyric of the music you heard.", | |
| "<Speech><SpeechHere></Speech> Recognize the lyric of the music you heard.", | |
| "<Speech><SpeechHere></Speech> Recognize the music and give me the lyric." | |
| ], | |
| "speaker_verification": [ | |
| "<Speech><SpeechHere></Speech> Are the two people speaking successively the same person? Answer yes or no.", | |
| "<Speech><SpeechHere></Speech> Do you only hear the same person talking? Answer yes or no.", | |
| "<Speech><SpeechHere></Speech> Is only one person speaking in the audio? Answer yes or no." | |
| ], | |
| "music_type_classification": [ | |
| "<Speech><SpeechHere></Speech> Describe the music type of the music in one word. Choose from ['reggae', 'disco', 'rock', 'metal', 'blues', 'classical', 'country', 'hiphop', 'jazz', 'pop']" | |
| ], | |
| "VocalSound_classification": [ | |
| "<Speech><SpeechHere></Speech> Listen to the audio, and describe the vocal sound type. Choose from [laughter, sighs, coughs, throat clearing, sneezes, sniffs]." | |
| ], | |
| "zerospeech_recognition": [ | |
| "<Speech><SpeechHere></Speech> Listen to the audio, and judge whether the audio is mute or not. Answer Yes or No." | |
| ], | |
| "speech_caption": [ | |
| "<Speech><SpeechHere></Speech> Listen to the speech, and describe its timbre, speaking speed, and style.", | |
| "<Speech><SpeechHere></Speech> Listen to this speech and provide its caption (timbre, speaking speed, style, and so on).", | |
| "<Speech><SpeechHere></Speech> Describe the following speech in a caption (timbre, speaking speed, style, and so on)." | |
| ], | |
| "music_analysis": [ | |
| "<Speech><SpeechHere></Speech> Listen carefully to the song. Identify: (1) main genre, (2) vocal gender & timbre, (3) instrumentation, (4) lyric, (5) mood progression.", | |
| "<Speech><SpeechHere></Speech> Please analyse the audio as a musical piece. Report the singer’s voice type, dominant instruments, tempo/BPM, key or mode, and summarise the lyric content.", | |
| "<Speech><SpeechHere></Speech> After listening, give a detailed breakdown: lyric, style (e.g., pop, jazz), vocalist identity clues (gender, age range), production quality, emotional tone.", | |
| "<Speech><SpeechHere></Speech> Evaluate this music clip. Describe the genre, arrangement (rhythm section, melodic instruments), vocal technique, recording ambience, and provide the lyric summary." | |
| ], | |
| "speech_analysis": [ | |
| "<Speech><SpeechHere></Speech> Listen to the speech, and tell us the speech's transcription, speaker information, emotion information, and so on.", | |
| "<Speech><SpeechHere></Speech> Listen to the spoken audio. Output: full transcription, speaker count & gender, speech rate (fast/medium/slow), emotional tone...", | |
| "<Speech><SpeechHere></Speech> Analyse this speech clip: transcribe it, classify emotion (e.g., calm, excited), describe prosody features (pauses, emphasis), and estimate speaker gender." | |
| ], | |
| "audio_caption": [ | |
| "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.", | |
| "<Speech><SpeechHere></Speech> Describe the following audio in a caption.", | |
| "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.", | |
| "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?", | |
| "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?", | |
| "<Speech><SpeechHere></Speech> What does this audio describe?", | |
| "<Speech><SpeechHere></Speech> Please describe the audio." | |
| ], | |
| "audio caption": [ | |
| "<Speech><SpeechHere></Speech> Listen to this audio clip and provide its caption.", | |
| "<Speech><SpeechHere></Speech> Describe the following audio in a caption.", | |
| "<Speech><SpeechHere></Speech> Based on the sound you hear, create a caption for this audio.", | |
| "<Speech><SpeechHere></Speech> Can you describe the scene or event depicted in this audio?", | |
| "<Speech><SpeechHere></Speech> Could you summarise what's happening in this audio?", | |
| "<Speech><SpeechHere></Speech> What does this audio describe?", | |
| "<Speech><SpeechHere></Speech> Please describe the audio." | |
| ], | |
| "speech_reasoning": [ | |
| "<Speech><SpeechHere></Speech> Provide a multi-level analysis of this speech: Identify basic audio properties, Analyse phonetic structure and prosody, Transcribe the speech and extract key semantics." | |
| ], | |
| "sound_reasoning": [ | |
| "<Speech><SpeechHere></Speech> Provide a fine-grained reasoning analysis for this sound: Level 1: Coarse / Surface; Level 2: Event / Acoustic; Level 3: Semantic." | |
| ], | |
| "music_reasoning": [ | |
| "<Speech><SpeechHere></Speech> Provide a fine-grained understanding for this music: e.g. Audio quality, Genre family, Main instruments / presence of vocals, Musical / Acoustic, Semantic / Expressive" | |
| ], | |
| "music_3level": [ | |
| "<Speech><SpeechHere></Speech> Perform a three-level music analysis: Level1 (Audio quality, Genre, Instruments/Vocals), Level2 (Tempo, Key, Meter, Melody, Timbre, Dynamics), Level3 (Mood, Lyrics, Techniques, Hook, Structure)" | |
| ], | |
| "sound_3level": [ | |
| "<Speech><SpeechHere></Speech> Conduct three-level soundscape analysis: Level1 (Scene type, Spectral band), Level2 (Events, Patterns, Acoustics, Localization), Level3 (Sources, Emotion, Interactions)" | |
| ], | |
| "speech_3level": [ | |
| "<Speech><SpeechHere></Speech> Conduct three-level speech analysis: Level1 (Audio quality, Speaker gender, Language), Level2 (Prosody, Rhythm, Pronunciation), Level3 (Transcription, Semantics, Emotion)" | |
| ] | |
| } |