{
  "text_to_audio": [
    "Generate an audio clip based on the given text description.",
    "Synthesize an audio signal from the given text, ensuring the fidelity of sound event representation and the naturalness of the audio output.",
    "Convert the input text into an audio format, ensuring that the temporal dynamics and spectral characteristics of the resulting audio match the described sound events.",
    "Synthesize a realistic audio clip from the text, capturing detailed sound event attributes (loudness, spatial position, repetition rate, duration), scene context, and temporal relationships between events.",
    "Generate a high-fidelity audio output from the text, capturing detailed acoustic characteristics, environmental context, and the temporal relationships between different sound events.",
    "Create an audio clip from the provided text, accurately representing all described sound events with precise attributes and maintaining natural scene context.",
    "Produce an audio representation of the given text, ensuring accurate sound event synthesis with proper spatial, temporal, and spectral attributes, as well as scene-appropriate acoustics.",
    "Convert the given text into a natural-sounding audio clip, maintaining high fidelity in sound event reproduction (volume, positioning, timing, repetition) and ensuring realistic scene acoustics and event relationships.",
    "Convert the input text into a realistic audio file, where sound events are rendered with correct dynamics (volume, location, duration, repetition) and fit naturally within the described acoustic scene.",
    "Create a lifelike audio segment from the provided text, faithfully replicating sound event details."
  ],
  "singing_voice_synthesis": [
    "Generate a singing voice from musical score input (phonemes, notes, durations, slurs) while ensuring accurate pronunciation and allowing control over speaker timbre.",
    "Produce a singing performance from the given symbolic notation comprising phonemes, notes, durations, and slurs.",
    "Generate a singing voice based on the provided phoneme sequences, note pitches, temporal durations, and slur connections.",
    "Synthesize a singing performance from the input musical score data, including phonemes, notes, durations, and slurs.",
    "Convert musical notation (phonemes, notes, durations, slurs) into a natural singing voice that faithfully reproduces the melodic contours while adapting to the textual style of the input.",
    "Generate a singing voice from phoneme and note information, with particular attention to correct slur articulation and speaker timbre customization options.",
    "Synthesize a singing voice that matches the input musica score's specifications (phonemes, notes, durations, slurs) while adapting phoneme durations for natural flow and preserving textual emotional tone.",
    "Render a singing performance from musical notation, including phonemes, notes, durations, and slurs.",
    "Produce a singing voice rendering derived from the notated score that maintains parametric fidelity to the given phonemes, notes, durations, and slurs.",
    "Synthesize a vocal output from the symbolic score input that demonstrates accurate correspondence to the provided phoneme sequences, pitch values, temporal durations, and legato markings."
  ],
  "text_to_speech": [
    "Generate natural speech from speaker embeddings and phoneme sequences while maintaining accurate pronunciation.",
    "Synthesize speech waveforms conditioned on speaker embeddings and phonemes.",
    "Produce human-like speech from phoneme inputs and speaker representations.",
    "Convert phoneme sequences into natural speech using speaker embeddings, with precise articulation of words and adaptation to the textual emotional content.",
    "Generate contextually appropriate speech from phonemic inputs that matches the target regional accent while maintaining natural rhythm and pacing.",
    "Synthesize expressive speech conditioned on speaker identity embeddings and phonemes, with accurate pronunciation.",
    "Create speech outputs from phoneme sequences and speaker vectors that exhibit appropriate emotional expressiveness.",
    "Generate prosodically natural speech from phoneme inputs using speaker embeddings, with precise control over vocal timbre.",
    "Produce intelligible speech from phoneme sequences conditioned on speaker embeddings.",
    "Synthesize emotionally appropriate speech from phoneme inputs and speaker representations that maintains natural prosodic contours and clear pronunciation."
  ],
  "speech_enhancement": [
    "Enhance noisy speech signals by reducing background noise and reverberation.",
    "Improve degraded speech quality by suppressing noise and reverberation while preserving natural voice characteristics.",
    "Boost speech intelligibility in adverse recordings by suppressing ambient noise and minimizing reverberation, preserving the natural vocal attributes of the speaker.",
    "Enhance speech signals by dynamically suppressing diverse noise types (environmental/mechanical) and reverberation, preserving tonal qualities and timbre across varying SNR conditions.",
    "Enhance degraded speech signals by suppressing background noise and reverberation while preserving the natural voice characteristics including pitch and timbre.",
    "Improve speech clarity by reducing acoustic noise and reverberation effects in corrupted recordings, maintaining the speaker's original vocal qualities.",
    "Restore degraded speech by attenuating environmental noise and echo artifacts while conserving the fundamental acoustic properties of the human voice.",
    "Enhance speech recordings to increase signal-to-noise ratio and reduce reverberation.",
    "Clean contaminated speech signals by removing additive noise and suppressing reverberation effects.",
    "Enhance distorted speech signals through noise reduction and dereverberation processing while protecting the integrity of vocal pitch and speaker timbre."
  ],
  "audio_super_resolution": [
    "Enhance audio quality by increasing its sampling rate or resolution.",
    "Upsample low-resolution audio while preserving its original quality and details.",
    "Convert low-sampling-rate audio to high-resolution output, recovering lost high-frequency components and subtle sonic characteristics.",
    "Perform audio super-resolution that restores missing high-frequency details and subtle features in low-SR inputs while minimizing artifacts, applicable to various audio types.",
    "Upsample low-resolution audio signals to higher sampling rates while preserving original signal details and recovering high-frequency components without introducing audible artifacts.",
    "Enhance the resolution of low-quality audio by reconstructing missing high-frequency information and subtle acoustic features.",
    "Perform audio super-resolution to increase the sampling rate of input signals, carefully restoring spectral details.",
    "Transform low-sampling-rate audio into high-resolution output by accurately reconstructing high-frequency content.",
    "Generate high-resolution audio from low-quality inputs through precise upsampling.",
    "Perform audio super-resolution that enhances signal quality by recovering high-frequency details."
  ],
  "video_to_audio": [
    "Generate audio for a video, matching visible sound sources.",
    "Create synchronized audio for the video, ensuring sounds align precisely with visual events.",
    "Produce high-quality audio that matches the video\u2019s scene, with accurate timing, spatial positioning, and realistic sound properties.",
    "Generate high-fidelity audio for the video, ensuring strict temporal alignment, correct spatial direction, loudness, and frequency of sounds, while maintaining realism and coherence with visual content.",
    "Generate realistic audio from video input by predicting sound events for visible objects, ensuring precise temporal alignment with visual events and maintaining high audio fidelity.",
    "Synthesize spatially-accurate audio for video content that matches visible sound sources while maintaining temporal synchronization.",
    "Produce plausible sound effects for video scenes by analyzing visual content, with generated audio properly reflecting the physical properties of sound sources and their environment.",
    "Create high-quality audio tracks synchronized to video frames.",
    "Generate high-fidelity audio synchronized to video.",
    "Synthesize realistic sound effects for video that are temporally aligned to visual events."
  ],
  "text_to_music": [
    "Develop a music clip that precisely matches the textual description in all aspects.",
    "Construct a musical segment that accurately interprets the provided text description.",
    "Create music from text with accurate instrumentation, stylistic elements, emotional tone, and vocal attributes (including gender, age and vocal style).",
    "Produce a musical piece that faithfully represents the given description, incorporating all specified instruments, intended emotions, genre characteristics, and vocal properties.",
    "Generate an audio track that accurately reflects the text description, including specified instruments, musical styles, and emotional dynamics, while maintaining audio quality and adhering to vocal specifications when provided.",
    "Synthesize music based on the given prompt by correctly integrating required instruments and musical style, while applying appropriate vocal characteristics (gender and texture).",
    "Produce a song from the text input that precisely adheres to specified instruments, genre conventions, vocal properties (gender, age, style), expressive qualities, harmonic progressions, and audio quality.",
    "Create a music track that accurately matches the textual prompt by correctly implementing all required elements: instrumentation, musical style, dynamic mood variations, and vocal styling.",
    "Generate a musical composition based on the provided text description that accurately incorporates specified instruments, genre, emotional tone, and melodic development, including vocal characteristics (gender/age/timbre/style) if mentioned, and ensure the audio quality meets description.",
    "Generate a musical output that perfectly matches the provided text, incorporating the exact instruments mentioned, upholding authentic stylistic qualities, and delivering the desired emotional impact. If vocals are required, precisely implement the described gender, age, vocal properties, and singing manner."
  ]
}