{#-
  Chat template for an audio-transcription prompt.

  Context variables read:
    system_prompt  optional; falls back to the default transcription instruction below.
    audio_token    optional; falls back to the box-start marker below.
    messages       iterable of messages; each is indexed by 'role' and 'content',
                   and 'content' is iterated as items indexed by 'type' (and 'text').

  Output: one system turn, then one user turn per message whose role is 'user'.
  Messages with any other role are skipped, and no assistant/generation prompt
  is appended by this template.

  Every tag uses whitespace control, so the line breaks and indentation in this
  file are never rendered; the only separators emitted are the explicit single
  spaces written below.
-#}
{%- set system_prompt = system_prompt | default("You are a helpful assistant that transcribes audio input into text output in JSON format.") -%}
{%- set audio_token = audio_token | default("<|box_start|>") -%}
{#- The audio token is always wrapped in the object-ref start/end markers. -#}
{%- set audio_placeholder = "<|object_ref_start|>" ~ audio_token ~ "<|object_ref_end|>" -%}

{#- System turn. -#}
<|im_start|>system {{ system_prompt }}<|im_end|>

{#- User turns: the loop filter drops every message whose role is not 'user'. -#}
{%- for turn in messages if turn['role'] == 'user' -%}
    {#- The first 'text' item of the message, if any, is used as extra context
        for every audio item in the same message. -#}
    {%- set text_parts = turn['content'] | selectattr('type', 'equalto', 'text') | list -%}
    {%- set extra_info = text_parts[0]['text'] if text_parts else none -%}

    {{- ' ' }}<|im_start|>user{{ ' ' -}}

    {#- Only 'audio' items produce output; each one emits the placeholder
        followed by the transcription instruction. The AUDIO_DURATION marker is
        emitted literally (presumably substituted downstream; verify against
        the caller). -#}
    {%- for part in turn['content'] if part['type'] == 'audio' -%}
        {{- audio_placeholder }}{{ ' ' -}}
        {%- if extra_info -%}
            This is a <|AUDIO_DURATION|> seconds audio, with extra info: {{ extra_info }} Please transcribe it with these keys: Start time, End time, Speaker ID, Content
        {%- else -%}
            This is a <|AUDIO_DURATION|> seconds audio, please transcribe it with these keys: Start time, End time, Speaker ID, Content
        {%- endif -%}
    {%- endfor -%}

    <|im_end|>{{ ' ' -}}
{%- endfor -%}