Spaces:
Running
Running
File size: 7,170 Bytes
"""
Gradio UI for Borealis Audio-Language Model (CPU Version)
"""
import os
os.environ["HF_AUDIO_DECODER_BACKEND"] = "soundfile"
import torch
import gradio as gr
from transformers import AutoModel
# Force CPU: load_model() passes this value as the `device` argument when it
# loads the model.
DEVICE = "cpu"
# Global model variable: stays None until the first load_model() call, which
# stores the loaded model here so later calls can reuse it.
model = None
def load_model():
    """Return the shared Borealis model, loading it on the first call.

    The instance is kept in the module-level ``model`` global, so the
    checkpoint is loaded at most once per process; later calls hand back
    the cached object without reloading.

    Returns:
        The object produced by ``AutoModel.from_pretrained`` after
        ``eval()`` has been called on it.
    """
    global model
    if model is not None:
        # Already loaded by an earlier call.
        return model

    print("Loading Borealis model on CPU...")
    load_options = {
        "trust_remote_code": True,
        "device": DEVICE,
        "torch_dtype": torch.float32,
    }
    model = AutoModel.from_pretrained("Vikhrmodels/Borealis-5b-it", **load_options)
    model.eval()
    print("Model loaded!")
    return model
def process_audio(audio, system_prompt, user_prompt, max_tokens, temperature, top_p):
    """Run the Borealis model on an audio clip and return the decoded reply.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when nothing was
            provided. ``samples`` may be mono or multi-channel with the
            channels on the last axis, as integer PCM or floating point.
        system_prompt: System prompt forwarded to the model.
        user_prompt: User prompt; the audio placeholder tags are appended
            when the start tag is missing.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; ``0`` disables sampling.
        top_p: Nucleus-sampling threshold forwarded to the model.

    Returns:
        The decoded model response, or a short user-facing hint when no
        usable audio was supplied.
    """
    no_audio_message = "Please upload or record an audio file."
    if audio is None:
        return no_audio_message

    sr, audio_array = audio
    raw = torch.tensor(audio_array)
    if raw.numel() == 0:
        # An empty clip would make the peak computation below raise; reject
        # it here, before paying for the model load.
        return no_audio_message

    # Integer PCM is scaled by the full-scale value of its own dtype
    # (32768 for int16, 2**31 for int32) instead of guessing from the peak:
    # the peak heuristic mis-scales int32 input and leaves very quiet int16
    # clips unscaled. Unsigned PCM is scaled but not re-centred.
    is_integer_pcm = not (
        raw.is_floating_point() or raw.is_complex() or raw.dtype == torch.bool
    )
    int_full_scale = None
    if is_integer_pcm:
        int_full_scale = float(torch.iinfo(raw.dtype).max) + 1.0

    # Convert to a float tensor and fold channels down to mono.
    audio_tensor = raw.float()
    if audio_tensor.dim() > 1:
        audio_tensor = audio_tensor.mean(dim=-1)  # Convert stereo to mono

    # Normalize to [-1, 1] if needed
    if int_full_scale is not None:
        audio_tensor = audio_tensor / int_full_scale
    elif audio_tensor.abs().max() > 1.0:
        # Float input outside [-1, 1] is treated as int16-range samples
        # stored as floats (same fallback as before).
        audio_tensor = audio_tensor / 32768.0

    # Resample if needed
    target_sr = 16000
    if sr != target_sr:
        import torchaudio  # lazy import: only needed for non-16 kHz input

        audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sr)

    # Ensure audio tags in prompt
    if "<|start_of_audio|>" not in user_prompt:
        user_prompt = f"{user_prompt} <|start_of_audio|><|end_of_audio|>"

    # Load the model only once the input is known to be usable.
    m = load_model()
    with torch.inference_mode():
        output = m.generate(
            audio=audio_tensor,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            # UI sliders may deliver the value as a float; a token count
            # must be an integer.
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
        )
    response = m.decode(output[0])
    return response
# Preset prompts, one row per preset: (label, system prompt, user prompt).
# Every user prompt carries the audio placeholder tags.
_PRESET_ROWS = (
    (
        "Transcription (EN)",
        "You are a speech recognition assistant. Accurately transcribe audio to text.",
        "Transcribe this audio: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Transcription (RU)",
        "Ты ассистент по распознаванию речи. Точно транскрибируй аудио в текст.",
        "Транскрибируй это аудио: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Summarization (EN)",
        "You are a helpful voice assistant.",
        "Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Summarization (RU)",
        "Ты полезный голосовой ассистент.",
        "Кратко перескажи содержание аудио: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Q&A (EN)",
        "You are a helpful voice assistant. Listen to the audio and respond appropriately.",
        "What is being discussed in this audio? <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Q&A (RU)",
        "Ты полезный голосовой ассистент. Слушай аудио и отвечай на вопросы.",
        "О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Description (EN)",
        "You are an attentive listener.",
        "Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Description (RU)",
        "Ты внимательный слушатель.",
        "Опиши подробно, что ты слышишь: <|start_of_audio|><|end_of_audio|>",
    ),
    (
        "Custom",
        "You are a helpful voice assistant.",
        "<|start_of_audio|><|end_of_audio|>",
    ),
)
# Maps each preset label to its {"system": ..., "user": ...} prompt pair,
# in the order the rows are listed above.
PRESET_PROMPTS = {
    label: {"system": system_text, "user": user_text}
    for label, system_text, user_text in _PRESET_ROWS
}
def update_prompts(preset):
    """Look up the prompt pair for a preset label.

    Labels that are not present in ``PRESET_PROMPTS`` resolve to the
    "Custom" entry.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple.
    """
    fallback = PRESET_PROMPTS["Custom"]
    selected = PRESET_PROMPTS.get(preset, fallback)
    system_text = selected["system"]
    user_text = selected["user"]
    return system_text, user_text
# Build Gradio interface
# Layout: a single row with two equally scaled columns -- inputs and
# generation controls on the left, the model response on the right.
with gr.Blocks(title="Borealis Audio-Language Model") as demo:
    # Page header.
    gr.Markdown("""
# Borealis-5B-IT
Audio-Language Model for Speech Understanding
Upload or record audio, select a prompt preset or write your own, and generate a response.
**Note**: Running on CPU, generation may take a while.
""")
    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy": process_audio unpacks the value it receives as
            # (sample_rate, samples).
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",
                sources=["upload", "microphone"]
            )
            # The dropdown lists every PRESET_PROMPTS key; the two text boxes
            # below start out filled from the same default preset.
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_PROMPTS.keys()),
                value="Q&A (EN)",
                label="Prompt Preset"
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["system"],
                lines=2
            )
            user_prompt = gr.Textbox(
                label="User Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["user"],
                lines=2,
                info="Include <|start_of_audio|><|end_of_audio|> tags where audio should be placed"
            )
            # Generation controls.
            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=128,
                    step=32,
                    label="Max Tokens"
                )
            with gr.Row():
                # A temperature of 0 makes process_audio pass do_sample=False.
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-p"
                )
            submit_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Model Response",
                lines=15
            )
    # Event handlers
    # Choosing a preset overwrites both prompt boxes with that preset's texts.
    preset_dropdown.change(
        fn=update_prompts,
        inputs=[preset_dropdown],
        outputs=[system_prompt, user_prompt]
    )
    # The inputs are listed in the same order as process_audio's parameters.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, system_prompt, user_prompt, max_tokens, temperature, top_p],
        outputs=[output_text]
    )
    # Page footer with links to the model and its training data.
    gr.Markdown("""
---
**Model**: [Vikhrmodels/Borealis-5b-it](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
**Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
**Training Data**: [Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions), [Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe), [ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks), [AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5)
""")
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|