# Source: HuggingFace Space file (uploaded via huggingface_hub), revision 5a780b7.
"""
Gradio UI for Borealis Audio-Language Model (CPU Version)
"""
import os
# Select the soundfile backend for HF audio decoding. Set before importing
# transformers so it is visible at import time (NOTE(review): assumed
# ordering requirement — confirm against transformers docs).
os.environ["HF_AUDIO_DECODER_BACKEND"] = "soundfile"
import torch
import gradio as gr
from transformers import AutoModel
# Force CPU inference; no GPU is assumed in this deployment.
DEVICE = "cpu"
# Lazily-initialized model cache; populated by load_model() on first use.
model = None
def load_model():
    """Load the Borealis model once and cache it in the module global.

    The first call downloads/initializes ``Vikhrmodels/Borealis-5b-it`` on
    CPU in float32 and switches it to eval mode; subsequent calls return
    the cached instance immediately.

    Returns:
        The loaded (and cached) Borealis model.
    """
    global model
    if model is None:
        print("Loading Borealis model on CPU...")
        model = AutoModel.from_pretrained(
            "Vikhrmodels/Borealis-5b-it",
            trust_remote_code=True,  # model ships custom code on the Hub
            device=DEVICE,
            torch_dtype=torch.float32,  # fp32 for CPU inference
        )
        model.eval()  # disable dropout etc. for inference
        print("Model loaded!")
    return model
def process_audio(audio, system_prompt, user_prompt, max_tokens, temperature, top_p):
    """Run the Borealis model on an uploaded/recorded audio clip.

    Args:
        audio: Gradio "numpy" audio value — a ``(sample_rate, np.ndarray)``
            tuple, or ``None`` when nothing was provided.
        system_prompt: System instruction passed to the model.
        user_prompt: User prompt; the audio placeholder tags are appended
            automatically if missing.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature; 0 selects greedy decoding.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded model response, or a hint string when no audio is given.
    """
    if audio is None:
        return "Please upload or record an audio file."
    m = load_model()
    sr, audio_array = audio
    # Gradio may deliver int16 PCM or float samples; work in float32.
    audio_tensor = torch.tensor(audio_array).float()
    if audio_tensor.dim() > 1:
        # (samples, channels) -> mono by averaging the channel axis.
        audio_tensor = audio_tensor.mean(dim=-1)
    # Magnitudes above 1.0 indicate raw int16 PCM; rescale to [-1, 1].
    if audio_tensor.abs().max() > 1.0:
        audio_tensor = audio_tensor / 32768.0
    # The model's audio encoder expects 16 kHz input; resample if needed.
    if sr != 16000:
        import torchaudio
        audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)
    # The prompt must contain the audio placeholder tags exactly once.
    if "<|start_of_audio|>" not in user_prompt:
        user_prompt = f"{user_prompt} <|start_of_audio|><|end_of_audio|>"
    with torch.inference_mode():
        output = m.generate(
            audio=audio_tensor,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,  # temperature 0 -> greedy decoding
        )
    response = m.decode(output[0])
    return response
# Prompt presets shown in the dropdown. Each entry maps a UI label to the
# system/user prompt pair; the <|start_of_audio|><|end_of_audio|> tags mark
# where the audio embedding is spliced into the user prompt.
PRESET_PROMPTS = {
    "Transcription (EN)": {
        "system": "You are a speech recognition assistant. Accurately transcribe audio to text.",
        "user": "Transcribe this audio: <|start_of_audio|><|end_of_audio|>",
    },
    "Transcription (RU)": {
        "system": "Ты ассистент по распознаванию речи. Точно транскрибируй аудио в текст.",
        "user": "Транскрибируй это аудио: <|start_of_audio|><|end_of_audio|>",
    },
    "Summarization (EN)": {
        "system": "You are a helpful voice assistant.",
        "user": "Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>",
    },
    "Summarization (RU)": {
        "system": "Ты полезный голосовой ассистент.",
        "user": "Кратко перескажи содержание аудио: <|start_of_audio|><|end_of_audio|>",
    },
    "Q&A (EN)": {
        "system": "You are a helpful voice assistant. Listen to the audio and respond appropriately.",
        "user": "What is being discussed in this audio? <|start_of_audio|><|end_of_audio|>",
    },
    "Q&A (RU)": {
        "system": "Ты полезный голосовой ассистент. Слушай аудио и отвечай на вопросы.",
        "user": "О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>",
    },
    "Description (EN)": {
        "system": "You are an attentive listener.",
        "user": "Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>",
    },
    "Description (RU)": {
        "system": "Ты внимательный слушатель.",
        "user": "Опиши подробно, что ты слышишь: <|start_of_audio|><|end_of_audio|>",
    },
    # Fallback preset for free-form prompting.
    "Custom": {
        "system": "You are a helpful voice assistant.",
        "user": "<|start_of_audio|><|end_of_audio|>",
    },
}
def update_prompts(preset):
    """Return the (system, user) prompt pair for the selected preset.

    Unknown preset names fall back to the "Custom" entry so the UI never
    receives an error from a stale dropdown value.

    Args:
        preset: The preset label chosen in the dropdown.

    Returns:
        Tuple of (system prompt text, user prompt text).
    """
    prompts = PRESET_PROMPTS.get(preset, PRESET_PROMPTS["Custom"])
    return prompts["system"], prompts["user"]
# Build the Gradio interface: inputs/controls in the left column,
# model output in the right column.
with gr.Blocks(title="Borealis Audio-Language Model") as demo:
    gr.Markdown("""
# Borealis-5B-IT
Audio-Language Model for Speech Understanding
Upload or record audio, select a prompt preset or write your own, and generate a response.
**Note**: Running on CPU, generation may take a while.
""")
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",  # delivers (sample_rate, np.ndarray) to process_audio
                sources=["upload", "microphone"],
            )
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_PROMPTS.keys()),
                value="Q&A (EN)",
                label="Prompt Preset",
            )
            # Textboxes start from the default preset so the UI is usable
            # before the first dropdown change event.
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["system"],
                lines=2,
            )
            user_prompt = gr.Textbox(
                label="User Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["user"],
                lines=2,
                info="Include <|start_of_audio|><|end_of_audio|> tags where audio should be placed",
            )
            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=128,
                    step=32,
                    label="Max Tokens",
                )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-p",
                )
            submit_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Model Response",
                lines=15,
            )
    # Event handlers: preset selection rewrites both prompt boxes;
    # the button runs inference over the current inputs.
    preset_dropdown.change(
        fn=update_prompts,
        inputs=[preset_dropdown],
        outputs=[system_prompt, user_prompt],
    )
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, system_prompt, user_prompt, max_tokens, temperature, top_p],
        outputs=[output_text],
    )
    gr.Markdown("""
---
**Model**: [Vikhrmodels/Borealis-5b-it](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
**Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
**Training Data**: [Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions), [Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe), [ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks), [AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5)
""")

if __name__ == "__main__":
    demo.launch()