Spaces:

Vikhrmodels
/

Borealis-inference

Running

App Files Files Community

AlexWortega committed on Dec 15, 2025

Commit

5a780b7

verified ·

1 Parent(s): 60425b2

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +37 -5
app.py +213 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,12 +1,44 @@
 ---
 title: Borealis Inference
-emoji: 🦀
-colorFrom: yellow
-colorTo: yellow
 sdk: gradio
-sdk_version: 6.1.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Borealis Inference
+emoji: 🎙️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 5.9.1
 app_file: app.py
 pinned: false
+license: apache-2.0
+models:
+  - Vikhrmodels/Borealis-5b-it
 ---
+# Borealis-5B-IT Inference
+Audio-Language Model for Speech Understanding.
+## Features
+- Upload audio or record from microphone
+- Multiple prompt presets (transcription, summarization, Q&A)
+- Support for Russian and English
+- Customizable generation parameters
+## Model
+- **Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
+- **Parameters**: ~5B
+- **Languages**: Russian, English
+## Usage
+1. Upload an audio file or record using microphone
+2. Select a prompt preset or write custom prompts
+3. Adjust generation parameters if needed
+4. Click "Generate" to get the response
+**Note**: This Space runs on CPU, so generation may take some time.
+## Links
+- [Model Card](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
+- [Training Datasets](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions)

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+"""
+Gradio UI for Borealis Audio-Language Model (CPU Version)
+"""
+import os
+os.environ["HF_AUDIO_DECODER_BACKEND"] = "soundfile"
+import torch
+import gradio as gr
+from transformers import AutoModel
+# Force CPU: this is the device string handed to from_pretrained() in load_model().
+DEVICE = "cpu"
+# Global model variable: stays None until load_model() populates it on first use.
+model = None
+def load_model():
+    global model
+    if model is None:
+        print("Loading Borealis model on CPU...")
+        model = AutoModel.from_pretrained(
+            "Vikhrmodels/Borealis-5b-it",
+            trust_remote_code=True,
+            device=DEVICE,
+            torch_dtype=torch.float32,
+        )
+        model.eval()
+        print("Model loaded!")
+    return model
+def process_audio(audio, system_prompt, user_prompt, max_tokens, temperature, top_p):
+    """Process audio and generate response."""
+    if audio is None:
+        return "Please upload or record an audio file."
+    m = load_model()
+    sr, audio_array = audio
+    # Convert to torch tensor and normalize
+    audio_tensor = torch.tensor(audio_array).float()
+    if audio_tensor.dim() > 1:
+        audio_tensor = audio_tensor.mean(dim=-1)  # Convert stereo to mono
+    # Normalize to [-1, 1] if needed
+    if audio_tensor.abs().max() > 1.0:
+        audio_tensor = audio_tensor / 32768.0
+    # Resample if needed
+    if sr != 16000:
+        import torchaudio
+        audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)
+    # Ensure audio tags in prompt
+    if "<|start_of_audio|>" not in user_prompt:
+        user_prompt = f"{user_prompt} <|start_of_audio|><|end_of_audio|>"
+    with torch.inference_mode():
+        output = m.generate(
+            audio=audio_tensor,
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=temperature > 0,
+        )
+    response = m.decode(output[0])
+    return response
+# Preset prompts
+# Maps the preset name shown in the dropdown to a {"system", "user"} prompt pair.
+# Every "user" prompt already contains the <|start_of_audio|><|end_of_audio|>
+# placeholder tags. "Custom" is also the fallback used by update_prompts().
+PRESET_PROMPTS = {
+    "Transcription (EN)": {
+        "system": "You are a speech recognition assistant. Accurately transcribe audio to text.",
+        "user": "Transcribe this audio: <|start_of_audio|><|end_of_audio|>"
+    },
+    "Transcription (RU)": {
+        "system": "Ты ассистент по распознаванию речи. Точно транскрибируй аудио в текст.",
+        "user": "Транскрибируй это аудио: <|start_of_audio|><|end_of_audio|>"
+    },
+    "Summarization (EN)": {
+        "system": "You are a helpful voice assistant.",
+        "user": "Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>"
+    },
+    "Summarization (RU)": {
+        "system": "Ты полезный голосовой ассистент.",
+        "user": "Кратко перескажи содержание аудио: <|start_of_audio|><|end_of_audio|>"
+    },
+    "Q&A (EN)": {
+        "system": "You are a helpful voice assistant. Listen to the audio and respond appropriately.",
+        "user": "What is being discussed in this audio? <|start_of_audio|><|end_of_audio|>"
+    },
+    "Q&A (RU)": {
+        "system": "Ты полезный голосовой ассистент. Слушай аудио и отвечай на вопросы.",
+        "user": "О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>"
+    },
+    "Description (EN)": {
+        "system": "You are an attentive listener.",
+        "user": "Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>"
+    },
+    "Description (RU)": {
+        "system": "Ты внимательный слушатель.",
+        "user": "Опиши подробно, что ты слышишь: <|start_of_audio|><|end_of_audio|>"
+    },
+    # Minimal preset: generic system prompt and a user prompt holding only the audio tags.
+    "Custom": {
+        "system": "You are a helpful voice assistant.",
+        "user": "<|start_of_audio|><|end_of_audio|>"
+    }
+}
+def update_prompts(preset):
+    """Update prompts based on selected preset."""
+    prompts = PRESET_PROMPTS.get(preset, PRESET_PROMPTS["Custom"])
+    return prompts["system"], prompts["user"]
+# Build Gradio interface
+# Components are created inside nested context managers; the nesting and the
+# creation order define the page layout, so statements here are order-sensitive.
+with gr.Blocks(title="Borealis Audio-Language Model") as demo:
+    gr.Markdown("""
+    # Borealis-5B-IT
+    Audio-Language Model for Speech Understanding
+    Upload or record audio, select a prompt preset or write your own, and generate a response.
+    **Note**: Running on CPU, generation may take a while.
+    """)
+    # One row with two equally weighted columns: inputs first, model output second.
+    with gr.Row():
+        with gr.Column(scale=1):
+            # type="numpy" because process_audio unpacks the value as (sample_rate, array).
+            audio_input = gr.Audio(
+                label="Audio Input",
+                type="numpy",
+                sources=["upload", "microphone"]
+            )
+            # The initial preset must match the initial values of the two textboxes below.
+            preset_dropdown = gr.Dropdown(
+                choices=list(PRESET_PROMPTS.keys()),
+                value="Q&A (EN)",
+                label="Prompt Preset"
+            )
+            system_prompt = gr.Textbox(
+                label="System Prompt",
+                value=PRESET_PROMPTS["Q&A (EN)"]["system"],
+                lines=2
+            )
+            user_prompt = gr.Textbox(
+                label="User Prompt",
+                value=PRESET_PROMPTS["Q&A (EN)"]["user"],
+                lines=2,
+                info="Include <|start_of_audio|><|end_of_audio|> tags where audio should be placed"
+            )
+            # Generation parameters, passed to process_audio as max_tokens, temperature and top_p.
+            with gr.Row():
+                max_tokens = gr.Slider(
+                    minimum=32,
+                    maximum=512,
+                    value=128,
+                    step=32,
+                    label="Max Tokens"
+                )
+            with gr.Row():
+                # A temperature of 0.0 makes process_audio pass do_sample=False.
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.5,
+                    value=0.7,
+                    step=0.1,
+                    label="Temperature"
+                )
+                top_p = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.05,
+                    label="Top-p"
+                )
+            submit_btn = gr.Button("Generate", variant="primary")
+        with gr.Column(scale=1):
+            output_text = gr.Textbox(
+                label="Model Response",
+                lines=15
+            )
+    # Event handlers
+    # Choosing a preset overwrites both prompt textboxes with that preset's text.
+    preset_dropdown.change(
+        fn=update_prompts,
+        inputs=[preset_dropdown],
+        outputs=[system_prompt, user_prompt]
+    )
+    # The inputs list order must match the parameter order of process_audio.
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input, system_prompt, user_prompt, max_tokens, temperature, top_p],
+        outputs=[output_text]
+    )
+    gr.Markdown("""
+    ---
+    **Model**: [Vikhrmodels/Borealis-5b-it](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
+    **Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
+    **Training Data**: [Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions), [Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe), [ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks), [AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5)
+    """)
+# Start the app only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+torchaudio
+transformers>=4.40.0
+safetensors
+soundfile
+librosa