AlexWortega commited on
Commit
5a780b7
·
verified ·
1 Parent(s): 60425b2

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +37 -5
  2. app.py +213 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,44 @@
1
  ---
2
  title: Borealis Inference
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.1.0
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Borealis Inference
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ models:
12
+ - Vikhrmodels/Borealis-5b-it
13
  ---
14
 
15
+ # Borealis-5B-IT Inference
16
+
17
+ Audio-Language Model for Speech Understanding.
18
+
19
+ ## Features
20
+
21
+ - Upload audio or record from microphone
22
+ - Multiple prompt presets (transcription, summarization, Q&A)
23
+ - Support for Russian and English
24
+ - Customizable generation parameters
25
+
26
+ ## Model
27
+
28
+ - **Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
29
+ - **Parameters**: ~5B
30
+ - **Languages**: Russian, English
31
+
32
+ ## Usage
33
+
34
+ 1. Upload an audio file or record using microphone
35
+ 2. Select a prompt preset or write custom prompts
36
+ 3. Adjust generation parameters if needed
37
+ 4. Click "Generate" to get the response
38
+
39
+ **Note**: This Space runs on CPU, so generation may take some time.
40
+
41
+ ## Links
42
+
43
+ - [Model Card](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
44
+ - [Training Datasets](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions)
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio UI for Borealis Audio-Language Model (CPU Version)
3
+ """
4
+
5
+ import os
6
+ os.environ["HF_AUDIO_DECODER_BACKEND"] = "soundfile"
7
+
8
+ import torch
9
+ import gradio as gr
10
+ from transformers import AutoModel
11
+
12
+ # Force CPU
13
+ DEVICE = "cpu"
14
+
15
+ # Global model variable
16
+ model = None
17
+
18
def load_model():
    """Lazily instantiate the Borealis model and cache it in the module global.

    The model is downloaded and constructed on the first call only; every
    subsequent call returns the cached eval-mode instance.

    Returns:
        The loaded model, set to evaluation mode on CPU.
    """
    global model
    # Fast path: model already cached from a previous call.
    if model is not None:
        return model

    print("Loading Borealis model on CPU...")
    model = AutoModel.from_pretrained(
        "Vikhrmodels/Borealis-5b-it",
        trust_remote_code=True,
        device=DEVICE,
        torch_dtype=torch.float32,
    )
    model.eval()
    print("Model loaded!")
    return model
31
+
32
def _prepare_audio(sr, audio_array):
    """Convert a gradio (sample_rate, ndarray) pair to a mono 16 kHz float tensor in [-1, 1].

    Args:
        sr: Sample rate of the recording in Hz.
        audio_array: Raw samples from gr.Audio(type="numpy"); typically int16 PCM,
            shaped (samples,) or (samples, channels) — TODO confirm against gradio version.

    Returns:
        A 1-D torch.FloatTensor resampled to 16 kHz and normalized to [-1, 1].
    """
    import numpy as np

    # Scale integer PCM by the dtype's full range. The previous heuristic
    # (divide by 32768 only when max > 1.0) left int32 input far out of range
    # and skipped normalization for clips whose peak was exactly 1.
    if np.issubdtype(audio_array.dtype, np.integer):
        scale = float(-np.iinfo(audio_array.dtype).min)
        audio_tensor = torch.from_numpy(audio_array.astype(np.float32)) / scale
    else:
        audio_tensor = torch.as_tensor(audio_array, dtype=torch.float32)

    # Multichannel -> mono by averaging channels (gradio layout: (samples, channels)).
    if audio_tensor.dim() > 1:
        audio_tensor = audio_tensor.mean(dim=-1)

    # Fallback for float input that is not already normalized (preserves old behavior).
    if audio_tensor.abs().max() > 1.0:
        audio_tensor = audio_tensor / 32768.0

    # The Whisper-based encoder expects 16 kHz input.
    if sr != 16000:
        import torchaudio
        audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)

    return audio_tensor


def process_audio(audio, system_prompt, user_prompt, max_tokens, temperature, top_p):
    """Run the Borealis model on one audio clip and return the generated text.

    Args:
        audio: (sample_rate, ndarray) pair from gr.Audio(type="numpy"), or None.
        system_prompt: System prompt passed to the model.
        user_prompt: User prompt; audio placeholder tags are appended if absent.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature; 0 selects greedy decoding.
        top_p: Nucleus sampling threshold.

    Returns:
        The model's decoded text response, or an instruction string when no
        audio was provided.
    """
    if audio is None:
        return "Please upload or record an audio file."

    m = load_model()

    sr, audio_array = audio
    audio_tensor = _prepare_audio(sr, audio_array)

    # The model needs explicit markers for where the audio is spliced into the prompt.
    if "<|start_of_audio|>" not in user_prompt:
        user_prompt = f"{user_prompt} <|start_of_audio|><|end_of_audio|>"

    with torch.inference_mode():
        output = m.generate(
            audio=audio_tensor,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            # Greedy decoding at temperature 0 avoids zero-temperature sampling.
            do_sample=temperature > 0,
        )

    response = m.decode(output[0])
    return response
72
+
73
# Prompt presets as (name, system prompt, user prompt) rows. The dict built
# below preserves this insertion order, which drives the dropdown ordering.
_PRESET_ROWS = (
    ("Transcription (EN)",
     "You are a speech recognition assistant. Accurately transcribe audio to text.",
     "Transcribe this audio: <|start_of_audio|><|end_of_audio|>"),
    ("Transcription (RU)",
     "Ты ассистент по распознаванию речи. Точно транскрибируй аудио в текст.",
     "Транскрибируй это аудио: <|start_of_audio|><|end_of_audio|>"),
    ("Summarization (EN)",
     "You are a helpful voice assistant.",
     "Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>"),
    ("Summarization (RU)",
     "Ты полезный голосовой ассистент.",
     "Кратко перескажи содержание аудио: <|start_of_audio|><|end_of_audio|>"),
    ("Q&A (EN)",
     "You are a helpful voice assistant. Listen to the audio and respond appropriately.",
     "What is being discussed in this audio? <|start_of_audio|><|end_of_audio|>"),
    ("Q&A (RU)",
     "Ты полезный голосовой ассистент. Слушай аудио и отвечай на вопросы.",
     "О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>"),
    ("Description (EN)",
     "You are an attentive listener.",
     "Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>"),
    ("Description (RU)",
     "Ты внимательный слушатель.",
     "Опиши подробно, что ты слышишь: <|start_of_audio|><|end_of_audio|>"),
    ("Custom",
     "You are a helpful voice assistant.",
     "<|start_of_audio|><|end_of_audio|>"),
)

# Keyed by display name; each entry holds the "system" and "user" prompt texts.
PRESET_PROMPTS = {
    name: {"system": system_text, "user": user_text}
    for name, system_text, user_text in _PRESET_ROWS
}

def update_prompts(preset):
    """Return the (system, user) prompt pair for *preset*.

    Unknown preset names fall back to the "Custom" entry.
    """
    chosen = PRESET_PROMPTS.get(preset, PRESET_PROMPTS["Custom"])
    return chosen["system"], chosen["user"]
117
+
118
# Build Gradio interface: two-column layout with inputs on the left
# (audio, preset selector, prompts, generation sliders) and the model
# response on the right.
with gr.Blocks(title="Borealis Audio-Language Model") as demo:
    gr.Markdown("""
    # Borealis-5B-IT

    Audio-Language Model for Speech Understanding

    Upload or record audio, select a prompt preset or write your own, and generate a response.

    **Note**: Running on CPU, generation may take a while.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" delivers a (sample_rate, ndarray) pair to process_audio.
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",
                sources=["upload", "microphone"]
            )

            # Choices come from PRESET_PROMPTS in insertion order.
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_PROMPTS.keys()),
                value="Q&A (EN)",
                label="Prompt Preset"
            )

            # Both textboxes are pre-filled from the default preset and
            # overwritten whenever the dropdown selection changes.
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["system"],
                lines=2
            )

            user_prompt = gr.Textbox(
                label="User Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["user"],
                lines=2,
                info="Include <|start_of_audio|><|end_of_audio|> tags where audio should be placed"
            )

            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=128,
                    step=32,
                    label="Max Tokens"
                )

            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-p"
                )

            submit_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Model Response",
                lines=15
            )

    # Event handlers
    # Selecting a preset replaces the contents of both prompt textboxes.
    preset_dropdown.change(
        fn=update_prompts,
        inputs=[preset_dropdown],
        outputs=[system_prompt, user_prompt]
    )

    # Generate: runs the model on the current audio + prompts + sliders.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, system_prompt, user_prompt, max_tokens, temperature, top_p],
        outputs=[output_text]
    )

    gr.Markdown("""
    ---
    **Model**: [Vikhrmodels/Borealis-5b-it](https://huggingface.co/Vikhrmodels/Borealis-5b-it)

    **Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)

    **Training Data**: [Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions), [Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe), [ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks), [AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5)
    """)

# Launch the app only when run as a script (HF Spaces imports app.py directly).
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers>=4.40.0
4
+ safetensors
5
+ soundfile
6
+ librosa