| """ | |
| Gradio UI for Borealis Audio-Language Model (CPU Version) | |
| """ | |
| import os | |
| os.environ["HF_AUDIO_DECODER_BACKEND"] = "soundfile" | |
import torch
import gradio as gr
from transformers import AutoModel

# Force CPU
DEVICE = "cpu"

# Global model variable
model = None
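
# The model is loaded lazily on the first request rather than at import time,
# so the Space's web server can come up before the weights are downloaded.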
def load_model():
    global model
    if model is None:
        print("Loading Borealis model on CPU...")
        model = AutoModel.from_pretrained(
            "Vikhrmodels/Borealis-5b-it",
            trust_remote_code=True,
            device=DEVICE,
            torch_dtype=torch.float32,
        )
        model.eval()
        print("Model loaded!")
    return model
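
# The preprocessing below targets what a Whisper-style encoder expects:
# float32 mono PCM in [-1, 1], sampled at 16 kHz.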
def process_audio(audio, system_prompt, user_prompt, max_tokens, temperature, top_p):
    """Process audio and generate a response."""
    if audio is None:
        return "Please upload or record an audio file."

    m = load_model()

    # Gradio's type="numpy" audio component yields a (sample_rate, array) tuple.
    sr, audio_array = audio

    # Convert to a torch tensor and normalize
    audio_tensor = torch.tensor(audio_array).float()
    if audio_tensor.dim() > 1:
        audio_tensor = audio_tensor.mean(dim=-1)  # Convert stereo to mono

    # Normalize to [-1, 1] if needed (Gradio typically delivers int16 PCM)
    if audio_tensor.abs().max() > 1.0:
        audio_tensor = audio_tensor / 32768.0

    # Resample to the 16 kHz the encoder expects
    if sr != 16000:
        import torchaudio
        audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)

    # Ensure the prompt contains audio placeholder tags
    if "<|start_of_audio|>" not in user_prompt:
        user_prompt = f"{user_prompt} <|start_of_audio|><|end_of_audio|>"

    with torch.inference_mode():
        output = m.generate(
            audio=audio_tensor,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
        )

    response = m.decode(output[0])
    return response
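
# Quick local smoke test (hypothetical sample file): call process_audio directly
# with a WAV loaded via soundfile, bypassing the UI. For example, in a REPL:
#   import soundfile as sf
#   pcm, rate = sf.read("sample.wav", dtype="int16")
#   print(process_audio((rate, pcm), "You are a helpful voice assistant.",
#                       "<|start_of_audio|><|end_of_audio|>", 128, 0.7, 0.9))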
# Preset prompts
PRESET_PROMPTS = {
    "Transcription (EN)": {
        "system": "You are a speech recognition assistant. Accurately transcribe audio to text.",
        "user": "Transcribe this audio: <|start_of_audio|><|end_of_audio|>"
    },
    "Transcription (RU)": {
        "system": "Ты ассистент по распознаванию речи. Точно транскрибируй аудио в текст.",
        "user": "Транскрибируй это аудио: <|start_of_audio|><|end_of_audio|>"
    },
    "Summarization (EN)": {
        "system": "You are a helpful voice assistant.",
        "user": "Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>"
    },
    "Summarization (RU)": {
        "system": "Ты полезный голосовой ассистент.",
        "user": "Кратко перескажи содержание аудио: <|start_of_audio|><|end_of_audio|>"
    },
    "Q&A (EN)": {
        "system": "You are a helpful voice assistant. Listen to the audio and respond appropriately.",
        "user": "What is being discussed in this audio? <|start_of_audio|><|end_of_audio|>"
    },
    "Q&A (RU)": {
        "system": "Ты полезный голосовой ассистент. Слушай аудио и отвечай на вопросы.",
        "user": "О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>"
    },
    "Description (EN)": {
        "system": "You are an attentive listener.",
        "user": "Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>"
    },
    "Description (RU)": {
        "system": "Ты внимательный слушатель.",
        "user": "Опиши подробно, что ты слышишь: <|start_of_audio|><|end_of_audio|>"
    },
    "Custom": {
        "system": "You are a helpful voice assistant.",
        "user": "<|start_of_audio|><|end_of_audio|>"
    }
}
def update_prompts(preset):
    """Update prompts based on the selected preset."""
    prompts = PRESET_PROMPTS.get(preset, PRESET_PROMPTS["Custom"])
    return prompts["system"], prompts["user"]
# Build Gradio interface
with gr.Blocks(title="Borealis Audio-Language Model") as demo:
    gr.Markdown("""
    # Borealis-5B-IT
    Audio-Language Model for Speech Understanding

    Upload or record audio, select a prompt preset or write your own, and generate a response.

    **Note**: this Space runs on CPU, so generation may take a while.
    """)
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",
                sources=["upload", "microphone"]
            )
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_PROMPTS.keys()),
                value="Q&A (EN)",
                label="Prompt Preset"
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["system"],
                lines=2
            )
            user_prompt = gr.Textbox(
                label="User Prompt",
                value=PRESET_PROMPTS["Q&A (EN)"]["user"],
                lines=2,
                info="Include <|start_of_audio|><|end_of_audio|> tags where audio should be placed"
            )
            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=128,
                    step=32,
                    label="Max Tokens"
                )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-p"
                )
            submit_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Model Response",
                lines=15
            )
    # Event handlers
    preset_dropdown.change(
        fn=update_prompts,
        inputs=[preset_dropdown],
        outputs=[system_prompt, user_prompt]
    )
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, system_prompt, user_prompt, max_tokens, temperature, top_p],
        outputs=[output_text]
    )
    gr.Markdown("""
    ---
    **Model**: [Vikhrmodels/Borealis-5b-it](https://huggingface.co/Vikhrmodels/Borealis-5b-it)
    **Architecture**: Whisper Large V3 (encoder) + Qwen3-4B (LLM)
    **Training Data**: [Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions), [Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe), [ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks), [AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5)
    """)
if __name__ == "__main__":
    demo.launch()