Upload folder using huggingface_hub
- README.md +174 -0
- config.json +16 -0
- model.safetensors +3 -0
- modeling_borealis.py +436 -0
- push_model.py +63 -0
README.md
ADDED
@@ -0,0 +1,174 @@
---
license: apache-2.0
language:
- ru
- en
pipeline_tag: audio-text-to-text
tags:
- audio
- speech
- multimodal
- whisper
- qwen
library_name: transformers
---

# Borealis-5B-IT

Borealis is an audio-language model that combines a Whisper encoder with a Qwen3-4B LLM for speech understanding and instruction-following tasks.

## Model Description

- **Audio Encoder**: Whisper Large V3 (frozen)
- **Language Model**: Qwen3-4B (fine-tuned)
- **Adapter**: 2-layer MLP projecting audio embeddings into the LLM embedding space (see the sketch below)
- **Total Parameters**: ~5B
- **Languages**: Russian, English
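The adapter's input width follows from the 4x downsampler, which concatenates four adjacent 1280-dim Whisper frames into one 5120-dim vector before projecting to the LLM width. A minimal sketch of that shape contract (sizes taken from `config.json`; the 1500-frame count assumes a full 30-second clip, and the tensors are illustrative):

```python
import torch

audio_hidden, k, llm_hidden = 1280, 4, 2560  # from config.json

frames = torch.randn(1500, audio_hidden)            # ~30 s of Whisper encoder output
stacked = frames.view(1500 // k, audio_hidden * k)  # (375, 5120): 4 frames per row

w_in = torch.nn.Linear(audio_hidden * k, llm_hidden, bias=False)
w_out = torch.nn.Linear(llm_hidden, llm_hidden, bias=False)
audio_tokens = w_out(torch.nn.functional.gelu(w_in(stacked)))
print(audio_tokens.shape)  # torch.Size([375, 2560]): 375 audio "tokens" for the LLM
```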
## Installation

```bash
pip install transformers torch torchaudio safetensors
```

## Quick Start

```python
import torch
import torchaudio
from transformers import AutoModel

# Load model
model = AutoModel.from_pretrained(
    "Vikhrmodels/Borealis-5b-it",
    trust_remote_code=True,
    device="cuda",
)
model.eval()

# Load audio
audio, sr = torchaudio.load("your_audio.wav")
if sr != 16000:
    audio = torchaudio.functional.resample(audio, sr, 16000)
audio = audio.squeeze()

# Generate response
with torch.inference_mode():
    output_ids = model.generate(
        audio=audio,
        user_prompt="What is being said in this audio? <|start_of_audio|><|end_of_audio|>",
        system_prompt="You are a helpful voice assistant.",
        max_new_tokens=256,
        temperature=0.7,
    )

response = model.decode(output_ids[0])
print(response)
```

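`generate` also accepts a list of waveforms, in which case the same prompt is applied to every clip in the batch. A sketch based on the `generate` signature above (`audio_a` and `audio_b` are placeholder 1-D, 16 kHz tensors):

```python
clips = [audio_a, audio_b]
output_ids = model.generate(
    audio=clips,
    user_prompt="Transcribe this audio: <|start_of_audio|><|end_of_audio|>",
)
for ids in output_ids:
    print(model.decode(ids))
```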
## Prompt Examples

### Audio Transcription
```python
output = model.generate(
    audio=audio,
    user_prompt="Transcribe this audio: <|start_of_audio|><|end_of_audio|>",
    system_prompt="You are a speech recognition assistant. Accurately transcribe audio to text.",
)
```

### Audio Summarization
```python
output = model.generate(
    audio=audio,
    user_prompt="Summarize what is said in this recording: <|start_of_audio|><|end_of_audio|>",
    system_prompt="You are a helpful voice assistant.",
)
```

### Audio Q&A (Russian)
```python
output = model.generate(
    audio=audio,
    # "What is this audio recording about?"
    user_prompt="О чём говорится в этой аудиозаписи? <|start_of_audio|><|end_of_audio|>",
    # "You are a helpful voice assistant."
    system_prompt="Ты полезный голосовой ассистент.",
)
```

### Content Description
```python
output = model.generate(
    audio=audio,
    user_prompt="Describe in detail what you hear: <|start_of_audio|><|end_of_audio|>",
    system_prompt="You are an attentive listener.",
)
```

### Emotion Analysis
```python
output = model.generate(
    audio=audio,
    user_prompt="What emotions does the speaker express? <|start_of_audio|><|end_of_audio|>",
    system_prompt="You are an expert in audio analysis.",
)
```

## Training Data

The model was fine-tuned on a diverse mix of audio-instruction datasets:

| Dataset | Description | Size |
|---------|-------------|------|
| [Vikhrmodels/Speech-Instructions](https://huggingface.co/datasets/Vikhrmodels/Speech-Instructions) | General speech instruction-following | 70k |
| [Vikhrmodels/Speech-Describe](https://huggingface.co/datasets/Vikhrmodels/Speech-Describe) | Audio description tasks (speech & non-speech) | ~2M |
| [Vikhrmodels/ToneBooks](https://huggingface.co/datasets/Vikhrmodels/ToneBooks) | Russian audiobook excerpts | - |
| [Vikhrmodels/AudioBooksInstructGemini2.5](https://huggingface.co/datasets/Vikhrmodels/AudioBooksInstructGemini2.5) | Instruction data generated with Gemini 2.5 | - |

## Model Architecture

```
Audio Input (16 kHz)
         │
         ▼
┌─────────────────┐
│ Whisper Large V3│ (Frozen)
│     Encoder     │
└────────┬────────┘
         │ (1280-dim embeddings)
         ▼
┌─────────────────┐
│   Downsampler   │ (4x temporal reduction)
│    + Adapter    │
└────────┬────────┘
         │ (2560-dim embeddings)
         ▼
┌─────────────────┐
│    Qwen3-4B     │ (Fine-tuned)
│       LLM       │
└────────┬────────┘
         │
         ▼
    Text Output
```

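For a rough sense of the sequence budget (an estimate assuming a full, padded 30-second input; exact counts depend on padding):

```python
encoder_frames = 1500                  # Whisper large-v3 states for a 30 s clip
audio_positions = encoder_frames // 4  # after 4x downsampling -> 375
concat_dim = 1280 * 4                  # adapter input width -> 5120
print(audio_positions, concat_dim)     # 375 audio embeddings, each projected to 2560-dim
```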
## Limitations

- Optimized for audio up to 30 seconds
- Best performance on Russian and English
- May not handle heavily noisy audio well

## Citation

```bibtex
@misc{borealis2025,
  title={Borealis: Audio-Language Model for Speech Understanding},
  author={VikhrModels},
  year={2025},
  publisher={HuggingFace},
  url={https://huggingface.co/Vikhrmodels/Borealis-5b-it}
}
```

## License

Apache 2.0
config.json
ADDED
@@ -0,0 +1,16 @@
{
  "architectures": ["BorealisForConditionalGeneration"],
  "model_type": "borealis",
  "whisper_model_name": "openai/whisper-large-v3",
  "llm_model_name": "Qwen/Qwen3-4B",
  "downsample_factor": 4,
  "audio_hidden_size": 1280,
  "llm_hidden_size": 2560,
  "torch_dtype": "bfloat16",
  "auto_map": {
    "AutoConfig": "modeling_borealis.BorealisConfig",
    "AutoModel": "modeling_borealis.BorealisForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_borealis.BorealisForConditionalGeneration"
  },
  "transformers_version": "4.48.0"
}
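The `auto_map` block is what routes `trust_remote_code` loading to the custom classes in `modeling_borealis.py`; a minimal sketch of the resolution (standard transformers behavior, not Borealis-specific code):

```python
from transformers import AutoConfig, AutoModel

# AutoConfig reads config.json, sees model_type "borealis" plus auto_map,
# fetches modeling_borealis.py from the repo, and instantiates BorealisConfig.
config = AutoConfig.from_pretrained("Vikhrmodels/Borealis-5b-it", trust_remote_code=True)

# AutoModel follows auto_map["AutoModel"] to BorealisForConditionalGeneration.
model = AutoModel.from_pretrained("Vikhrmodels/Borealis-5b-it", trust_remote_code=True)
```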
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4de2e0360cdf08396a69adb3c7f78c3db5a27998ce7effbe02f57676649a82b
size 10133496400
modeling_borealis.py
ADDED
@@ -0,0 +1,436 @@
"""
Borealis: Audio-Language Model for Speech Understanding

This model combines a Whisper encoder with a Qwen3 LLM for audio understanding tasks.
"""

import math
import os
from typing import List, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_file
from transformers import (
    AutoTokenizer,
    PretrainedConfig,
    PreTrainedModel,
    Qwen3ForCausalLM,
    WhisperFeatureExtractor,
    WhisperModel,
)


class BorealisConfig(PretrainedConfig):
    """Configuration class for the Borealis model."""

    model_type = "borealis"

    def __init__(
        self,
        whisper_model_name: str = "openai/whisper-large-v3",
        llm_model_name: str = "Qwen/Qwen3-4B",
        downsample_factor: int = 4,
        audio_hidden_size: int = 1280,
        llm_hidden_size: int = 2560,
        torch_dtype: str = "bfloat16",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.whisper_model_name = whisper_model_name
        self.llm_model_name = llm_model_name
        self.downsample_factor = downsample_factor
        self.audio_hidden_size = audio_hidden_size
        self.llm_hidden_size = llm_hidden_size
        self.torch_dtype = torch_dtype


class AudioLanguageAdapter(nn.Module):
    """Adapter module that projects audio embeddings into the LLM embedding space."""

    def __init__(self, hidden_size: int, dim: int) -> None:
        super().__init__()
        self.w_in = nn.Linear(hidden_size, dim, bias=False)
        self.gelu = nn.GELU()
        self.w_out = nn.Linear(dim, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_out(self.gelu(self.w_in(x)))

class BorealisForConditionalGeneration(PreTrainedModel):
    """
    Borealis model for audio-to-text generation.

    Combines a Whisper encoder for audio processing with a Qwen3 LLM for text
    generation. Supports instruction-following tasks on audio input.
    """

    config_class = BorealisConfig
    base_model_prefix = "borealis"
    supports_gradient_checkpointing = True
    _no_split_modules = ["AudioLanguageAdapter"]

    def __init__(self, config: BorealisConfig):
        super().__init__(config)
        self.config = config

        # These will be loaded in from_pretrained or set manually.
        self.encoder = None
        self.llm = None
        self.tokenizer = None
        self.feature_extractor = None

        self.downsample_factor = config.downsample_factor

        # Initialize the adapter; its input width is the concatenation of
        # `downsample_factor` adjacent encoder frames.
        self.adapter = AudioLanguageAdapter(
            hidden_size=config.audio_hidden_size * config.downsample_factor,
            dim=config.llm_hidden_size,
        )

        # Special token IDs (set after the tokenizer is loaded).
        self.audio_start_id = None
        self.audio_end_id = None
        self.im_start_id = None
        self.im_end_id = None

    def _setup_special_tokens(self):
        """Set up special token IDs after the tokenizer is loaded."""
        if self.tokenizer is not None:
            self.audio_start_id = self.tokenizer.convert_tokens_to_ids("<|start_of_audio|>")
            self.audio_end_id = self.tokenizer.convert_tokens_to_ids("<|end_of_audio|>")
            self.im_start_id = self.tokenizer.convert_tokens_to_ids("<|im_start|>")
            self.im_end_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")

    def _downsample(self, seq: torch.Tensor) -> torch.Tensor:
        """Downsample an audio sequence by concatenating adjacent frames."""
        k, (T, d) = self.downsample_factor, seq.shape
        # Pad T up to a multiple of k so the reshape below is exact.
        target = k * math.ceil(T / k)
        if target != T:
            seq = F.pad(seq, (0, 0, 0, target - T))
        return seq.contiguous().view(target // k, d * k)

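    # Worked example (illustrative): with downsample_factor k=4, an encoder output
    # of shape (1500, 1280) needs no padding (1500 is divisible by 4) and becomes a
    # (375, 5120) tensor; an input of shape (1499, 1280) would first be zero-padded
    # to (1500, 1280) and yield the same output shape.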
    def _process_audio(self, mel) -> tuple:
        """Process mel spectrograms through the encoder and adapter."""
        B, device = len(mel), mel[0][0].device
        audio_embs = []
        audio_mask = []
        per_sample_T = []
        max_T = 0

        # Encode each sample's chunks, flatten the chunk outputs into one long
        # sequence, and downsample it.
        for b in range(B):
            chunk_stack = torch.stack(mel[b])
            enc_chunks = self.encoder(
                input_features=chunk_stack, return_dict=True
            ).last_hidden_state
            enc_long = enc_chunks.view(-1, enc_chunks.size(-1))
            ds_long = self._downsample(enc_long)
            audio_embs.append(ds_long)
            per_sample_T.append(ds_long.size(0))
            max_T = max(max_T, ds_long.size(0))

        # Right-pad every sample to the batch maximum and build the matching
        # attention mask (1 for real frames, 0 for padding).
        for i in range(B):
            pad = max_T - per_sample_T[i]
            mask = torch.ones(per_sample_T[i], dtype=torch.long, device=device)
            if pad > 0:
                audio_embs[i] = F.pad(audio_embs[i], (0, 0, 0, pad))
                mask = F.pad(mask, (0, pad), value=0)
            audio_mask.append(mask)

        audio_embeddings = torch.stack(audio_embs)
        audio_mask = torch.stack(audio_mask)
        audio_embeddings = self.adapter(audio_embeddings)

        return audio_embeddings, audio_mask, per_sample_T

    def prepare_audio(
        self,
        audio: Union[torch.Tensor, List[torch.Tensor]],
        sampling_rate: int = 16000,
    ) -> List[List[torch.Tensor]]:
        """
        Prepare raw audio waveforms for the model.

        Args:
            audio: Audio waveform(s) as tensor(s). Can be:
                - a single tensor of shape (samples,)
                - a list of tensors
            sampling_rate: Audio sampling rate (default: 16000)

        Returns:
            List of mel spectrogram chunks ready for the model
        """
        if self.feature_extractor is None:
            raise ValueError("Feature extractor not loaded. Call load_components() first.")

        if isinstance(audio, torch.Tensor) and audio.dim() == 1:
            audio = [audio]

        device = next(self.parameters()).device
        mel_chunks = []

        for audio_sample in audio:
            if isinstance(audio_sample, torch.Tensor):
                audio_np = audio_sample.cpu().numpy()
            else:
                audio_np = audio_sample

            # Pad or truncate to 30 s and compute log-mel features.
            mel = self.feature_extractor(
                audio_np,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                padding="max_length",
                max_length=30 * sampling_rate,
                truncation=True,
            ).input_features.to(device).to(self.dtype)

            mel_chunks.append([mel.squeeze(0)])

        return mel_chunks

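    # Note (illustrative): for whisper-large-v3 the feature extractor emits a
    # (128, 3000) log-mel tensor per 30 s chunk, so each mel_chunks entry is a
    # one-element list holding a (128, 3000) tensor.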
    def load_components(self, device: str = "cuda"):
        """
        Load the Whisper encoder, LLM, tokenizer, and feature extractor.

        Args:
            device: Device to load the models on
        """
        dtype = getattr(torch, self.config.torch_dtype)

        # Load the Whisper encoder and freeze it.
        whisper = WhisperModel.from_pretrained(
            self.config.whisper_model_name,
            torch_dtype=dtype,
        )
        self.encoder = whisper.encoder.to(device)
        self.encoder.eval()
        for p in self.encoder.parameters():
            p.requires_grad = False

        # Load the feature extractor.
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
            self.config.whisper_model_name
        )

        # Load the LLM.
        self.llm = Qwen3ForCausalLM.from_pretrained(
            self.config.llm_model_name,
            torch_dtype=dtype,
            attn_implementation="sdpa",
        ).to(device)

        # Load the tokenizer and register the audio boundary tokens.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.llm_model_name,
            trust_remote_code=True,
        )
        self.tokenizer.add_special_tokens({
            "additional_special_tokens": ["<|start_of_audio|>", "<|end_of_audio|>"]
        })
        self.llm.resize_token_embeddings(len(self.tokenizer))

        # Cache the special token IDs.
        self._setup_special_tokens()

        # Move the adapter to the target device and dtype.
        self.adapter = self.adapter.to(device).to(dtype)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        *model_args,
        device: str = "cuda",
        load_components: bool = True,
        **kwargs,
    ):
        """
        Load a pretrained Borealis model.

        Args:
            pretrained_model_name_or_path: Path or HuggingFace model ID
            device: Device to load on
            load_components: Whether to automatically load the Whisper/LLM components
            **kwargs: Additional arguments passed to PreTrainedModel.from_pretrained

        Returns:
            BorealisForConditionalGeneration model
        """
        config = kwargs.pop("config", None)
        if config is None:
            config = BorealisConfig.from_pretrained(pretrained_model_name_or_path)

        model = cls(config)

        # Locate the checkpoint, preferring safetensors over a pickled state dict.
        if os.path.isdir(pretrained_model_name_or_path):
            weights_path = os.path.join(pretrained_model_name_or_path, "model.safetensors")
            if not os.path.exists(weights_path):
                weights_path = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
        else:
            from huggingface_hub import hf_hub_download

            try:
                weights_path = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="model.safetensors",
                )
            except Exception:
                weights_path = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="pytorch_model.bin",
                )

        if weights_path.endswith(".safetensors"):
            state_dict = load_file(weights_path)
        else:
            state_dict = torch.load(weights_path, map_location="cpu", weights_only=False)

        # Load the adapter weights from the checkpoint.
        adapter_state = {
            k.replace("adapter.", ""): v
            for k, v in state_dict.items()
            if k.startswith("adapter.")
        }
        model.adapter.load_state_dict(adapter_state)

        if load_components:
            model.load_components(device=device)

            # Load encoder weights if present in the checkpoint.
            encoder_state = {
                k.replace("encoder.", ""): v
                for k, v in state_dict.items()
                if k.startswith("encoder.")
            }
            if encoder_state:
                model.encoder.load_state_dict(encoder_state, strict=False)

            # Load LLM weights if present.
            llm_state = {
                k.replace("llm.", ""): v
                for k, v in state_dict.items()
                if k.startswith("llm.")
            }
            if llm_state:
                model.llm.load_state_dict(llm_state, strict=False)

        return model.to(device)

    @torch.inference_mode()
    def generate(
        self,
        audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
        mel: Optional[List[List[torch.Tensor]]] = None,
        system_prompt: Optional[str] = None,
        user_prompt: Optional[str] = None,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        do_sample: bool = True,
        **kwargs,
    ):
        """
        Generate a text response for audio input.

        Args:
            audio: Raw audio waveform(s). Either audio or mel must be provided.
            mel: Pre-processed mel spectrograms. Either audio or mel must be provided.
            system_prompt: System prompt for the model
            user_prompt: User prompt (should contain <|start_of_audio|><|end_of_audio|> tags)
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            do_sample: Whether to use sampling
            **kwargs: Additional generation arguments

        Returns:
            Generated token IDs
        """
        if audio is not None:
            mel = self.prepare_audio(audio)
        elif mel is not None:
            if not isinstance(mel, list) or len(mel) == 0 or not isinstance(mel[0], list):
                mel = [mel]
            mel = [[c.to(self.dtype) for c in m] for m in mel]
        else:
            raise ValueError("Either audio or mel must be provided")

        B, device = len(mel), mel[0][0].device

        audio_embeddings, audio_mask, per_sample_T = self._process_audio(mel)

        if system_prompt is None:
            system_prompt = "You are a helpful voice assistant. Listen to the audio and respond appropriately."
        if user_prompt is None:
            user_prompt = "What is being said in this audio? <|start_of_audio|><|end_of_audio|>"
        elif "<|start_of_audio|>" not in user_prompt:
            user_prompt = f"{user_prompt}\n<|start_of_audio|><|end_of_audio|>"

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        chat_text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        model_inputs = self.tokenizer(chat_text, return_tensors="pt").to(device)

        # The same prompt is shared across the batch.
        input_ids = model_inputs.input_ids.repeat(B, 1)
        text_att_mask = model_inputs.attention_mask.repeat(B, 1)

        text_embeddings = self.llm.get_input_embeddings()(input_ids)

        # Locate the audio boundary tokens in the prompt.
        sa_idx = (input_ids[0] == self.audio_start_id).nonzero(as_tuple=True)[0].item()
        ea_idx = (input_ids[0] == self.audio_end_id).nonzero(as_tuple=True)[0].item()

        # Splice the audio embeddings between <|start_of_audio|> and <|end_of_audio|>.
        inputs_embeds = []
        full_att_mask = []

        for b in range(B):
            prefix_emb = text_embeddings[b, : sa_idx + 1]
            postfix_emb = text_embeddings[b, ea_idx:]
            emb = torch.cat([prefix_emb, audio_embeddings[b], postfix_emb], dim=0)

            prefix_mask = text_att_mask[b, : sa_idx + 1]
            postfix_mask = text_att_mask[b, ea_idx:]
            mask = torch.cat([prefix_mask, audio_mask[b], postfix_mask], dim=0)

            inputs_embeds.append(emb)
            full_att_mask.append(mask)

        inputs_embeds = torch.nn.utils.rnn.pad_sequence(
            inputs_embeds, batch_first=True, padding_value=0.0
        )
        att_mask = torch.nn.utils.rnn.pad_sequence(
            full_att_mask, batch_first=True, padding_value=0
        )

        gen_ids = self.llm.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=att_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            **kwargs,
        )

        return gen_ids

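    # Resulting per-sample sequence layout (illustrative):
    #   [prompt tokens ... <|start_of_audio|>] [audio embeddings] [<|end_of_audio|> ... generation prompt]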
    def decode(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> str:
        """Decode token IDs to text."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
push_model.py
ADDED
@@ -0,0 +1,63 @@
"""Push the Borealis model to the HuggingFace Hub."""

import os

import torch
from huggingface_hub import HfApi, create_repo
from safetensors.torch import save_file, save_model

# Config
HF_REPO = "Vikhrmodels/Borealis-5b-it"
CHECKPOINT_PATH = "/home/alex/Borealis/borealis_instruct_ckpts/checkpoint-2898/pytorch_model.bin"
OUTPUT_DIR = "/home/alex/Borealis/hf_upload"


class DictModule(torch.nn.Module):
    """Wrapper so save_model can serialize a bare state_dict.

    Kept as an alternative path for checkpoints with shared tensors;
    main() instead clones tensors and uses save_file directly.
    """

    def __init__(self, state_dict):
        super().__init__()
        for k, v in state_dict.items():
            # Replace dots with a placeholder so keys are valid attribute names.
            self.register_buffer(k.replace(".", "__DOT__"), v)

    def state_dict(self, *args, **kwargs):
        sd = super().state_dict(*args, **kwargs)
        return {k.replace("__DOT__", "."): v for k, v in sd.items()}


def main():
    print(f"Loading checkpoint from {CHECKPOINT_PATH}...")
    state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu", weights_only=False)
    print(f"Loaded {len(state_dict)} keys")

    # Clone every tensor so shared storage does not trip up safetensors,
    # which refuses to serialize aliased tensors.
    print("Handling shared tensors...")
    new_state_dict = {k: v.clone() for k, v in state_dict.items()}

    # Convert to safetensors format.
    print("Converting to safetensors format...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    safetensors_path = os.path.join(OUTPUT_DIR, "model.safetensors")
    save_file(new_state_dict, safetensors_path)
    print(f"Saved to {safetensors_path}")

    # Create the repo (no-op if it already exists).
    print(f"\nCreating/accessing repo: {HF_REPO}")
    api = HfApi()
    try:
        create_repo(HF_REPO, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"Repo note: {e}")

    # Upload the folder.
    print(f"\nUploading to {HF_REPO}...")
    api.upload_folder(
        folder_path=OUTPUT_DIR,
        repo_id=HF_REPO,
        repo_type="model",
    )

    print(f"\nDone! Model available at: https://huggingface.co/{HF_REPO}")


if __name__ == "__main__":
    main()