In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
import torch
import librosa
import gradio as gr
import numpy as np
from scipy.signal import resample

In [None]:
# All three artefacts are fetched from the same Hugging Face Hub repository,
# so the repo id is spelled out once.
BOREALIS_REPO_ID = "Vikhrmodels/Borealis"

# Only the model is loaded with trust_remote_code=True; the tokenizer and the
# feature extractor are loaded with the library defaults.
model = AutoModelForCausalLM.from_pretrained(
    BOREALIS_REPO_ID,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BOREALIS_REPO_ID)
extractor = AutoFeatureExtractor.from_pretrained(BOREALIS_REPO_ID)

In [None]:
# Switch to evaluation mode and move the weights to the GPU in one chained
# expression (relies on eval() returning the module itself).
model = model.eval().to("cuda")

In [None]:
def _prepare_waveform(sr, waveform, target_sr=16000):
    """Convert raw audio samples to mono float32 at ``target_sr`` Hz.

    Args:
        sr: Sampling rate of ``waveform`` in Hz.
        waveform: Non-empty array of samples, 1-D or 2-D. A 2-D array is
            averaged over axis 1 (assumes a ``(samples, channels)`` layout).
        target_sr: Sampling rate of the returned signal in Hz.

    Returns:
        1-D array of samples at ``target_sr`` Hz. Integer input is scaled by
        the full-scale value of its dtype; float input is left unscaled
        (assumed to be in ``[-1, 1]`` already — TODO confirm for your source).
    """
    samples = np.asarray(waveform)

    # Choose the scale from the *raw* dtype: np.mean below promotes integer
    # input to float64, after which the original bit depth can't be recovered.
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0  # 32768.0 for int16
    else:
        full_scale = 1.0

    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    samples = samples.astype(np.float32) / full_scale

    if sr != target_sr:
        # max(1, ...) keeps very short clips from rounding down to zero samples.
        num_samples = max(1, int(len(samples) * target_sr / sr))
        samples = resample(samples, num_samples)

    return samples


def transcribe(audio):
    """Transcribe one recording passed in by the Gradio audio input.

    Reads the module-level ``extractor``, ``model`` and ``generation_params``
    at call time, so all three must be defined before the first call.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple, as delivered by
            the ``gr.Audio(type="numpy")`` component wired to this function.

    Returns:
        Whatever ``model.generate`` returns for the recording, or the
        "audio not provided" message when ``audio`` is ``None`` or contains
        no samples.
    """
    if audio is None:
        return "Аудио не предоставлено."

    sr, waveform = audio

    # An empty recording has nothing to resample or transcribe.
    if np.size(waveform) == 0:
        return "Аудио не предоставлено."

    target_sr = 16000
    waveform = _prepare_waveform(sr, waveform, target_sr)

    proc = extractor(
        waveform,
        sampling_rate=target_sr,
        padding="max_length",
        max_length=480_000,  # 480_000 samples / 16_000 Hz = 30 s
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Follow the model's device instead of hard-coding "cuda" a second time.
    device = next(model.parameters()).device
    mel = proc.input_features.squeeze(0).to(device)
    att_mask = proc.attention_mask.squeeze(0).to(device)

    with torch.inference_mode():
        transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)

    return transcript

In [None]:
# Keyword arguments that transcribe() unpacks into model.generate().
generation_params = dict(
    max_new_tokens=350,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    temperature=0.2,
)

In [None]:
# UI layout: a title, one row holding the audio input (wide column) and the
# submit button (narrow column), and a read-only text box for the result.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "<h1 style='text-align: center; margin-bottom: 20px;'>Демо Borealis</h1>"
    )

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Запишите аудио или загрузите файл",
                interactive=True,
            )
        with gr.Column(scale=1):
            btn = gr.Button(
                "Распознать",
                variant="primary",
                size="lg",
            )

    output = gr.Textbox(
        label="Расшифровка аудио",
        lines=6,
        show_copy_button=True,
        interactive=False,
    )

    # Clicking the button sends the recorded/uploaded audio to transcribe()
    # and writes its return value into the text box.
    btn.click(transcribe, inputs=audio_input, outputs=output)

In [None]:
# Start the Gradio app defined in the `demo` Blocks context.
# NOTE(review): share=True presumably requests a publicly reachable share
# link — confirm against the Gradio launch() docs, and drop it if the demo
# should stay local-only.
demo.launch(share=True)