{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MgzOBm5ggGts"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor\n",
        "import torch\n",
        "import librosa\n",
        "import gradio as gr\n",
        "import numpy as np\n",
        "from scipy.signal import resample"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Hugging Face Hub repository that the model, tokenizer and feature extractor are loaded from.\n",
        "MODEL_ID = \"Vikhrmodels/Borealis\"\n",
        "\n",
        "# trust_remote_code=True lets transformers run the custom model code shipped in that repository.\n",
        "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)\n",
        "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
        "extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)"
      ],
      "metadata": {
        "id": "-jATl7uegLVb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Prefer the GPU, but fall back to the CPU so that moving the model does not\n",
        "# crash on a runtime without CUDA.\n",
        "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "model = model.to(device)\n",
        "\n",
        "# The notebook only runs inference, so put the model in evaluation mode.\n",
        "model.eval()"
      ],
      "metadata": {
        "id": "y78mNR_6gLX1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def transcribe(audio):\n",
        "    \"\"\"Transcribe a Gradio audio value with the loaded Borealis model.\n",
        "\n",
        "    Args:\n",
        "        audio: None, or a (sample_rate, samples) tuple as delivered by\n",
        "            gr.Audio(type='numpy'). samples is a NumPy array, either 1-D\n",
        "            or 2-D; a 2-D array is averaged over axis 1 to obtain mono.\n",
        "\n",
        "    Returns:\n",
        "        The value returned by model.generate() for the clip, or the\n",
        "        Russian 'audio not provided' message when audio is None or empty.\n",
        "    \"\"\"\n",
        "    no_audio_message = \"Аудио не предоставлено.\"\n",
        "    if audio is None:\n",
        "        return no_audio_message\n",
        "\n",
        "    sr, waveform = audio\n",
        "    waveform = np.asarray(waveform)\n",
        "    if waveform.size == 0:\n",
        "        # An empty recording cannot be resampled or turned into features.\n",
        "        return no_audio_message\n",
        "\n",
        "    # Remember the sample dtype before averaging: np.mean promotes integer\n",
        "    # input to float64, which would hide the original PCM width.\n",
        "    source_dtype = waveform.dtype\n",
        "\n",
        "    # Down-mix multi-channel audio to mono.\n",
        "    if waveform.ndim > 1:\n",
        "        waveform = np.mean(waveform, axis=1)\n",
        "\n",
        "    waveform = waveform.astype(np.float32)\n",
        "    if np.issubdtype(source_dtype, np.integer):\n",
        "        # Scale integer PCM by its own full-scale value (32768 for int16,\n",
        "        # 2**31 for int32) instead of assuming 16-bit samples.\n",
        "        waveform /= float(np.iinfo(source_dtype).max) + 1.0\n",
        "    # NOTE(review): floating-point input is not rescaled; this assumes it is\n",
        "    # already normalised to [-1, 1] -- confirm for the Gradio version in use.\n",
        "\n",
        "    target_sr = 16000\n",
        "    if sr != target_sr:\n",
        "        # Keep at least one sample so a very short clip does not resample\n",
        "        # to an empty array.\n",
        "        num_samples = max(1, int(len(waveform) * target_sr / sr))\n",
        "        waveform = resample(waveform, num_samples)\n",
        "    sr = target_sr\n",
        "\n",
        "    proc = extractor(\n",
        "        waveform,\n",
        "        sampling_rate=sr,\n",
        "        padding=\"max_length\",\n",
        "        max_length=480_000,  # 30 seconds of samples at 16 kHz\n",
        "        return_attention_mask=True,\n",
        "        return_tensors=\"pt\",\n",
        "    )\n",
        "\n",
        "    # Send the inputs to whatever device the model lives on rather than\n",
        "    # assuming CUDA.\n",
        "    device = next(model.parameters()).device\n",
        "    mel = proc.input_features.squeeze(0).to(device)\n",
        "    att_mask = proc.attention_mask.squeeze(0).to(device)\n",
        "\n",
        "    # generation_params is a notebook-level dict looked up at call time.\n",
        "    with torch.inference_mode():\n",
        "        transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)\n",
        "\n",
        "    return transcript"
      ],
      "metadata": {
        "id": "q890Jhp3gLaB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Keyword arguments that transcribe() forwards to model.generate().\n",
        "generation_params = dict(\n",
        "    max_new_tokens=350,\n",
        "    do_sample=True,\n",
        "    top_p=0.9,\n",
        "    top_k=50,\n",
        "    temperature=0.2,\n",
        ")"
      ],
      "metadata": {
        "id": "jl4M9fXVjpLC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Demo page: audio input and button side by side, transcript box underneath.\n",
        "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n",
        "    gr.Markdown(\"<h1 style='text-align: center; margin-bottom: 20px;'>Демо Borealis</h1>\")\n",
        "\n",
        "    with gr.Row():\n",
        "        with gr.Column(scale=2):\n",
        "            audio_input = gr.Audio(\n",
        "                sources=[\"microphone\", \"upload\"],\n",
        "                type=\"numpy\",\n",
        "                label=\"Запишите аудио или загрузите файл\",\n",
        "                interactive=True,\n",
        "            )\n",
        "        with gr.Column(scale=1):\n",
        "            btn = gr.Button(\"Распознать\", variant=\"primary\", size=\"lg\")\n",
        "\n",
        "    output = gr.Textbox(\n",
        "        label=\"Расшифровка аудио\",\n",
        "        lines=6,\n",
        "        show_copy_button=True,\n",
        "        interactive=False,\n",
        "    )\n",
        "\n",
        "    # A click passes the audio value to transcribe() and writes its result to the textbox.\n",
        "    btn.click(transcribe, inputs=audio_input, outputs=output)"
      ],
      "metadata": {
        "id": "jJ-aDtBNgLcM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Start the Gradio app; share=True additionally requests a public share link.\n",
        "demo.launch(share=True)"
      ],
      "metadata": {
        "id": "WJehoSe9gLeI",
        "collapsed": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "oXquIX2QgLgI"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}