{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Borealis speech recognition demo\n",
        "\n",
        "Loads the `Vikhrmodels/Borealis` model and serves a small Gradio UI that transcribes audio recorded from the microphone or uploaded as a file.\n",
        "\n",
        "Run the cells top to bottom (Restart runtime, then Run all). A GPU is used when one is available."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MgzOBm5ggGts"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor\n",
        "import torch\n",
        "import librosa\n",
        "import gradio as gr\n",
        "import numpy as np\n",
        "from scipy.signal import resample"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load the model"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "MODEL_ID = \"Vikhrmodels/Borealis\"\n",
        "\n",
        "# NOTE: trust_remote_code=True executes Python code shipped in the model repository.\n",
        "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)\n",
        "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
        "extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)"
      ],
      "metadata": {
        "id": "-jATl7uegLVb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fall back to CPU so the notebook still runs without a GPU runtime.\n",
        "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "\n",
        "model.eval()\n",
        "model = model.to(DEVICE)"
      ],
      "metadata": {
        "id": "y78mNR_6gLX1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Generation settings"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Keyword arguments forwarded to model.generate() by transcribe().\n",
        "# Sampling is enabled, so repeated runs on the same audio may give different text.\n",
        "generation_params = {\n",
        "    \"max_new_tokens\": 350,\n",
        "    \"do_sample\": True,\n",
        "    \"top_p\": 0.9,\n",
        "    \"top_k\": 50,\n",
        "    \"temperature\": 0.2,\n",
        "}"
      ],
      "metadata": {
        "id": "jl4M9fXVjpLC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Transcription"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "TARGET_SR = 16000            # sampling rate passed to the feature extractor\n",
        "MAX_INPUT_SAMPLES = 480_000  # 30 s at 16 kHz\n",
        "NO_AUDIO_MESSAGE = \"Аудио не предоставлено.\"\n",
        "\n",
        "\n",
        "def _to_mono_float32(waveform):\n",
        "    \"\"\"Return `waveform` as a 1-D float32 array scaled to roughly [-1, 1].\n",
        "\n",
        "    Integer PCM is scaled by the half-range of its own dtype (32768 for\n",
        "    int16), so int32 or uint8 input is not mis-scaled. Floating-point input\n",
        "    is assumed to be normalised already -- TODO confirm for your audio source.\n",
        "    \"\"\"\n",
        "    samples = np.asarray(waveform)\n",
        "    if np.issubdtype(samples.dtype, np.integer):\n",
        "        info = np.iinfo(samples.dtype)\n",
        "        half_range = (float(info.max) - float(info.min) + 1.0) / 2.0\n",
        "        # Unsigned PCM is centred on the middle of its range, not on zero.\n",
        "        offset = half_range if info.min == 0 else 0.0\n",
        "        samples = (samples.astype(np.float64) - offset) / half_range\n",
        "    samples = samples.astype(np.float32)\n",
        "    if samples.ndim > 1:\n",
        "        # Axis 1 is taken to be the channel axis; average it down to mono.\n",
        "        samples = samples.mean(axis=1)\n",
        "    return samples\n",
        "\n",
        "\n",
        "def transcribe(audio):\n",
        "    \"\"\"Transcribe one audio input coming from the Gradio UI.\n",
        "\n",
        "    Args:\n",
        "        audio: `(sample_rate, samples)` tuple, or None when nothing was\n",
        "            recorded or uploaded.\n",
        "\n",
        "    Returns:\n",
        "        Whatever `model.generate` returns for the audio, or a short message\n",
        "        when there is no audio to process.\n",
        "    \"\"\"\n",
        "    if audio is None:\n",
        "        return NO_AUDIO_MESSAGE\n",
        "\n",
        "    sr, waveform = audio\n",
        "    waveform = _to_mono_float32(waveform)\n",
        "    if waveform.size == 0:\n",
        "        return NO_AUDIO_MESSAGE\n",
        "\n",
        "    if sr != TARGET_SR:\n",
        "        num_samples = int(len(waveform) * TARGET_SR / sr)\n",
        "        if num_samples == 0:\n",
        "            # Clip is too short to yield even one sample at the target rate.\n",
        "            return NO_AUDIO_MESSAGE\n",
        "        waveform = resample(waveform, num_samples)\n",
        "        sr = TARGET_SR\n",
        "\n",
        "    proc = extractor(\n",
        "        waveform,\n",
        "        sampling_rate=sr,\n",
        "        padding=\"max_length\",\n",
        "        max_length=MAX_INPUT_SAMPLES,\n",
        "        return_attention_mask=True,\n",
        "        return_tensors=\"pt\",\n",
        "    )\n",
        "\n",
        "    mel = proc.input_features.squeeze(0).to(DEVICE)\n",
        "    att_mask = proc.attention_mask.squeeze(0).to(DEVICE)\n",
        "\n",
        "    with torch.inference_mode():\n",
        "        transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)\n",
        "\n",
        "    return transcript"
      ],
      "metadata": {
        "id": "q890Jhp3gLaB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Gradio interface"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n",
        "    gr.Markdown(\"\\n\\nДемо Borealis\\n\\n\")\n",
        "    with gr.Row():\n",
        "        with gr.Column(scale=2):\n",
        "            audio_input = gr.Audio(sources=[\"microphone\", \"upload\"], type=\"numpy\", label=\"Запишите аудио или загрузите файл\", interactive=True)\n",
        "        with gr.Column(scale=1):\n",
        "            btn = gr.Button(\"Распознать\", variant=\"primary\", size=\"lg\")\n",
        "    output = gr.Textbox(label=\"Расшифровка аудио\", lines=6, show_copy_button=True, interactive=False)\n",
        "    btn.click(transcribe, inputs=audio_input, outputs=output)"
      ],
      "metadata": {
        "id": "jJ-aDtBNgLcM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "demo.launch(share=True)"
      ],
      "metadata": {
        "id": "WJehoSe9gLeI",
        "collapsed": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "oXquIX2QgLgI"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}