import os
import tempfile
import time

import streamlit as st
import torch
import whisperx

from utils import convert_segments_object_to_text, check_password, convert_segments_object_to_text_simple, get_gpu_memory_info
from gigiachat_requests import get_access_token, get_completion_from_gigachat, get_number_of_tokens, process_transcribation_with_gigachat
from openai_requests import get_completion_from_openai, process_transcribation_with_assistant
| |
|
# Streamlit page: upload an audio file, transcribe + diarize it with WhisperX,
# then optionally post-process and summarize the transcript with the selected
# LLM provider (GigaChat / OpenAI / Qwen). Everything is gated behind a
# password check.
if check_password():
    # --- Hardware / environment report (stdout only, for server logs) ------
    if torch.cuda.is_available():
        print('GPU доступен')
    else:
        print('GPU не доступен')

    print(f'Версия торча: {torch.__version__}')
    print(f'Версия cuda: {torch.version.cuda}')
    print(f'Версия cudnn: {torch.backends.cudnn.version()}')

    st.title('Audio Transcription App')
    st.sidebar.title("Settings")

    # WhisperX runtime configuration comes from environment variables.
    device = os.getenv('DEVICE')
    # Default to 16 instead of crashing: int(None) raises TypeError when
    # BATCH_SIZE is not set in the environment.
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    compute_type = os.getenv('COMPUTE_TYPE')

    initial_base_prompt = os.getenv('BASE_PROMPT')
    # NOTE(review): env var name kept as-is ('PROCCESS_PROMPT', sic) for
    # backward compatibility with existing deployments.
    initial_processing_prompt = os.getenv('PROCCESS_PROMPT')

    # --- Sidebar controls ---------------------------------------------------
    min_speakers = st.sidebar.number_input("Минимальное количество спикеров", min_value=1, value=2)
    max_speakers = st.sidebar.number_input("Максимальное количество спикеров", min_value=1, value=2)
    llm = st.sidebar.selectbox("Производитель LLM", ["Сбер", "OpenAI", "Qwen"], index=0)

    # Model choices depend on the selected provider.
    if llm == "Сбер":
        options = ["GigaChat-Plus", "GigaChat", "GigaChat-Pro"]
    elif llm == "OpenAI":
        options = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo"]
    elif llm == "Qwen":
        options = ["Qwen/Qwen2-7B-Instruct"]
    else:
        options = []

    llm_model = st.sidebar.selectbox("Модель", options, index=0)
    base_prompt = st.sidebar.text_area("Промпт для резюмирования", value=initial_base_prompt)

    enable_processing = st.sidebar.checkbox("Добавить обработку транскрибации", value=False)
    processing_prompt = st.sidebar.text_area("Промпт для обработки транскрибации", value=initial_processing_prompt)

    # Hugging Face token required by the pyannote-based diarization pipeline.
    ACCESS_TOKEN = st.secrets["HF_TOKEN"]

    uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])

    if uploaded_file is not None:
        file_name = uploaded_file.name

        # Reset the cached transcript when a different file is uploaded so we
        # don't show a stale result for the new audio.
        if 'file_name' not in st.session_state or st.session_state.file_name != file_name:
            st.session_state.transcript = ''
            st.session_state.file_name = file_name

        st.audio(uploaded_file)
        file_extension = uploaded_file.name.split(".")[-1]

        # Write the upload to a unique temporary file (keeping the original
        # extension so WhisperX/ffmpeg can sniff the container). A fixed name
        # like "temp_file.ext" would let concurrent sessions overwrite each
        # other's upload.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp:
            tmp.write(uploaded_file.getbuffer())
            temp_file_path = tmp.name

        get_gpu_memory_info()

        try:
            if 'transcript' not in st.session_state or st.session_state.transcript == '':
                # Full pipeline: transcribe -> align -> diarize -> render text.
                start_time = time.time()
                with st.spinner('Транскрибируем...'):
                    model = whisperx.load_model(os.getenv('WHISPER_MODEL_SIZE'), device, compute_type=compute_type)

                    audio = whisperx.load_audio(temp_file_path)
                    result = model.transcribe(audio, batch_size=batch_size, language="ru")
                    print('Transcribed, now aligning')

                    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
                    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
                    print('Aligned, now diarizing')

                    diarize_model = whisperx.DiarizationPipeline(use_auth_token=ACCESS_TOKEN, device=device)
                    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
                    result_diar = whisperx.assign_word_speakers(diarize_segments, result)

                    transcript = convert_segments_object_to_text_simple(result_diar)
                    st.session_state.transcript = transcript
                total_time = time.time() - start_time
                print(f'Полный процесс транскрипции занял {total_time:.2f} секунд')
            else:
                # Same file as before: reuse the cached transcript.
                transcript = st.session_state.transcript
        finally:
            # Remove the temp copy even if transcription fails, so repeated
            # uploads don't leak disk space.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        st.write("Результат транскрибации:")
        st.text(transcript)

        # GigaChat needs a short-lived OAuth token for all subsequent calls.
        if llm == 'Сбер':
            access_token = get_access_token()

        if enable_processing:
            with st.spinner('Обрабатываем транскрибацию...'):
                if llm == 'Сбер':
                    number_of_tokens = get_number_of_tokens(transcript, access_token, llm_model)
                    print('Количество токенов в транскрибации: ' + str(number_of_tokens))
                    # +1000 leaves headroom for the model's rewritten output.
                    transcript = process_transcribation_with_gigachat(processing_prompt, transcript, number_of_tokens + 1000, access_token, llm_model)
                    print(transcript)
                elif llm == 'OpenAI':
                    transcript = process_transcribation_with_assistant(processing_prompt, transcript)
                    print(transcript)
                else:
                    st.write("На данный момент обработка транскрибации не поддерживается этой моделью.")

        with st.spinner('Резюмируем...'):
            # Defensive default so st.text below never hits an unbound name
            # if the provider list ever changes.
            summary_answer = None
            if llm == 'Сбер':
                summary_answer = get_completion_from_gigachat(base_prompt + transcript, 1024, access_token, llm_model)
            elif llm == 'OpenAI':
                summary_answer = get_completion_from_openai(base_prompt + transcript, llm_model, 1024)
            elif llm == 'Qwen':
                # Free GPU memory held by the transcription models before the
                # local Qwen model is loaded.
                torch.cuda.empty_cache()
                from qwen import respond
                summary_answer = respond(base_prompt + transcript)

        st.write("Результат резюмирования:")
        st.text(summary_answer)