File size: 6,525 Bytes
8fb0be5
4048265
ee531be
59f6126
 
df7e4e5
93ba2b1
b83f300
024f740
4b331f0
ad9e842
 
 
 
 
 
 
56a81fb
ad9e842
4b331f0
 
e8a4c9c
 
 
 
ee531be
09b358f
 
7dc42bb
8ac5c0e
 
78f79fc
2b3433c
 
 
 
 
78f79fc
 
2b3433c
 
 
69b2504
09b358f
29a10e5
c9323c5
 
791adc1
4b331f0
ee531be
4b331f0
ee531be
4b331f0
09b358f
 
 
 
 
060a1e0
09b358f
4b331f0
 
 
ee531be
4b331f0
 
09b358f
df7e4e5
 
09b358f
ee531be
4048265
09b358f
 
 
 
 
 
 
4380489
09b358f
 
 
4380489
09b358f
8ac5c0e
09b358f
ee531be
1c8f6d7
060a1e0
4048265
 
 
09b358f
623c1fa
09b358f
 
623c1fa
4b331f0
e86a1dc
2b3433c
e86a1dc
a8c8823
c9323c5
 
46f1669
2b3433c
69b2504
c9323c5
69b2504
93ba2b1
b83f300
2b3433c
b83f300
 
78f79fc
 
 
791adc1
4b331f0
2b3433c
69b2504
2b3433c
78f79fc
 
7a9eeaf
096ee7c
78f79fc
 
4b331f0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import time
import streamlit as st
import whisperx
import torch
from utils import convert_segments_object_to_text, check_password, convert_segments_object_to_text_simple, get_gpu_memory_info
from gigiachat_requests import get_access_token, get_completion_from_gigachat, get_number_of_tokens, process_transcribation_with_gigachat
from openai_requests import get_completion_from_openai, process_transcribation_with_assistant

# Streamlit app: transcribe an uploaded audio file with WhisperX (including
# speaker diarization), then optionally post-process and summarize the
# transcript with the selected LLM (GigaChat / OpenAI / Qwen).
# The whole UI is gated behind a password check.
if check_password():
    # Log GPU/CUDA availability so deployment issues (e.g. a CPU-only
    # container) are easy to spot in the server log.
    if torch.cuda.is_available():
        print('GPU доступен')
    else:
        print('GPU не доступен')

    print(f'Версия торча: {torch.__version__}')
    print(f'Версия cuda: {torch.version.cuda}')
    print(f'Версия cudnn: {torch.backends.cudnn.version()}')

    st.title('Audio Transcription App')
    st.sidebar.title("Settings")

    # Runtime configuration from the environment. BATCH_SIZE falls back to 16
    # so a missing variable no longer aborts the app with a TypeError.
    device = os.getenv('DEVICE')
    batch_size = int(os.getenv('BATCH_SIZE') or 16)
    compute_type = os.getenv('COMPUTE_TYPE')

    initial_base_prompt = os.getenv('BASE_PROMPT')
    # NOTE(review): the env var name is misspelled ("PROCCESS") but is kept
    # as-is for compatibility with existing deployments.
    initial_processing_prompt = os.getenv('PROCCESS_PROMPT')

    min_speakers = st.sidebar.number_input("Минимальное количество спикеров", min_value=1, value=2)
    max_speakers = st.sidebar.number_input("Максимальное количество спикеров", min_value=1, value=2)
    llm = st.sidebar.selectbox("Производитель LLM", ["Сбер", "OpenAI", "Qwen"], index=0)

    # Model choices per provider; the selectbox above restricts llm to these keys.
    if llm == "Сбер":
        options = ["GigaChat-Plus", "GigaChat", "GigaChat-Pro"]
    elif llm == "OpenAI":
        options = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo"]
    elif llm == "Qwen":
        options = ["Qwen/Qwen2-7B-Instruct"]
    else:
        options = []

    llm_model = st.sidebar.selectbox("Модель", options, index=0)
    base_prompt = st.sidebar.text_area("Промпт для резюмирования", value=initial_base_prompt)

    enable_processing = st.sidebar.checkbox("Добавить обработку транскрибации", value=False)
    processing_prompt = st.sidebar.text_area("Промпт для обработки транскрибации", value=initial_processing_prompt)

    # Hugging Face token for the pyannote diarization pipeline (previously
    # assigned but unused; the pipeline duplicated the secrets lookup).
    hf_token = st.secrets["HF_TOKEN"]

    uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])

    if uploaded_file is not None:
        file_name = uploaded_file.name

        # Invalidate the cached transcript whenever a different file is uploaded.
        if 'file_name' not in st.session_state or st.session_state.file_name != file_name:
            st.session_state.transcript = ''
            st.session_state.file_name = file_name

        st.audio(uploaded_file)
        file_extension = uploaded_file.name.split(".")[-1]  # keep the original extension
        temp_file_path = f"temp_file.{file_extension}"  # WhisperX needs a real path on disk

        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        get_gpu_memory_info()

        try:
            # Run the expensive pipeline only when no transcript is cached
            # for this file in the session.
            if 'transcript' not in st.session_state or st.session_state.transcript == '':
                start_time = time.time()
                with st.spinner('Транскрибируем...'):
                    # 1) Speech-to-text with WhisperX.
                    model = whisperx.load_model(os.getenv('WHISPER_MODEL_SIZE'), device, compute_type=compute_type)
                    audio = whisperx.load_audio(temp_file_path)
                    result = model.transcribe(audio, batch_size=batch_size, language="ru")
                    print('Transcribed, now aligning')

                    # 2) Word-level alignment (required for speaker assignment).
                    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
                    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
                    print('Aligned, now diarizing')

                    # 3) Speaker diarization and per-word speaker labels.
                    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
                    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
                    result_diar = whisperx.assign_word_speakers(diarize_segments, result)

                    # Release the heavy models before the LLM stage so GPU
                    # memory is available (the Qwen path loads a local model).
                    del model, model_a, diarize_model
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                transcript = convert_segments_object_to_text_simple(result_diar)
                st.session_state.transcript = transcript
                end_time = time.time()
                total_time = end_time - start_time
                print(f'Полный процесс транскрипции занял {total_time:.2f} секунд')
            else:
                transcript = st.session_state.transcript
        finally:
            # The temp copy is only needed for transcription — do not leak it
            # on disk between runs (previously it was never removed).
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        st.write("Результат транскрибации:")
        st.text(transcript)

        # GigaChat requires a short-lived OAuth access token.
        if llm == 'Сбер':
            access_token = get_access_token()

        if enable_processing:
            with st.spinner('Обрабатываем транскрибацию...'):
                if llm == 'Сбер':
                    number_of_tokens = get_number_of_tokens(transcript, access_token, llm_model)
                    print('Количество токенов в транскрибации: ' + str(number_of_tokens))
                    # +1000 reserves headroom for the model's answer.
                    transcript = process_transcribation_with_gigachat(processing_prompt, transcript, number_of_tokens + 1000, access_token, llm_model)
                    print(transcript)
                elif llm == 'OpenAI':
                    transcript = process_transcribation_with_assistant(processing_prompt, transcript)
                    print(transcript)
                else:
                    st.write("На данный момент обработка транскрибации не поддерживается этой моделью.")

        with st.spinner('Резюмируем...'):
            if llm == 'Сбер':
                summary_answer = get_completion_from_gigachat(base_prompt + transcript, 1024, access_token, llm_model)
            elif llm == 'OpenAI':
                summary_answer = get_completion_from_openai(base_prompt + transcript, llm_model, 1024)
            elif llm == 'Qwen':
                # Free cached GPU memory first; import lazily so the local
                # Qwen model is only loaded when actually selected.
                torch.cuda.empty_cache()
                from qwen import respond
                summary_answer = respond(base_prompt + transcript)

            st.write("Результат резюмирования:")
            st.text(summary_answer)