import os
import tempfile
import time

import streamlit as st
import torch
import whisperx

from utils import convert_segments_object_to_text, check_password, convert_segments_object_to_text_simple, get_gpu_memory_info
from gigiachat_requests import get_access_token, get_completion_from_gigachat, get_number_of_tokens, process_transcribation_with_gigachat
from openai_requests import get_completion_from_openai, process_transcribation_with_assistant
| |
|
# Streamlit page: upload an audio file, transcribe + diarize it with WhisperX,
# then optionally post-process and summarize the transcript with the selected
# LLM provider (GigaChat / OpenAI / Qwen). Everything is gated behind a
# password check.
if check_password():
    # --- Hardware / environment report (stdout only, for server logs) ------
    if torch.cuda.is_available():
        print('GPU доступен')
    else:
        print('GPU не доступен')

    print(f'Версия торча: {torch.__version__}')
    print(f'Версия cuda: {torch.version.cuda}')
    print(f'Версия cudnn: {torch.backends.cudnn.version()}')

    st.title('Audio Transcription App')
    st.sidebar.title("Settings")

    # WhisperX runtime configuration comes from environment variables.
    device = os.getenv('DEVICE')
    # Default to 16 instead of crashing: int(None) raises TypeError when
    # BATCH_SIZE is not set in the environment.
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    compute_type = os.getenv('COMPUTE_TYPE')

    initial_base_prompt = os.getenv('BASE_PROMPT')
    # NOTE(review): env var name kept as-is ('PROCCESS_PROMPT', sic) for
    # backward compatibility with existing deployments.
    initial_processing_prompt = os.getenv('PROCCESS_PROMPT')

    # --- Sidebar controls ---------------------------------------------------
    min_speakers = st.sidebar.number_input("Минимальное количество спикеров", min_value=1, value=2)
    max_speakers = st.sidebar.number_input("Максимальное количество спикеров", min_value=1, value=2)
    llm = st.sidebar.selectbox("Производитель LLM", ["Сбер", "OpenAI", "Qwen"], index=0)

    # Model choices depend on the selected provider.
    if llm == "Сбер":
        options = ["GigaChat-Plus", "GigaChat", "GigaChat-Pro"]
    elif llm == "OpenAI":
        options = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo"]
    elif llm == "Qwen":
        options = ["Qwen/Qwen2-7B-Instruct"]
    else:
        options = []

    llm_model = st.sidebar.selectbox("Модель", options, index=0)
    base_prompt = st.sidebar.text_area("Промпт для резюмирования", value=initial_base_prompt)

    enable_processing = st.sidebar.checkbox("Добавить обработку транскрибации", value=False)
    processing_prompt = st.sidebar.text_area("Промпт для обработки транскрибации", value=initial_processing_prompt)

    # Hugging Face token required by the pyannote-based diarization pipeline.
    ACCESS_TOKEN = st.secrets["HF_TOKEN"]

    uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])

    if uploaded_file is not None:
        file_name = uploaded_file.name

        # Reset the cached transcript when a different file is uploaded so we
        # don't show a stale result for the new audio.
        if 'file_name' not in st.session_state or st.session_state.file_name != file_name:
            st.session_state.transcript = ''
            st.session_state.file_name = file_name

        st.audio(uploaded_file)
        file_extension = uploaded_file.name.split(".")[-1]

        # Write the upload to a unique temporary file (keeping the original
        # extension so WhisperX/ffmpeg can sniff the container). A fixed name
        # like "temp_file.ext" would let concurrent sessions overwrite each
        # other's upload.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp:
            tmp.write(uploaded_file.getbuffer())
            temp_file_path = tmp.name

        get_gpu_memory_info()

        try:
            if 'transcript' not in st.session_state or st.session_state.transcript == '':
                # Full pipeline: transcribe -> align -> diarize -> render text.
                start_time = time.time()
                with st.spinner('Транскрибируем...'):
                    model = whisperx.load_model(os.getenv('WHISPER_MODEL_SIZE'), device, compute_type=compute_type)

                    audio = whisperx.load_audio(temp_file_path)
                    result = model.transcribe(audio, batch_size=batch_size, language="ru")
                    print('Transcribed, now aligning')

                    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
                    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
                    print('Aligned, now diarizing')

                    diarize_model = whisperx.DiarizationPipeline(use_auth_token=ACCESS_TOKEN, device=device)
                    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
                    result_diar = whisperx.assign_word_speakers(diarize_segments, result)

                    transcript = convert_segments_object_to_text_simple(result_diar)
                    st.session_state.transcript = transcript
                total_time = time.time() - start_time
                print(f'Полный процесс транскрипции занял {total_time:.2f} секунд')
            else:
                # Same file as before: reuse the cached transcript.
                transcript = st.session_state.transcript
        finally:
            # Remove the temp copy even if transcription fails, so repeated
            # uploads don't leak disk space.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        st.write("Результат транскрибации:")
        st.text(transcript)

        # GigaChat needs a short-lived OAuth token for all subsequent calls.
        if llm == 'Сбер':
            access_token = get_access_token()

        if enable_processing:
            with st.spinner('Обрабатываем транскрибацию...'):
                if llm == 'Сбер':
                    number_of_tokens = get_number_of_tokens(transcript, access_token, llm_model)
                    print('Количество токенов в транскрибации: ' + str(number_of_tokens))
                    # +1000 leaves headroom for the model's rewritten output.
                    transcript = process_transcribation_with_gigachat(processing_prompt, transcript, number_of_tokens + 1000, access_token, llm_model)
                    print(transcript)
                elif llm == 'OpenAI':
                    transcript = process_transcribation_with_assistant(processing_prompt, transcript)
                    print(transcript)
                else:
                    st.write("На данный момент обработка транскрибации не поддерживается этой моделью.")

        with st.spinner('Резюмируем...'):
            # Defensive default so st.text below never hits an unbound name
            # if the provider list ever changes.
            summary_answer = None
            if llm == 'Сбер':
                summary_answer = get_completion_from_gigachat(base_prompt + transcript, 1024, access_token, llm_model)
            elif llm == 'OpenAI':
                summary_answer = get_completion_from_openai(base_prompt + transcript, llm_model, 1024)
            elif llm == 'Qwen':
                # Free GPU memory held by the transcription models before the
                # local Qwen model is loaded.
                torch.cuda.empty_cache()
                from qwen import respond
                summary_answer = respond(base_prompt + transcript)

        st.write("Результат резюмирования:")
        st.text(summary_answer)