# "Turing Videos" — Streamlit app (HuggingFace Space).
# (Scraped page header "Spaces / Sleeping" removed; app source follows.)
| import yt_dlp | |
| import os | |
| import streamlit as st | |
| import transformers | |
| from transformers import pipeline | |
| from transformers import AutoTokenizer | |
| import nltk | |
| from PIL import Image | |
| import torch | |
# Global Streamlit page setup — must run before any other st.* UI call.
# NOTE(review): the icon file name contains non-ASCII characters; it must
# exist in the working directory exactly as spelled.
icon = Image.open("Traçado laranja #f1863d.png")
st.set_page_config(
    page_title="Turing Videos",
    page_icon=icon,
    layout="wide",
    initial_sidebar_state="auto",
)
| #@st.cache_resource | |
def download_audio(link):
    """Download the best audio track of a YouTube video.

    Parameters
    ----------
    link : str
        URL of the YouTube video.

    Returns
    -------
    str
        The YouTube video id. The audio is written to "<id>.mp3" in the
        current working directory (the template only names the file .mp3;
        yt-dlp does not transcode here).
    """
    ydl_opts = {
        'extract_audio': True,
        'format': 'bestaudio',
        'outtmpl': '%(id)s.mp3',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # extract_info(..., download=True) already fetches the file.
        # The original follow-up call `video.download(link)` was redundant
        # (and passed a single string where yt-dlp expects a URL list),
        # so it is removed.
        info_dict = ydl.extract_info(link, download=True)
        video_id = info_dict['id']  # renamed from `id` (shadowed a builtin)
    return video_id
| #Load Whisper pipeline via HuggingFace | |
def load_whisper():
    """Return a HuggingFace ASR pipeline (Whisper tiny, 30 s audio chunks)."""
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        chunk_length_s=30,
    )
    return asr
| #Load Extractive Summarizer pipeline via HuggingFace | |
def load_extractive():
    """Return the extractive-summarization pipeline (Longformer ext-summ)."""
    checkpoint = "NotXia/longformer-bio-ext-summ"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    summarizer = pipeline(
        "summarization",
        model=checkpoint,
        tokenizer=tokenizer,
        trust_remote_code=True,  # model ships custom pipeline code
    )
    return summarizer
| #Load QA pipeline via HuggingFace | |
def load_qa():
    """Return the question-answering pipeline (BERTserini SQuAD model)."""
    qa = pipeline(
        "question-answering",
        model="rsvp-ai/bertserini-bert-base-squad",
    )
    return qa
| #Download punkt function from nltk | |
def load_nltk():
    """Fetch the NLTK 'punkt' data needed by nltk.sent_tokenize."""
    resource = "punkt"
    nltk.download(resource)
| #Make the ASR task | |
def audio_speech_recognition(_model_pipeline, video_id):
    """Transcribe "<video_id>.mp3" with the given ASR pipeline.

    Returns the transcription text with surrounding whitespace stripped.
    """
    audio_path = video_id + ".mp3"
    result = _model_pipeline(audio_path, batch_size=64)
    return result["text"].strip()
| #Make the Summarization task | |
def text_summarization(_model_pipeline, full_text, ratio):
    """Extractively summarize full_text.

    Splits full_text into sentences with nltk, asks the extractive
    pipeline to keep roughly `ratio` of them, and joins the selected
    sentences back into one stripped string.
    """
    sentence_list = nltk.sent_tokenize(full_text)
    pipeline_output = _model_pipeline(
        {"sentences": sentence_list},
        strategy="ratio",
        strategy_args=ratio,
    )
    # First element of the output holds the selected sentences.
    chosen = pipeline_output[0]
    return " ".join(chosen).strip()
| #Make the QA task | |
def answer_questions(_model_pipeline, full_text, questionings):
    """Answer each question against full_text with a QA pipeline.

    Parameters
    ----------
    _model_pipeline : callable
        HuggingFace question-answering pipeline, called as
        ``pipeline(question=..., context=...)``; must return a mapping
        with an "answer" key.
    full_text : str
        Context the answers are extracted from.
    questionings : iterable of str
        Questions to ask.

    Returns
    -------
    list of str
        One answer per question, in input order (empty list for no
        questions).
    """
    # Comprehension replaces the original append loop (same behavior).
    return [
        _model_pipeline(question=question, context=full_text)["answer"]
        for question in questionings
    ]
def main():
    """Streamlit entry point.

    Collects a YouTube link, compression rate and questions from a
    sidebar form; on submit (English only) downloads the audio,
    transcribes it, shows an extractive summary and answers the
    questions.
    """
    header = st.container()
    model = st.container()
    model_1, model_2 = st.columns(2)

    with st.sidebar:
        st.title(":red[Turing]Videos")

        with st.form("data_collection"):
            language = st.selectbox('Qual a linguagem do seu modelo?',
                                    ('Inglês (en)', 'Português (pt)', 'Outra')
                                    )
            link = st.text_area(label="Coloque o link do seu vídeo do YouTube:",
                                height=25, placeholder="Digite seu link...")
            compression_rate = st.slider(label="Selecione a taxa de compressão:",
                                         min_value=0.05, max_value=0.35,
                                         value=0.25, step=0.05
                                         )
            questions = st.text_area(label="Coloque suas perguntas separadas por vírgula!",
                                     height=50, placeholder="Digite suas perguntas..."
                                     ).split(",")
            submitted = st.form_submit_button("Submit")
            if submitted:
                st.success('Dados coletados!', icon="✅")
            else:
                st.error('Dados ainda não coletados!', icon="🚨")

    with header:
        st.title(":red[Turing]Videos")
        st.subheader("Este projeto utiliza técnicas de inteligência artificial para simplificar e acelerar a compreensão de conteúdo audiovisual.",
                     divider="red"
                     )

    with model:
        if not submitted:
            return

        # BUG FIX: the original fell through to transcription for every
        # language, raising NameError on `whisper`/`id` when the English
        # branch (the only one that loads models) had not run. Non-English
        # selections now show their message and stop.
        if language == "Português (pt)":
            st.header("Modelo ainda não implementado.")
            return
        if language != "Inglês (en)":
            st.header("Erro na seleção de linguagem.")
            return

        with st.spinner("Carregando modelos..."):
            video_id = download_audio(link)  # renamed from `id` (builtin shadow)
            load_nltk()
            whisper = load_whisper()
            extractive = load_extractive()
            qa_model = load_qa()

        with st.spinner("Transcrevendo texto..."):
            transcript_text = audio_speech_recognition(whisper, video_id)

        with model_1:
            st.header("Texto Sumarizado:")
            with st.spinner("Carregando sumarização..."):
                summary = text_summarization(extractive, transcript_text, compression_rate)
                st.subheader(summary)

        with model_2:
            st.header("Resposta das perguntas:")
            with st.spinner("Carregando respostas..."):
                answers = answer_questions(qa_model, transcript_text, questions)
                # zip pairs each question with its answer (same order).
                for question, answer in zip(questions, answers):
                    st.subheader(question)
                    st.subheader(answer)
                    st.write("\n\n")


main()