Spaces:
Sleeping
Sleeping
| import torch | |
| import re | |
| import PyPDF2 | |
| import utils | |
| import streamlit as st | |
| from transformers import BertTokenizerFast, EncoderDecoderModel | |
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face checkpoint used for Spanish summarization.
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and the encoder-decoder model for *checkpoint*.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource avoids re-instantiating the
    model (and moving it to *target_device*) on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model already moved to
        *target_device*.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_summarizer(model_id, device)
def generate_summary(text):
    """Return a generated summary of *text* using the module-level model.

    The text is tokenized as a batch of one, padded/truncated to 512 tokens,
    moved to ``device`` and passed to ``modelo.generate``. The first generated
    sequence is decoded with special tokens skipped and returned.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # The attention mask tells the model which positions are real tokens
    # rather than padding.
    generated = modelo.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def summarize_pdf(pdf_file):
    """Summarize an uploaded PDF page by page and store the result.

    Does nothing when *pdf_file* is ``None``. Otherwise each page's text is
    extracted, cleaned with the ``utils`` helpers and either kept verbatim
    (cleaned text shorter than 50 characters) or replaced by
    ``generate_summary`` output. The document title (empty when missing)
    followed by the per-page results is joined with spaces and written to
    ``st.session_state["summary"]``.

    Args:
        pdf_file: Object accepted by ``PyPDF2.PdfReader`` (here, the value
            returned by ``st.file_uploader``), or ``None``.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)

        # ``metadata`` is None for PDFs without a document-information
        # dictionary, so guard before reading ``title``.
        metadata = reader.metadata
        title = (metadata.title if metadata is not None else None) or ''

        page_results = []
        for page in reader.pages:
            # presumably the utils helpers yield an iterable of strings, since
            # the result is joined with spaces -- verify against utils.
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Too short to be worth summarizing: keep the text as is.
                page_results.append(page_text + '\n')
            else:
                page_results.append(generate_summary(page_text) + ' \n')

        st.session_state["summary"] = ' '.join([title + ' \n'] + page_results)
| ## Graphical interface | |
def output(pdf_file):
    """Store the PDF's metadata title in ``st.session_state["summary"]``.

    Does nothing when *pdf_file* is ``None``. When the PDF has no metadata
    or no title, an empty string is stored instead of ``None``.
    """
    if pdf_file is None:
        return

    # ``metadata`` is None for PDFs without a document-information dictionary.
    metadata = PyPDF2.PdfReader(pdf_file).metadata
    title = metadata.title if metadata is not None else None
    st.session_state["summary"] = title or ''
# NOTE(review): the original indentation of this section was lost, so the
# widget nesting below (info box inside the sidebar, uploader and button in
# the sidebar beneath it, result box in the main area) is a reconstruction --
# confirm against the deployed layout.

# Make sure the summary slot exists before anything reads it.
if 'summary' not in st.session_state:
    st.session_state['summary'] = ''

st.caption('Demo para la generación de resumenes en español')

with st.sidebar:
    # Bordered box with the app title and model credits.
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
        st.caption(
            'Este demo está basado en el modelo: \n'
            ' mrm8488/bert2bert_shared-spanish-finetuned-summarization \n'
            ' creado por Manuel Romero/@mrm8488 con el soporte de Narrativa. \n'
            ' Importante: Recomendado para PDFs cortos.'
        )

    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")

    with st.spinner('Estamos generando tu resumen, espera un poco...'):
        # The summary is produced by the on_click callback, which writes it
        # to st.session_state["summary"].
        corre_button = st.button(
            'Genera resumen',
            help='Presiona para generar resumen',
            on_click=summarize_pdf,
            args=(pdf_file,),
        )

# Fixed-height box showing the most recent summary.
container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])