Spaces:
Runtime error
Runtime error
UI improvements
Browse files
- AutoSumm.png +0 -0
- app.py +36 -7
- extractor/_utils.py +22 -5
AutoSumm.png
ADDED
|
app.py
CHANGED
|
@@ -29,23 +29,43 @@ def main():
|
|
| 29 |
search_model, summ_model, tokenizer = init()
|
| 30 |
Timer.reset()
|
| 31 |
|
| 32 |
-
st.
|
|
|
|
| 33 |
st.subheader("Lucas Antunes & Matheus Vieira")
|
| 34 |
|
| 35 |
portuguese = st.checkbox('Traduzir para o português.')
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
if portuguese:
|
| 38 |
environ['PORTUGUESE'] = 'true' # work around (gambiarra)
|
| 39 |
-
st.
|
| 40 |
-
query_pt = st.text_input('Digite o tópico') #text is stored in this variable
|
| 41 |
button = st.button('Gerar resumo')
|
| 42 |
else:
|
| 43 |
environ['PORTUGUESE'] = 'false' # work around (gambiarra)
|
| 44 |
-
st.
|
| 45 |
-
query = st.text_input('Type your topic') #text is stored in this variable
|
| 46 |
button = st.button('Generate summary')
|
| 47 |
|
| 48 |
-
result = st.
|
| 49 |
|
| 50 |
if 'few_documents' not in st.session_state:
|
| 51 |
st.session_state['few_documents'] = False
|
|
@@ -68,22 +88,31 @@ def main():
|
|
| 68 |
|
| 69 |
if portuguese:
|
| 70 |
result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
|
|
|
|
|
|
|
| 71 |
else:
|
| 72 |
result.markdown(f'Your summary for "{query}":\n\n> {summary}')
|
|
|
|
|
|
|
| 73 |
|
| 74 |
Timer.show_total()
|
| 75 |
|
| 76 |
|
| 77 |
if few_documents:
|
| 78 |
st.warning(st.session_state['msg'])
|
| 79 |
-
|
|
|
|
| 80 |
text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
|
| 81 |
summary = summarize(text, summ_model, tokenizer)
|
| 82 |
|
| 83 |
if portuguese:
|
| 84 |
result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
|
|
|
|
|
|
|
| 85 |
else:
|
| 86 |
result.markdown(f'Your summary for "{query}":\n\n> {summary}')
|
|
|
|
|
|
|
| 87 |
|
| 88 |
st.session_state['few_documents'] = False
|
| 89 |
few_documents = False
|
|
|
|
| 29 |
search_model, summ_model, tokenizer = init()
|
| 30 |
Timer.reset()
|
| 31 |
|
| 32 |
+
_, col2, _ = st.columns([1,1,1])
|
| 33 |
+
col2.image('AutoSumm.png', width=250)
|
| 34 |
st.subheader("Lucas Antunes & Matheus Vieira")
|
| 35 |
|
| 36 |
portuguese = st.checkbox('Traduzir para o português.')
|
| 37 |
|
| 38 |
+
st.sidebar.markdown("""
|
| 39 |
+
# Processing steps
|
| 40 |
+
#### Translation
|
| 41 |
+
Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.
|
| 42 |
+
|
| 43 |
+
#### Corpus generation
|
| 44 |
+
Step where the system generates the complete corpus: query-related web pages and documents (PDFs and text files) on query-related knowledge area. The Corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.
|
| 45 |
+
|
| 46 |
+
#### Exhaustive search
|
| 47 |
+
Step where the system filters the texts of the corpus that contain keywords from the query.
|
| 48 |
+
|
| 49 |
+
#### Semantic search over documents
|
| 50 |
+
Step in which the system selects documents related to the query through semantic search.
|
| 51 |
+
|
| 52 |
+
#### Semantic search over paragraphs
|
| 53 |
+
Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.
|
| 54 |
+
|
| 55 |
+
#### Abstraction
|
| 56 |
+
Step in which the system generates an abstractive summary about the query from the best three paragraphs of the previous step.
|
| 57 |
+
""")
|
| 58 |
+
|
| 59 |
if portuguese:
|
| 60 |
environ['PORTUGUESE'] = 'true' # work around (gambiarra)
|
| 61 |
+
query_pt = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo') #text is stored in this variable
|
|
|
|
| 62 |
button = st.button('Gerar resumo')
|
| 63 |
else:
|
| 64 |
environ['PORTUGUESE'] = 'false' # work around (gambiarra)
|
| 65 |
+
query = st.text_input('Type the desired topic to generate the summary') #text is stored in this variable
|
|
|
|
| 66 |
button = st.button('Generate summary')
|
| 67 |
|
| 68 |
+
result = st.container()
|
| 69 |
|
| 70 |
if 'few_documents' not in st.session_state:
|
| 71 |
st.session_state['few_documents'] = False
|
|
|
|
| 88 |
|
| 89 |
if portuguese:
|
| 90 |
result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
|
| 91 |
+
with result.expander(f'Parágrafos usados na geração do resumo'):
|
| 92 |
+
st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
|
| 93 |
else:
|
| 94 |
result.markdown(f'Your summary for "{query}":\n\n> {summary}')
|
| 95 |
+
with result.expander(f'Paragraphs used in summarization'):
|
| 96 |
+
st.markdown(text.replace('\n', '\n\n'))
|
| 97 |
|
| 98 |
Timer.show_total()
|
| 99 |
|
| 100 |
|
| 101 |
if few_documents:
|
| 102 |
st.warning(st.session_state['msg'])
|
| 103 |
+
msg = 'Prosseguir' if portuguese else 'Proceed'
|
| 104 |
+
if st.button(msg):
|
| 105 |
text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
|
| 106 |
summary = summarize(text, summ_model, tokenizer)
|
| 107 |
|
| 108 |
if portuguese:
|
| 109 |
result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
|
| 110 |
+
with result.expander(f'Parágrafos usados na geração do resumo'):
|
| 111 |
+
st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
|
| 112 |
else:
|
| 113 |
result.markdown(f'Your summary for "{query}":\n\n> {summary}')
|
| 114 |
+
with result.expander(f'Paragraphs used in summarization'):
|
| 115 |
+
st.markdown(text.replace('\n', '\n\n'))
|
| 116 |
|
| 117 |
st.session_state['few_documents'] = False
|
| 118 |
few_documents = False
|
extractor/_utils.py
CHANGED
|
@@ -3,6 +3,7 @@ import numpy as np
|
|
| 3 |
import streamlit as st
|
| 4 |
# import inflect
|
| 5 |
import torch
|
|
|
|
| 6 |
|
| 7 |
# p = inflect.engine()
|
| 8 |
|
|
@@ -23,6 +24,13 @@ def document_extraction(dataset, query, keywords, min_document_size, min_just_on
|
|
| 23 |
lower_query = query.lower()
|
| 24 |
lower_keywords = [keyword.lower() for keyword in keywords]
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
documents = {}
|
| 27 |
|
| 28 |
documents['QUERY'] = [
|
|
@@ -61,7 +69,10 @@ def document_extraction(dataset, query, keywords, min_document_size, min_just_on
|
|
| 61 |
if all(empty.values()):
|
| 62 |
# TODO: throw error
|
| 63 |
st.info(empty.values())
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
| 65 |
st.stop()
|
| 66 |
|
| 67 |
if sizes['QUERY'] >= 10:
|
|
@@ -72,10 +83,16 @@ def document_extraction(dataset, query, keywords, min_document_size, min_just_on
|
|
| 72 |
extracted_documents = documents['OR']
|
| 73 |
else:
|
| 74 |
number_of_documents = sizes['OR']
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
return extracted_documents, empty, sizes
|
| 81 |
|
|
|
|
| 3 |
import streamlit as st
|
| 4 |
# import inflect
|
| 5 |
import torch
|
| 6 |
+
from os import environ
|
| 7 |
|
| 8 |
# p = inflect.engine()
|
| 9 |
|
|
|
|
| 24 |
lower_query = query.lower()
|
| 25 |
lower_keywords = [keyword.lower() for keyword in keywords]
|
| 26 |
|
| 27 |
+
if environ['PORTUGUESE'] == 'true':
|
| 28 |
+
portuguese = True
|
| 29 |
+
elif environ['PORTUGUESE'] == 'false':
|
| 30 |
+
portuguese = False
|
| 31 |
+
else:
|
| 32 |
+
raise EnvironmentError
|
| 33 |
+
|
| 34 |
documents = {}
|
| 35 |
|
| 36 |
documents['QUERY'] = [
|
|
|
|
| 69 |
if all(empty.values()):
|
| 70 |
# TODO: throw error
|
| 71 |
st.info(empty.values())
|
| 72 |
+
if portuguese:
|
| 73 |
+
st.warning(f'Nenhum documento encontrado para a query "{query}", por favor, tente com outra query')
|
| 74 |
+
else:
|
| 75 |
+
st.warning(f'No document found for the query "{query}", please try with another query')
|
| 76 |
st.stop()
|
| 77 |
|
| 78 |
if sizes['QUERY'] >= 10:
|
|
|
|
| 83 |
extracted_documents = documents['OR']
|
| 84 |
else:
|
| 85 |
number_of_documents = sizes['OR']
|
| 86 |
+
if portuguese:
|
| 87 |
+
raise FewDocumentsError(documents['OR'], number_of_documents,
|
| 88 |
+
f'Somente {number_of_documents} documentos encontrados para a query "{query}"\n\
|
| 89 |
+
Por favor selecione "Prosseguir" para prosseguir com {number_of_documents} documentos ou tente novamente com outra query'
|
| 90 |
+
)
|
| 91 |
+
else:
|
| 92 |
+
raise FewDocumentsError(documents['OR'], number_of_documents,
|
| 93 |
+
f'Only {number_of_documents} documents found for the query "{query}"\n\
|
| 94 |
+
Please select "Proceed" to proceed with {number_of_documents} documents or try again with another query'
|
| 95 |
+
)
|
| 96 |
|
| 97 |
return extracted_documents, empty, sizes
|
| 98 |
|