Spaces:
Build error
Build error
| import streamlit as st | |
| from haystack.document_stores import InMemoryDocumentStore | |
| from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever | |
| from haystack.schema import Document | |
| import logging | |
def start_haystack():
    """Create the Haystack components the app works with.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order:
        an in-memory document store, a ``TransformersSummarizer`` loaded from
        ``google/pegasus-newsroom``, and a ``PreProcessor`` configured to clean
        the text and split it by word (``split_length=100``) while respecting
        sentence boundaries.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options handed to the PreProcessor.
    splitting_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 100,
        "split_respect_sentence_boundary": True,
    }
    text_splitter = PreProcessor(**splitting_options)

    pegasus = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
    return store, pegasus, text_splitter
def _convert_uploaded_pdf(converter, pdf):
    """Convert one uploaded PDF to a document using *converter*.

    Objects exposing ``getvalue()`` (in-memory uploads) are first written to a
    temporary ``.pdf`` file, because the converter is given a file path and an
    upload's ``name`` is not a path on disk. Any other object is assumed to
    carry a real path in ``pdf.name``.
    """
    import os
    import tempfile

    read_bytes = getattr(pdf, "getvalue", None)
    if read_bytes is None:
        return converter.convert(file_path=pdf.name, meta=None)[0]

    # delete=False + manual unlink: the file must be closed before another
    # process/library opens it by name (required on Windows).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(read_bytes())
        tmp_path = tmp.name
    try:
        return converter.convert(file_path=tmp_path, meta=None)[0]
    finally:
        os.unlink(tmp_path)


def pdf_to_document_store(pdf_files):
    """Convert, preprocess and index the given PDF files.

    Each file is converted to text, run through the module-level
    ``preprocessor`` and the resulting pieces are written to the module-level
    ``document_store``. Intermediate sizes and the final document count are
    shown on the Streamlit page.

    Args:
        pdf_files: Iterable of uploaded files (objects with ``getvalue()``)
            or of objects whose ``name`` attribute is a path to a PDF on disk.
    """
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    for pdf in pdf_files:
        doc = _convert_uploaded_pdf(converter, pdf)
        # Report the length of the extracted text; the document object itself
        # is not sized.
        st.write(len(doc.content))
        st.write(doc)
        preprocessed_docs = preprocessor.process([doc])
        st.write(len(preprocessed_docs))
        # process() yields a list of pieces: extend keeps `documents` flat so
        # the store receives documents rather than nested lists.
        documents.extend(preprocessed_docs)
    document_store.write_documents(documents)
    st.write('Document count: ', document_store.get_document_count())
def summarize(files):
    """Index *files*, then summarize everything held in the document store.

    The files are first passed to ``pdf_to_document_store``; afterwards all
    documents currently in the module-level ``document_store`` are handed to
    the module-level ``summarizer`` with ``generate_single_summary=False`` and
    the result is rendered on the Streamlit page.

    Args:
        files: The uploaded PDF files to index before summarizing.
    """
    pdf_to_document_store(files)
    stored_documents = document_store.get_all_documents()
    result = summarizer.predict(
        documents=stored_documents,
        generate_single_summary=False,
    )
    st.write(result)
# Streamlit re-executes this script from the top on every widget interaction.
# Keeping the components in the per-session state means the document store
# (and the loaded model) survive button clicks instead of being rebuilt empty.
if "haystack_components" not in st.session_state:
    st.session_state["haystack_components"] = start_haystack()
document_store, summarizer, preprocessor = st.session_state["haystack_components"]

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)
if uploaded_files is not None:
    st.write(len(uploaded_files))

if st.button('Summarize Documents'):
    # With accept_multiple_files=True an empty upload is an empty list, so
    # guard here rather than summarizing an empty document store.
    if uploaded_files:
        summarize(uploaded_files)
    else:
        st.warning("Please upload at least one PDF file before summarizing.")

if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()