Spaces:
Runtime error
Runtime error
import logging
import os
from typing import List, Literal, Optional

import streamlit as st
from annotated_text import annotation
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.nodes import DocxToTextConverter, PDFToTextConverter, PreProcessor, TextConverter
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer
from haystack.schema import Document
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from markdown import markdown
from PIL import Image
# Force the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this keeps the Hugging Face tokenizers library from
# using (and warning about) parallelism once the process forks -- confirm.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Disabled legacy loader, kept for reference only (nothing below is executed):
# def load_and_write_data(document_store):
#     doc_dir = './article_txt_got'
#     docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#     document_store.write_documents(docs)
# pipeline = start_haystack()
def load_document(
    file_path: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> "List[Document]":
    """
    Takes a docx, txt or pdf file as input and extracts its text as well as the
    filename as metadata. Image pdf will not be handled.

    The converter is chosen from the file extension (matched case-insensitively).
    The converted text is wrapped in a new Document whose only metadata entry is
    ``{"name": <file name>}``.

    :param file_path: '/'-separated path to the file; the last segment is used as the file name.
    :param encoding: forwarded unchanged to the converter's ``convert`` call.
    :param id_hash_keys: forwarded to the converter and to the returned Document.
    :return: a list containing exactly one haystack.schema.Document.
    :raises ValueError: if the file name does not end in .pdf, .txt or .docx.
    """
    file_name = file_path.split('/')[-1]
    lowered_name = file_name.lower()

    # Exactly one branch can match; anything else is rejected up front instead of
    # failing later on an unbound `converter` variable.
    if lowered_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        raise ValueError(
            "Unsupported file type for '{}': expected a .pdf, .txt or .docx file".format(file_name)
        )

    print("Converting '{}'".format(file_name))

    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    converted = converter.convert(
        file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]

    # Rebuild the Document from the extracted 'content' so that the metadata
    # holds nothing but the file name.
    return [Document(content=converted.content, meta={"name": file_name}, id_hash_keys=id_hash_keys)]
def preprocessing(document,
                  split_by: Literal["sentence", "word"] = 'sentence',
                  split_length: int = 3):
    """
    Takes in haystack document objects, splits each of them into synthetically
    generated paragraphs and applies simple cleaning.

    Every input document is run through a haystack PreProcessor and the content
    of each resulting piece is then passed through ``basic``.
    NOTE(review): ``basic`` is not defined in the visible part of this file; it
    is assumed to be a text-cleaning helper provided elsewhere -- confirm.

    :param document: iterable of haystack documents (e.g. the list returned by
        ``load_document``).
    :param split_by: unit used for splitting. 'sentence' splits without overlap;
        any other value uses a 20-unit overlap and respects sentence boundaries.
    :param split_length: number of ``split_by`` units per generated paragraph.
    :return: list of cleaned haystack documents, one paragraph per object,
        covering all input documents in order. Empty input yields an empty list.
    """
    pending = list(document)
    if not pending:
        # Nothing to split: skip building the PreProcessor altogether.
        return []

    if split_by == 'sentence':
        split_respect_sentence_boundary = False
        split_overlap = 0
    else:
        # NOTE(review): haystack appears to accept
        # split_respect_sentence_boundary=True only together with
        # split_by='word' -- confirm before passing other units.
        split_respect_sentence_boundary = True
        split_overlap = 20

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by=split_by,
        split_length=split_length,
        split_respect_sentence_boundary=split_respect_sentence_boundary,
        split_overlap=split_overlap,
    )

    # Collect the pieces of *all* inputs; reassigning the result per document
    # would keep only the splits of the last one.
    docs_processed = []
    for single_doc in pending:
        for piece in preprocessor.process([single_doc]):
            piece.content = basic(piece.content)
            docs_processed.append(piece)

    print("\n your document has been splitted to", len(docs_processed), "paragraphs")

    return docs_processed