import streamlit as st
from typing import Callable, Dict, List, Optional
import re
import os
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from haystack.schema import Answer, Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever, PreProcessor
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter, DocxToTextConverter
import logging
from markdown import markdown
from annotated_text import annotation
from PIL import Image

logger = logging.getLogger(__name__)

os.environ['TOKENIZERS_PARALLELISM'] = "false"
#def load_and_write_data(document_store):
#    doc_dir = './article_txt_got'
#    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#    document_store.write_documents(docs)
def basic(s):
    """
    :param s: string to be processed
    :return: processed string; see the comments in the source code for details
    """
    # Text lowercase
    #s = s.lower()
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation)
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    #s = re.sub('\n', ' ', s)
    # Remove distracting single quotes
    #s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non-alphanumeric characters
    #s = re.sub(r'\d+', ' ', s)
    #s = re.sub(r'\W+', ' ', s)
    # Define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    return s.strip()
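# Illustrative behaviour of basic() — the sample string below is hypothetical,
# not app data. URL tokens are replaced by a single space and outer whitespace
# is trimmed; interior spacing is left untouched:
# >>> basic("  read http://example.org/docs now  ")
# 'read   now'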
def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes .docx, .txt and .pdf files as input and extracts the text
    as well as the filename as metadata. Since Haystack's converters
    do not handle every PDF, the surrounding app can fall back to
    pdfplumber when PDF extraction via Haystack fails (not shown here).
    Returns a list of haystack.schema.Document.
    """
    with st.spinner("Uploading " + file_name + "..."):
        if file_name.endswith('.pdf'):
            converter = PDFToTextConverter(remove_numeric_tables=True)
        elif file_name.endswith('.txt'):
            converter = TextConverter()
        elif file_name.endswith('.docx'):
            converter = DocxToTextConverter()
        else:
            raise ValueError("Unsupported file type: {}".format(file_name))
        documents = []
        #logger.info("Converting {}".format(file_name))
        # PDFToTextConverter, TextConverter, and DocxToTextConverter
        # return a list containing a single Document
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]
        text = document.content
        documents.append(Document(content=text,
                                  meta={"name": file_name},
                                  id_hash_keys=id_hash_keys))
    return documents
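# Example call (a sketch: "sample.pdf" is a hypothetical path; in the running
# app, file_path and file_name would typically come from st.file_uploader):
#
#   docs = load_document(file_path="sample.pdf", file_name="sample.pdf")
#   st.write(docs[0].meta["name"])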
def preprocessing(document):
    """
    Takes a list of haystack Document objects, splits each into passages
    of three sentences (with a one-sentence overlap) and applies the
    simple cleaning from basic().
    Returns the cleaned list of haystack Document objects, one passage per object.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    # collect the splits from every input document
    docs_processed = []
    for doc in document:
        docs_processed.extend(preprocessor.process([doc]))
    for item in docs_processed:
        item.content = basic(item.content)
    #st.write("your document has been split into", len(docs_processed), "paragraphs")
    # create dataframe of text and list of all text
    #df = pd.DataFrame(docs_processed)
    #all_text = " ".join(df.content.to_list())
    #par_list = df.content.to_list()
    return docs_processed  #, df, all_text, par_list
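# How the processed passages would feed the QA components imported above —
# a minimal sketch, not the app's full wiring; the model name and top_k
# values are assumptions, not taken from this file:
def build_qa_pipeline(docs_processed: List[Document]) -> ExtractiveQAPipeline:
    document_store = InMemoryDocumentStore()
    document_store.write_documents(docs_processed)
    retriever = TfidfRetriever(document_store=document_store)
    # model choice is an assumption; any extractive QA model works here
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    return ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Usage sketch:
#   pipeline = build_qa_pipeline(preprocessing(load_document(path, name)))
#   prediction = pipeline.run(query="your question",
#                             params={"Retriever": {"top_k": 10},
#                                     "Reader": {"top_k": 5}})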