Spaces:

peter2000
/

policy_test

Runtime error

App Files Files Community

peter2000 committed on Sep 29, 2022

Commit

5cf4bb5

1 Parent(s): bca1994

Update scripts/process.py

Browse files

Files changed (1) hide show

scripts/process.py +34 -33

scripts/process.py CHANGED Viewed

@@ -69,38 +69,39 @@ def load_document(
     extraction fails via Haystack.
     Returns a list of type haystack.schema.Document
     """
-    st.write(file_name)
-    if file_name.endswith('.pdf'):
-        converter = PDFToTextConverter(remove_numeric_tables=True)
-    if file_name.endswith('.txt'):
-        converter = TextConverter()
-    if file_name.endswith('.docx'):
-        converter = DocxToTextConverter()
-    documents = []
-    logger.info("Converting {}".format(file_name))
-    # PDFToTextConverter, TextConverter, and DocxToTextConverter
-    # return a list containing a single Document
-    document = converter.convert(
-                file_path=file_path, meta=None,
-                encoding=encoding, id_hash_keys=id_hash_keys
-                )[0]
-    text = document.content
-    documents.append(Document(content=text,
-                              meta={"name": file_name},
-                              id_hash_keys=id_hash_keys))
-    '''check if text is empty and apply different pdf processor. \
-    This can happen whith certain pdf types.'''
-    for i in documents:
-        if i.content == "":
-            st.write("using pdfplumber")
-            text = []
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages:
-                    text.append(page.extract_text())
-            i.content = ' '.join([page for page in text])
     return documents
@@ -126,7 +127,7 @@ def preprocessing(document):
         for item in docs_processed:
             item.content = basic(item.content)
-    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
     # create dataframe of text and list of all text
     #df = pd.DataFrame(docs_processed)

     extraction fails via Haystack.
     Returns a list of type haystack.schema.Document
     """
+    with st.spinner("👑 Uploading file"):#+file.name+"..."):
+        try:
+            if file_name.endswith('.pdf'):
+                converter = PDFToTextConverter(remove_numeric_tables=True)
+            if file_name.endswith('.txt'):
+                converter = TextConverter()
+            if file_name.endswith('.docx'):
+                converter = DocxToTextConverter()
+            documents = []
+            #logger.info("Converting {}".format(file_name))
+            # PDFToTextConverter, TextConverter, and DocxToTextConverter
+            # return a list containing a single Document
+            document = converter.convert(
+                        file_path=file_path, meta=None,
+                        encoding=encoding, id_hash_keys=id_hash_keys
+                        )[0]
+            text = document.content
+            documents.append(Document(content=text,
+                                      meta={"name": file_name},
+                                      id_hash_keys=id_hash_keys))
+            '''check if text is empty and apply different pdf processor. \
+            This can happen whith certain pdf types.'''
+            for i in documents:
+                if i.content == "":
+                    st.write("using pdfplumber")
+                    text = []
+                    with pdfplumber.open(file_path) as pdf:
+                        for page in pdf.pages:
+                            text.append(page.extract_text())
+                    i.content = ' '.join([page for page in text])
     return documents
         for item in docs_processed:
             item.content = basic(item.content)
+    #st.write("your document has been splitted to", len(docs_processed), "paragraphs")
     # create dataframe of text and list of all text
     #df = pd.DataFrame(docs_processed)