Spaces:
Sleeping
Sleeping
Commit
·
1400e5d
1
Parent(s):
684322b
trying pdf to txt converter
Browse files
- app.py +10 -1
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -41,11 +41,20 @@ from utils.ui import reset_results, set_initial_state
|
|
| 41 |
import pandas as pd
|
| 42 |
import haystack
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
# Whether the file upload should be enabled or not
|
| 46 |
DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
|
| 47 |
# Define a function to handle file uploads
|
| 48 |
def upload_files():
|
|
|
|
| 49 |
uploaded_files = upload_container.file_uploader(
|
| 50 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
|
| 51 |
)
|
|
@@ -78,7 +87,7 @@ def process_file(data_file, preprocesor, document_store):
|
|
| 78 |
print(e)
|
| 79 |
|
| 80 |
def reset_documents():
|
| 81 |
-
print('
|
| 82 |
document_store.delete_documents()
|
| 83 |
|
| 84 |
def upload_document():
|
|
|
|
| 41 |
import pandas as pd
|
| 42 |
import haystack
|
| 43 |
|
| 44 |
+
from datetime import datetime
|
| 45 |
+
|
| 46 |
+
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
|
| 47 |
+
|
| 48 |
+
pdf_converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
|
| 49 |
+
docx_converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en","de"])
|
| 50 |
+
txt_converter = TextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
|
| 51 |
+
|
| 52 |
|
| 53 |
# Whether the file upload should be enabled or not
|
| 54 |
DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
|
| 55 |
# Define a function to handle file uploads
|
| 56 |
def upload_files():
|
| 57 |
+
print(f'Uploading files at {datetime.now()}')
|
| 58 |
uploaded_files = upload_container.file_uploader(
|
| 59 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
|
| 60 |
)
|
|
|
|
| 87 |
print(e)
|
| 88 |
|
| 89 |
def reset_documents():
|
| 90 |
+
print('\nReseting documents list at ' + str(datetime.now()) + '\n')
|
| 91 |
document_store.delete_documents()
|
| 92 |
|
| 93 |
def upload_document():
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
safetensors==0.3.3.post1
|
| 2 |
-
farm-haystack[inference,weaviate,opensearch]==1.20.0
|
| 3 |
milvus-haystack
|
| 4 |
streamlit==1.23.0
|
| 5 |
markdown
|
|
|
|
| 1 |
safetensors==0.3.3.post1
|
| 2 |
+
farm-haystack[inference,weaviate,opensearch,file-conversion,pdf]==1.20.0
|
| 3 |
milvus-haystack
|
| 4 |
streamlit==1.23.0
|
| 5 |
markdown
|