Spaces:
Sleeping
Sleeping
"""Index the text files under ``doc_dir`` into an Elasticsearch document store.

The script builds a three-node Haystack pipeline
(TextConverter -> PreProcessor -> DocumentStore) and runs it once over every
regular file found directly inside ``doc_dir``.
"""
import os

from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import PreProcessor, TextConverter
from haystack.utils import fetch_archive_from_http

doc_dir = "data/JBNU-FOCUS"

# The Elasticsearch host can be overridden through the environment.  The
# fallback is the address the script previously hard-coded, so running without
# ELASTICSEARCH_HOST set connects to the same server as before.
host = os.environ.get("ELASTICSEARCH_HOST", "121.186.58.11")
print(host)

document_store = ElasticsearchDocumentStore(
    host=host,  # previously a literal IP, which made the env lookup a no-op
    username="",
    password="",
    index="document",
)

indexing_pipeline = Pipeline()
text_converter = TextConverter()

# Split on words: chunks of 200 with an overlap of 20, with the
# sentence-boundary option enabled; all three cleaning options are on.
preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)

# Wire the nodes in order: File -> TextConverter -> PreProcessor -> DocumentStore.
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

# os.listdir also returns sub-directories; keep regular files only so that a
# directory is never passed to the converter.  Sorting makes the indexing
# order deterministic across runs.
files_to_index = [
    path
    for path in (os.path.join(doc_dir, entry) for entry in sorted(os.listdir(doc_dir)))
    if os.path.isfile(path)
]
indexing_pipeline.run_batch(file_paths=files_to_index)