Spaces:
Sleeping
Sleeping
| """ | |
| This script creates a Marqo index of preprocessed and original OCR texts. Each page is indexed as one document whose text is split into chunks of 2 sentences, and each chunk is embedded as a vector. | |
| The model used for sentence embedding is https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base. | |
| Code by Michela Vignoli. Parts of this code were developed with assistance from Simon König. | |
| """ | |
| from pprint import pprint | |
| import csv | |
| import marqo as mq | |
##
## Marqo connection
##
# Address of the Marqo server that hosts the indexes.
MARQO_URL = "http://10.103.251.104:8882"

# Single client instance reused by every step further down.
marqoClient = mq.Client(url=MARQO_URL)
# Debug aid: uncomment to list the indexes visible to this client.
#pprint(marqoClient.get_indexes())
##
## Index settings
##
# The embedding model is declared INSIDE the settings dict on purpose: when
# `settings_dict` is handed to `create_index`, the Marqo client sends it as
# the complete index configuration and other keyword arguments (such as
# `model=`) are overridden. Passing the model as a separate keyword would
# silently create the index with the server's default model.
## List of models integrated in Marqo: https://docs.marqo.ai/latest/models/marqo/list-of-models/
settings = {
    "model": "flax-sentence-embeddings/all_datasets_v4_mpnet-base",
    "textPreprocessing": {
        # Each document is chunked into groups of 2 sentences, no overlap.
        "splitLength": 2,
        "splitOverlap": 0,
        "splitMethod": "sentence",
    },
}

##
## Ask if index exists, if not create it
##
indexName = "onit-sonnini-DHd2025-clean"
print("Indexname: ", indexName)

# Names of all indexes currently known to the server.
current_indexes = [d["indexName"] for d in marqoClient.get_indexes()["results"]]

if indexName in current_indexes:
    print(f"Index already exists: {indexName} ")
    # Nothing to create; later steps address the index by name.
    print(f"Defaulting to index connection. Index connected: {indexName} ")
else:  # Create a new index
    print(f"Index does not exist: {indexName} ")
    print(f"Creating index: {indexName} ")
    marqoClient.create_index(indexName, settings_dict=settings)
    # NOTE(review): indentation was lost in the source; this listing is
    # assumed to belong to the creation branch (confirming the new index).
    pprint(marqoClient.get_indexes())
##
## Read the source records
##
# Location of the CSV export; the header row supplies the dictionary keys.
csv_file = 'data/DHd_index-cleaned.csv'

# Materialise every CSV row as a dict (column name -> cell value).
with open(csv_file, mode='r', encoding='utf-8') as handle:
    animal_descriptions = list(csv.DictReader(handle))
def clean_text(text: str) -> str:
    """Return *text* with line breaks turned into spaces and outer whitespace removed.

    Handles Unix (``\\n``), Windows (``\\r\\n``) and old Mac (``\\r``) line
    endings, so no stray carriage return is left behind in the result.
    Each line break becomes exactly one space; consecutive breaks are not
    collapsed.
    """
    # Normalise CRLF first so a Windows line break yields one space, not two.
    unified = text.replace('\r\n', '\n').replace('\r', '\n')
    return unified.replace('\n', ' ').strip()
# Flatten line breaks in all three text variants of every record, in place.
for record in animal_descriptions:
    for column in ('text_orig', 'text_clean', 'text_prep'):
        record[column] = clean_text(record[column])

# Show the first few cleaned records as a quick sanity check.
pprint(animal_descriptions[:3])
##
## Add documents to the index
##
print("Indexing data...")

# Number of documents sent to the server per request.
client_batch_size = 128

# Only `text_clean` is embedded; the remaining fields are stored as-is.
responses = marqoClient.index(indexName).add_documents(
    animal_descriptions,
    client_batch_size=client_batch_size,
    tensor_fields=["text_clean"],
)

# Marqo reports rejected documents in the response (an "errors" flag per
# batch) rather than raising, so the result has to be inspected before
# claiming success. A single dict is normalised to a one-element list.
if isinstance(responses, dict):
    responses = [responses]
responses = responses or []
problem_batches = [r for r in responses if isinstance(r, dict) and r.get("errors")]

if problem_batches:
    print(
        f"WARNING: {len(problem_batches)} of {len(responses)} batches reported errors; "
        f"not all documents were indexed in {indexName}"
    )
else:
    print(f"Data has been indexed in {indexName}")