Spaces:

surfiniaburger
/

aura-mind-glow

Paused

App Files Files Community

aura-mind-glow / create_index.py

surfiniaburger

symphony

2cbbef6 8 months ago

raw

history blame contribute delete

2.48 kB

	import numpy as np
	import faiss
	from sentence_transformers import SentenceTransformer
	import os

	from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index
	from security import encrypt_data

	# Use a CLIP model that can handle both text and images
	MODEL_NAME = 'clip-ViT-B-32'

	def create_initial_index(documents_dict):
	    """Rebuild the encrypted, persistent knowledge base from text documents.

	    Any existing database and index are deleted first, so the result
	    contains only the documents passed in. Each document is stored as a
	    single encrypted chunk and embedded with the model named by MODEL_NAME;
	    the embeddings are written to a flat L2 FAISS index at INDEX_FILE.

	    Args:
	        documents_dict: Mapping of document name -> text content. Must
	            provide ``.items()``; each content value must be a ``str``.

	    Returns:
	        None. When the mapping is empty, the (empty) database is still
	        created but no FAISS index file is written.
	    """
	    print("Performing a clean rebuild of the knowledge base...")
	    delete_database_and_index()
	    init_db()

	    all_chunks = []
	    all_embeddings = []

	    conn = get_db_connection()
	    try:
	        # Everything that can fail after the connection is opened (model
	        # loading, encryption, encoding, SQL) lives inside this try so the
	        # connection is always released, even on error.
	        cursor = conn.cursor()
	        model = SentenceTransformer(MODEL_NAME)

	        for name, content in documents_dict.items():
	            # Register the document and remember its generated row id.
	            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
	            doc_id = cursor.lastrowid

	            # For initial docs, the whole content is treated as one chunk
	            # and recorded with page number 1.
	            chunk_text = content
	            all_chunks.append(
	                (doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1)
	            )

	            # Encoding a one-element list yields one embedding row per
	            # document; the rows are stacked into a matrix below.
	            text_embedding = model.encode([chunk_text])
	            all_embeddings.append(text_embedding)

	        # Batch insert chunks in the same order as their embeddings, so a
	        # chunk's insertion order matches its position in the FAISS index.
	        cursor.executemany(
	            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
	            all_chunks
	        )
	        conn.commit()
	    finally:
	        conn.close()

	    if not all_embeddings:
	        print("No content to index.")
	        return

	    # Create and save the FAISS index (FAISS expects float32 vectors).
	    embeddings_np = np.vstack(all_embeddings).astype('float32')
	    dimension = embeddings_np.shape[1]
	    index = faiss.IndexFlatL2(dimension)
	    index.add(embeddings_np)
	    faiss.write_index(index, INDEX_FILE)

	    print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
	    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")



	def load_documents(file_paths):
	    """Read text files into a ``{file_path: content}`` mapping.

	    Files that do not exist are skipped with a warning instead of aborting
	    the whole load.

	    Args:
	        file_paths: Iterable of paths to UTF-8 encoded text files.

	    Returns:
	        dict mapping each readable path to its full text content, in the
	        order the paths were given.
	    """
	    documents = {}
	    for file_path in file_paths:
	        try:
	            with open(file_path, 'r', encoding='utf-8') as f:
	                documents[file_path] = f.read()
	        except FileNotFoundError:
	            print(f"Warning: File not found, skipping: {file_path}")
	    return documents


	if __name__ == '__main__':
	    document_files = ["healthy_maize_remedy.txt", "maize_phosphorus_deficiency_remedy.txt", "comic_relief.txt"]
	    # create_initial_index iterates `.items()` on its argument, so it needs
	    # a name -> content mapping; a bare list of contents would raise
	    # AttributeError and lose the document names.
	    create_initial_index(load_documents(document_files))