| import numpy as np |
| import faiss |
| from sentence_transformers import SentenceTransformer |
| import os |
|
|
| from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index |
| from security import encrypt_data |
|
|
| |
# Name of the sentence-transformers model that create_initial_index() loads to
# embed document text.
# NOTE(review): whatever code queries the FAISS index presumably has to embed
# with this same model so the vectors are comparable -- confirm against the
# search side.
MODEL_NAME = 'clip-ViT-B-32'
|
|
def create_initial_index(documents_dict):
    """
    Create the initial encrypted, persistent index from a dictionary of documents.

    Any existing database and index are deleted first (via
    delete_database_and_index) so the rebuild starts from a clean state. Each
    document is stored as one encrypted 'text' chunk with page_num 1, and one
    embedding per document is written to a flat L2 FAISS index at INDEX_FILE.

    Args:
        documents_dict: Mapping of document name -> document text (str).

    Returns:
        None. Progress and a summary are printed to stdout.

    Raises:
        TypeError: If documents_dict does not provide ``items()``. This is
            checked before anything is deleted, so a bad argument cannot wipe
            an existing knowledge base.
    """
    # Validate up front: the next steps are destructive.
    if not hasattr(documents_dict, 'items'):
        raise TypeError(
            "documents_dict must be a mapping of document name -> text, "
            f"got {type(documents_dict).__name__}"
        )

    print("Performing a clean rebuild of the knowledge base...")
    delete_database_and_index()
    init_db()

    chunk_rows = []
    texts = []
    embeddings_np = None

    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        for name, content in documents_dict.items():
            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
            doc_id = cursor.lastrowid

            # The whole document is a single chunk; only ciphertext is stored.
            chunk_rows.append(
                (doc_id, 'text', encrypt_data(content.encode('utf-8')), 1)
            )
            texts.append(content)

        if texts:
            # Load the model only when there is something to embed, and encode
            # all texts in one batch. Rows come back in input order, so FAISS
            # row i corresponds to the i-th chunk inserted below.
            model = SentenceTransformer(MODEL_NAME)
            embeddings_np = np.ascontiguousarray(
                model.encode(texts), dtype='float32'
            )

        # Embeddings are computed before the commit so that an encoding
        # failure does not leave committed chunks without vectors.
        cursor.executemany(
            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
            chunk_rows
        )
        conn.commit()
    finally:
        # Always release the connection, including when an insert,
        # encryption, or encoding step raises.
        conn.close()

    if embeddings_np is None:
        print("No content to index.")
        return

    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    faiss.write_index(index, INDEX_FILE)

    print(f"Initial encrypted index created with {len(chunk_rows)} chunks.")
    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")
|
|
|
|
|
|
if __name__ == '__main__':
    # Seed documents for the initial knowledge base; missing files are skipped
    # with a warning rather than aborting the rebuild.
    document_files = [
        "healthy_maize_remedy.txt",
        "maize_phosphorus_deficiency_remedy.txt",
        "comic_relief.txt",
    ]

    # create_initial_index iterates documents_dict.items(), so it needs a
    # name -> content mapping (not a bare list of contents). The file path is
    # used as the document name.
    documents_content = {}
    for file_path in document_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                documents_content[file_path] = f.read()
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {file_path}")

    create_initial_index(documents_content)
|
|