""" This script handles document embedding using EmbeddingGemma. This is the entry point for indexing documents. """ import os import pickle import faiss import numpy as np from sentence_transformers import SentenceTransformer def embed_documents(path: str, config: dict): """ Embed documents from a directory and save to FAISS index. Args: path (str): Path to the directory containing the documents to embed. config (dict): Configuration dictionary. """ try: model = SentenceTransformer(config["embedding"]["model_path"]) print(f"Initalized embedding model: {config['embedding']['model_path']}") except ValueError as e: print(f"Error initializing embedding model: {e}") return [] embeddings = [] texts = [] filenames = [] # Read all documents for fname in os.listdir(path): fpath = os.path.join(path, fname) if os.path.isfile(fpath): try: with open(fpath, "r", encoding="utf-8") as f: text = f.read() if text.strip(): # Only process non-empty files emb = model.encode(text) embeddings.append(emb) texts.append(text) filenames.append(fname) except Exception as e: print(f"Error reading file {fpath}: {e}") if not embeddings: print("No documents were successfully embedded.") return [] # Create FAISS index dimension = embeddings[0].shape[0] index = faiss.IndexFlatIP(dimension) # Normalize embeddings for cosine similarity embeddings_matrix = np.array(embeddings).astype("float32") faiss.normalize_L2(embeddings_matrix) # Add embeddings to index index.add(embeddings_matrix) # Save FAISS index and metadata os.makedirs("vector_cache", exist_ok=True) faiss.write_index(index, "vector_cache/faiss_index.bin") with open("vector_cache/metadata.pkl", "wb") as f: pickle.dump({"texts": texts, "filenames": filenames}, f) print(f"Saved FAISS index to vector_cache/ with {len(embeddings)} documents.") print(f"Total embeddings created: {len(embeddings)}") return list(zip(filenames, embeddings))