Spaces:

dev-jas
/

CodeMind

Running

File size: 2,361 Bytes

80b95e8
 
 
 
 
 
eae02ee
 
 
80b95e8
 
 
 
eae02ee
 
 
 
 
 
 
80b95e8
 
eae02ee
 
 
 
80b95e8
 
eae02ee
 
80b95e8
eae02ee
80b95e8
eae02ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80b95e8
eae02ee
80b95e8
 
eae02ee

"""

This script handles document embedding using EmbeddingGemma.

This is the entry point for indexing documents.

"""

import os
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


def embed_documents(path: str, config: dict):
    """

    Embed documents from a directory and save to FAISS index.



    Args:

        path (str): Path to the directory containing the documents to embed.

        config (dict): Configuration dictionary.

    """
    try:
        model = SentenceTransformer(config["embedding"]["model_path"])
        print(f"Initalized embedding model: {config['embedding']['model_path']}")
    except ValueError as e:
        print(f"Error initializing embedding model: {e}")
        return []

    embeddings = []
    texts = []
    filenames = []

    # Read all documents
    for fname in os.listdir(path):
        fpath = os.path.join(path, fname)
        if os.path.isfile(fpath):
            try:
                with open(fpath, "r", encoding="utf-8") as f:
                    text = f.read()
                    if text.strip():  # Only process non-empty files
                        emb = model.encode(text)
                        embeddings.append(emb)
                        texts.append(text)
                        filenames.append(fname)
            except Exception as e:
                print(f"Error reading file {fpath}: {e}")

    if not embeddings:
        print("No documents were successfully embedded.")
        return []

    # Create FAISS index
    dimension = embeddings[0].shape[0]
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings for cosine similarity
    embeddings_matrix = np.array(embeddings).astype("float32")
    faiss.normalize_L2(embeddings_matrix)
    # Add embeddings to index
    index.add(embeddings_matrix)

    # Save FAISS index and metadata
    os.makedirs("vector_cache", exist_ok=True)
    faiss.write_index(index, "vector_cache/faiss_index.bin")

    with open("vector_cache/metadata.pkl", "wb") as f:
        pickle.dump({"texts": texts, "filenames": filenames}, f)

    print(f"Saved FAISS index to vector_cache/ with {len(embeddings)} documents.")
    print(f"Total embeddings created: {len(embeddings)}")

    return list(zip(filenames, embeddings))