Spaces:

chelleboyer
/

cert-challenge

Sleeping

File size: 2,739 Bytes

0389a81

from aimakerspace.vectordatabase import VectorDatabase
import asyncio
import numpy as np
from langchain_core.documents import Document
from typing import List, Dict, Any, Callable

class SimpleRetriever:
    """Lightweight retriever with no Pydantic dependency.

    Combines a vector database with an ID-to-text lookup table and turns
    search hits into ``Document`` objects.
    """

    def __init__(self, vector_db, text_map, k=5):
        """Remember the collaborators needed at query time.

        Args:
            vector_db: Object whose ``search_by_text`` method performs the search.
            text_map: Lookup from document ID to the full text of that document.
            k: How many results to request per query.
        """
        self.k = k
        self.text_map = text_map
        self.vector_db = vector_db

    def invoke(self, query: str) -> List[Document]:
        """Search for ``query`` and wrap every hit in a ``Document``.

        Each hit is unpacked as an ``(id, score)`` pair. The document's
        ``page_content`` is the text stored for that ID, or a placeholder
        message when the ID has no entry in ``text_map``. The score and ID
        are carried along in ``metadata``.
        """
        hits = self.vector_db.search_by_text(
            query, k=self.k, return_as_text=False
        )
        known_texts = self.text_map

        # Resolve IDs to real content; unknown IDs get a placeholder body.
        return [
            Document(
                page_content=(
                    known_texts[hit_id]
                    if hit_id in known_texts
                    else f"Document {hit_id} content not available"
                ),
                metadata={"score": hit_score, "id": hit_id},
            )
            for hit_id, hit_score in hits
        ]

    def __call__(self, query):
        """Allow the retriever to be used like a plain function."""
        return self.invoke(query)

def create_vector_store(embeddings, texts=None):
    """Build a ``VectorDatabase`` from embeddings and attach a retriever factory.

    Every embedding is inserted under the ID ``text_<index>``. When ``texts``
    is non-empty and has the same length as ``embeddings``, each ID is mapped
    to its text. Otherwise every ID is mapped to a placeholder string; if
    ``texts`` was supplied but its length does not match, a warning is logged
    so the fallback is not silent.

    Args:
        embeddings: Embedding vectors, one per document.
        texts: Optional document texts aligned one-to-one with ``embeddings``.

    Returns:
        The populated ``VectorDatabase``. An ``as_retriever`` attribute is set
        on the instance; calling it returns a ``SimpleRetriever``.
    """
    # Function-scope import so this block does not depend on file-level edits.
    import logging

    vector_db = VectorDatabase()

    # Maps each generated document ID to the content returned by the retriever.
    text_map = {}

    texts_given = bool(texts)
    use_texts = texts_given and len(texts) == len(embeddings)
    if texts_given and not use_texts:
        # Keep the best-effort fallback, but make the mismatch visible.
        logging.getLogger(__name__).warning(
            "create_vector_store: got %d texts for %d embeddings; "
            "storing placeholder content instead",
            len(texts),
            len(embeddings),
        )

    if use_texts:
        pairs = zip(texts, embeddings)
    else:
        pairs = ((None, embedding) for embedding in embeddings)

    for index, (text, embedding) in enumerate(pairs):
        doc_id = f"text_{index}"
        vector_db.insert(doc_id, embedding)
        text_map[doc_id] = (
            text
            if use_texts
            else f"Content for document {doc_id} not available"
        )

    def as_retriever(search_kwargs=None):
        """Return a ``SimpleRetriever`` over this store.

        ``search_kwargs["k"]`` sets the number of results; it defaults to 5
        when ``search_kwargs`` is missing, empty, or has no ``"k"`` key.
        """
        k = search_kwargs.get("k", 5) if search_kwargs else 5
        return SimpleRetriever(vector_db=vector_db, text_map=text_map, k=k)

    # Set on the instance, so it is called without an implicit ``self``.
    vector_db.as_retriever = as_retriever

    return vector_db