File size: 2,115 Bytes
d6e43ef
 
b09d763
d6e43ef
 
b09d763
 
 
 
 
 
 
 
 
 
 
 
 
697b441
a03948e
697b441
b09d763
 
a03948e
b09d763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6e43ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from langchain_chroma import Chroma
from langchain_core.vectorstores import VectorStore
#from task1 import LangchainGeminiWrapper #This is from your old task1 file
import chromadb

from llama_index.embeddings.gemini import GeminiEmbedding
from typing import List, Dict

import chromadb
import os
import pickle


# Retrieve API keys from environment variables
userdata = {
    "GEMINI_API_KEY":os.getenv("GEMINI_API_KEY"),
}
gemini_key = userdata.get("GEMINI_API_KEY")
parent_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(parent_dir, 'split_docs.pkl')

# sync
# Load docs later
with open(pkl_path, 'rb') as f:
   docs = pickle.load(f)
client = chromadb.PersistentClient(path=parent_dir)

# For all subsequent usage:
class LangchainGeminiWrapper:
    """
    Wrapper class to make GeminiEmbedding compatible with Langchain Chroma's interface
    """
    def __init__(self, api_key: str, model_name: str = "models/embedding-001"):
        self.model = GeminiEmbedding(
            api_key=api_key,
            model_name=model_name
        )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed multiple documents
        """
        return [self.model.get_text_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query
        """
        return self.model.get_text_embedding(text)






def load_vector_store(gemini_key: str, persist_directory: str) -> VectorStore:
    gemini_embedder = LangchainGeminiWrapper(api_key=gemini_key)
    return Chroma(
        collection_name="example_collection",
        embedding_function=gemini_embedder,
        persist_directory=persist_directory
    )

class Retriever:
    def __init__(self, vectordb: VectorStore):
        self.vectordb = vectordb

    def retrieve_documents(self, query: str, k: int = 7) -> str:
        docs = self.vectordb.similarity_search(query, k=k)
        return "\nRetrieved documents:\n" + "".join(
            [f"===== Document {str(i)} =====\n" + doc.page_content for i, doc in enumerate(docs)]
        )