from langchain_chroma import Chroma
from langchain_core.vectorstores import VectorStore
#from task1 import LangchainGeminiWrapper #This is from your old task1 file
import chromadb
from llama_index.embeddings.gemini import GeminiEmbedding
from typing import List, Dict
import chromadb
import os
import pickle
# Read the Gemini API key from the process environment (None when unset).
userdata = {"GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY")}
gemini_key = userdata["GEMINI_API_KEY"]

# Both the pickle file and the Chroma storage live in this module's directory.
parent_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(parent_dir, "split_docs.pkl")

# Load the pickled object stored in split_docs.pkl into `docs`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# this is only safe if split_docs.pkl comes from a trusted source.
with open(pkl_path, "rb") as f:
    docs = pickle.load(f)

# Persistent Chroma client whose storage path is the module directory.
client = chromadb.PersistentClient(path=parent_dir)
# For all subsequent usage:
class LangchainGeminiWrapper:
    """Adapter that lets a GeminiEmbedding be used through Langchain Chroma's interface.

    It exposes ``embed_documents`` and ``embed_query``, both of which delegate
    to ``GeminiEmbedding.get_text_embedding``.
    """

    def __init__(self, api_key: str, model_name: str = "models/embedding-001"):
        """Create the wrapped GeminiEmbedding.

        Args:
            api_key: Passed to GeminiEmbedding as ``api_key``.
            model_name: Passed to GeminiEmbedding as ``model_name``.
        """
        embedding_options = {"api_key": api_key, "model_name": model_name}
        self.model = GeminiEmbedding(**embedding_options)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per entry of ``texts``, in the same order."""
        embed_one = self.model.get_text_embedding
        return list(map(embed_one, texts))

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string."""
        query_vector = self.model.get_text_embedding(text)
        return query_vector
def load_vector_store(gemini_key: str, persist_directory: str) -> VectorStore:
    """Build a Chroma store for "example_collection" that embeds with Gemini.

    Args:
        gemini_key: API key handed to LangchainGeminiWrapper.
        persist_directory: Passed to Chroma as ``persist_directory``.

    Returns:
        The constructed Chroma instance.
    """
    embedding_fn = LangchainGeminiWrapper(api_key=gemini_key)
    store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_fn,
        collection_name="example_collection",
    )
    return store
class Retriever:
    """Run similarity searches against a vector store and format the hits as text."""

    def __init__(self, vectordb: VectorStore):
        """Keep a reference to the vector store that will be queried.

        Args:
            vectordb: Object providing ``similarity_search(query, k=...)``.
        """
        self.vectordb = vectordb

    def retrieve_documents(self, query: str, k: int = 7) -> str:
        """Return the results of a similarity search as one formatted string.

        Args:
            query: Text passed to ``similarity_search``.
            k: Passed to ``similarity_search`` as ``k``.

        Returns:
            A string starting with ``"\\nRetrieved documents:\\n"`` followed by
            one ``===== Document i =====`` section per result, each holding
            that result's ``page_content``.
        """
        # Named `hits` so the module-level `docs` is not shadowed.
        hits = self.vectordb.similarity_search(query, k=k)
        sections = [
            f"===== Document {i} =====\n{hit.page_content}"
            for i, hit in enumerate(hits)
        ]
        # Join with a newline so a document's last line never runs straight
        # into the next header; output for zero or one hit is unchanged.
        return "\nRetrieved documents:\n" + "\n".join(sections)