File size: 1,958 Bytes
ab5dcab
9797603
ab5dcab
 
 
 
 
9797603
 
ab5dcab
9797603
ab5dcab
 
9797603
ab5dcab
 
 
 
9797603
 
 
ab5dcab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0438c70
 
ab5dcab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from typing import Literal, List

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document


# Retrieval strategy names; used as the type of get_retriever's
# ``retrieval_mode`` argument.
RetrievalMode = Literal["mmr", "similarity", "hybrid"]


def get_vectorstore(persist_dir: str) -> Chroma:
    """Build a Chroma store bound to *persist_dir*.

    Args:
        persist_dir: Directory handed to Chroma as ``persist_directory``.

    Returns:
        A ``Chroma`` instance whose embedding function is a default-configured
        ``OpenAIEmbeddings``.
    """
    return Chroma(
        persist_directory=persist_dir,
        embedding_function=OpenAIEmbeddings(),
    )


class HybridRetriever(BaseRetriever):
    """Retriever that blends similarity-search hits with MMR hits.

    Both searches are issued against the same Chroma store. The two ranked
    lists are interleaved (similarity hit first at each rank), de-duplicated on
    ``(metadata["source"], page_content)`` and cut off at ``top_k`` documents.
    """

    # Vector store that both searches are run against.
    db: Chroma
    # Maximum number of documents returned for a query.
    top_k: int

    def _get_relevant_documents(
            self,
            query: str,
            *,
            run_manager: CallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Return up to ``top_k`` unique documents for *query*.

        Args:
            query: Text to search for.
            run_manager: Callback manager supplied by the retriever framework;
                not used here.

        Returns:
            De-duplicated documents, alternating between the similarity ranking
            and the MMR ranking. Empty when ``top_k`` is not positive.
        """
        if self.top_k <= 0:
            return []

        # Over-fetch dense hits so there is room to backfill after duplicates
        # are dropped.
        dense = self.db.similarity_search(query, k=self.top_k * 2)
        mmr = self.db.max_marginal_relevance_search(
            query,
            k=self.top_k,
            fetch_k=self.top_k * 3,
        )

        docs: List[Document] = []
        seen = set()
        # Walk both rankings rank-by-rank. Concatenating ``dense + mmr`` would
        # let the over-fetched dense list fill every slot before any MMR hit is
        # looked at, so the MMR query would almost never influence the result.
        for rank in range(max(len(dense), len(mmr))):
            for ranked in (dense, mmr):
                if rank >= len(ranked):
                    continue
                d = ranked[rank]
                key = (d.metadata.get("source"), d.page_content)
                if key in seen:
                    continue
                seen.add(key)
                docs.append(d)
                if len(docs) >= self.top_k:
                    return docs
        return docs


def get_retriever(
        persist_dir: str,
        top_k: int,
        retrieval_mode: RetrievalMode = "hybrid"
):
    """Create a retriever over the Chroma store at *persist_dir*.

    Args:
        persist_dir: Directory passed through to ``get_vectorstore``.
        top_k: Number of documents the retriever should return.
        retrieval_mode: Strategy name, compared case-insensitively.
            ``"hybrid"`` yields a ``HybridRetriever``; ``"similarity"`` yields
            the store's similarity retriever; any other value yields the
            store's MMR retriever.

    Returns:
        The retriever object for the selected strategy.
    """
    store = get_vectorstore(persist_dir=persist_dir)
    selected = retrieval_mode.lower()

    if selected == "hybrid":
        return HybridRetriever(db=store, top_k=top_k)

    if selected == "similarity":
        search_type = "similarity"
        search_kwargs = {"k": top_k}
    else:
        # Everything that is not "hybrid" or "similarity" ends up here.
        search_type = "mmr"
        search_kwargs = {
            "k": top_k,
            "fetch_k": max(top_k * 3, top_k + 2),
        }

    return store.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs,
    )