ClariDoc / app /retrieval /retriever.py
Kshitijk20's picture
changes k value to 15
da21908
from langchain.retrievers import EnsembleRetriever
from app.retrieval.reranker import Reranker
class Retriever:
def __init__(self, pinecone_index, query = None, metadata = None, namespace=None, vectore_store = None,sparse_retriever = None, llm = None):
self.pinecone_index = pinecone_index
self.query = query
self.metadata = metadata
self.namespace = namespace
self.vector_store = vectore_store
self.sparse_retriever = sparse_retriever
self.llm = llm
self.dense_retriever = self.vector_store.as_retriever(
search_type="similarity",
search_kwargs={"k": 15,"namespace": self.namespace, "filter": self.metadata}
)
self.hybrid_retriever = EnsembleRetriever(
retrievers=[self.dense_retriever, sparse_retriever], # Use .retriever attribute
weights=[0.7, 0.3] # Fix: 'weights' not 'weight'
)
def retrieval_from_pinecone_vectoreStore(self):
"""
Retrieve the top matching chunks from Pinecone.
Args:
pinecone_index: Your Pinecone index object.
embedding: The vector embedding of the query.
top_k: How many chunks to retrieve.
filter_meta: Optional metadata filter dict.
Returns:
List of ClauseHit objects (lightweight container for chunk info).
"""
# res = self.pinecone_index.query(
# vector= self.query,
# top_k =top_k ,
# include_metadata = True,
# include_values = False,
# filter = self.metadata,
# namespace = self.namespace
# )
# Process the results into the expected format
# class ClauseHit:
# def __init__(self, doc_id, page, chunk_id, text, metadata, score):
# self.doc_id = doc_id
# self.page = page
# self.chunk_id = chunk_id
# self.text = text
# self.metadata = metadata
# self.score = score
# hits = []
# for match in res['matches']:
# hits.append(ClauseHit(
# doc_id=match['metadata'].get('doc_id', ''),
# page=match['metadata'].get('page_no', -1), # Use page_no instead of page
# chunk_id=match['metadata'].get('chunk_id', ''),
# text=match['metadata'].get('text', match.get('text', '')),
# metadata=match['metadata'],
# score=match['score']
# ))
# return hits
results = self.hybrid_retriever.invoke(self.query)
for doc in results:
print(f"printing Doc content : {doc.page_content}")
if self.llm:
reranker = Reranker(self.llm, results, self.query)
results = reranker.rerank_documents()
return results