from typing import List import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sentence_transformers import CrossEncoder from langchain_community.vectorstores import FAISS from langchain_core.documents import Document class HybridReranker: def __init__( self, vector_store: FAISS, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", device: str = 'cpu', cache_dir: str = "/app/huggingface_cache" ): self.vector_store = vector_store print(f"loading CrossEncoder. saving in: {cache_dir}") self.reranker = CrossEncoder(reranker_model, max_length=512, device=device, cache_folder=cache_dir) docs_in_order = list(self.vector_store.docstore._dict.values()) self.chunk_texts = [doc.page_content for doc in docs_in_order] self.chunk_metadata = [doc.metadata for doc in docs_in_order] print("building tf-idf") self.vectorizer = TfidfVectorizer() self.tfidf_matrix = self.vectorizer.fit_transform(self.chunk_texts) print("reranker ready") def retrieve_and_rerank( self, query: str, top_k_dense: int = 20, top_k_final: int = 5, ) -> List[Document]: dense_docs = self.vector_store.similarity_search(query, k=top_k_dense) q_vec = self.vectorizer.transform([query]) sparse_scores = (self.tfidf_matrix @ q_vec.T).toarray().ravel() sparse_indices = np.argsort(-sparse_scores)[:top_k_dense] sparse_docs = [ Document(page_content=self.chunk_texts[i], metadata=self.chunk_metadata[i]) for i in sparse_indices ] combined_docs = [] seen_contents = set() for doc in dense_docs + sparse_docs: if doc.page_content not in seen_contents: combined_docs.append(doc) seen_contents.add(doc.page_content) pairs = [[query, doc.page_content] for doc in combined_docs] rerank_scores = self.reranker.predict(pairs) doc_scores = list(zip(combined_docs, rerank_scores)) sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True) final_docs = [doc for doc, score in sorted_doc_scores[:top_k_final]] return final_docs