# hcmut-rag-chatbot/src/Retrieve/retriever.py
# botconming's picture
# fix vector path
# 8f1c08f
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder
# Model name passed to SentenceTransformer; used to embed queries.
BI_ENCODER_MODEL = "intfloat/multilingual-e5-large"
# Model name passed to CrossEncoder; used to score (question, text) pairs.
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Default number of candidate documents kept by the embedding-similarity stage.
RETRIEVAL_TOP_K = 15
# Default number of texts returned after cross-encoder re-ranking.
RERANK_TOP_K = 3
def load_vector_database(
    embeddings_path: str = "src/VectorDatabase/embeddings.npy",
    texts_path: str = "src/VectorDatabase/texts.npy",
):
    """Load the stored arrays and instantiate the two models.

    Args:
        embeddings_path: ``.npy`` file holding the document embeddings.
            A relative path is resolved against the current working
            directory, so the default only works when the process is
            started from the directory that contains ``src/``.
        texts_path: ``.npy`` file holding the document texts.

    Returns:
        The tuple ``(bi_encoder, cross_encoder, texts, doc_embedding)``.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
    # objects, so both files must come from a trusted source.
    doc_embedding = np.load(embeddings_path, allow_pickle=True)
    texts = np.load(texts_path, allow_pickle=True)

    bi_encoder = SentenceTransformer(BI_ENCODER_MODEL)
    cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
    return bi_encoder, cross_encoder, texts, doc_embedding
# Load the models and stored arrays once, when this module is imported;
# the functions below read these module-level names.
bi_encoder, cross_encoder, texts, doc_embedding = load_vector_database()
def retrieve_candidates(
    queries: list[str],
    top_k: int = RETRIEVAL_TOP_K,
    rrf_k: int = 30,
) -> list[int]:
    """Return document indices that rank best across all ``queries``.

    Each query is embedded with the bi-encoder and compared with every
    document embedding by cosine similarity. The per-query rankings are
    then merged with a reciprocal-rank style score: a document gains
    ``1 / (rrf_k + rank)`` for every query whose top ``top_k`` contains
    it, where ``rank`` starts at 0.

    Args:
        queries: Query strings; ``"query: "`` is prepended to each one
            before encoding.
        top_k: Number of documents taken per query, and also the maximum
            number of indices returned.
        rrf_k: Offset added to the rank in the fusion score.

    Returns:
        At most ``top_k`` document indices as plain Python ints, ordered
        by descending fused score.
    """
    if not queries:
        # Nothing to encode, so there are no candidates.
        return []

    prefixed = [f"query: {q}" for q in queries]
    query_embeddings = bi_encoder.encode(prefixed, normalize_embeddings=True)

    # A single call scores every query against every document, so the
    # document matrix is processed once rather than once per query.
    score_matrix = cosine_similarity(query_embeddings, doc_embedding)

    combined_scores: dict[int, float] = {}
    for scores in score_matrix:
        indices = np.argsort(scores)[::-1][:top_k]
        for rank, idx in enumerate(indices):
            # argsort yields numpy integers; convert so the result really
            # is a list[int].
            doc_id = int(idx)
            combined_scores[doc_id] = combined_scores.get(doc_id, 0.0) + 1 / (rrf_k + rank)

    sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
    return sorted_indices[:top_k]
def rerank(question: str, candidate_indices: list[int], top_k: int = RERANK_TOP_K) -> list[str]:
    """Re-order candidate texts by cross-encoder score against ``question``.

    Args:
        question: Text paired with every candidate for scoring.
        candidate_indices: Positions in the module-level ``texts`` array.
        top_k: Maximum number of texts to return.

    Returns:
        Up to ``top_k`` candidate texts, highest score first. An empty
        list is returned when there are no candidates.
    """
    # len() rather than truthiness so that an index array is accepted too.
    if len(candidate_indices) == 0:
        # Do not call the model with an empty batch.
        # NOTE(review): some sentence-transformers versions appear to
        # index the first pair and would fail here - confirm.
        return []

    candidates = [texts[idx] for idx in candidate_indices]
    pairs = [(question, text) for text in candidates]
    scores = cross_encoder.predict(pairs)

    # sorted() is stable, so equal scores keep their candidate order.
    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [text for text, _ in ranked[:top_k]]
def hybrid_search(query: str, question: str, top_k: int = RERANK_TOP_K) -> list[str]:
    """Retrieve candidates for both inputs, then re-rank them.

    Args:
        query: First search string.
        question: Second search string; also the text the candidates are
            re-ranked against.
        top_k: Maximum number of texts to return.

    Returns:
        Up to ``top_k`` texts, ordered by the re-ranker.
    """
    # De-duplicate while keeping (query, question) order. A set of
    # strings iterates in an order that can change between interpreter
    # runs, which altered how ties are broken in the fused ranking.
    all_queries = list(dict.fromkeys((query, question)))
    candidate_indices = retrieve_candidates(all_queries, top_k=RETRIEVAL_TOP_K)
    return rerank(question, candidate_indices, top_k=top_k)
def retrieve(query: str, question: str, top_k: int = 5):
    """Return the top document indices for ``query`` and for ``question``.

    Both strings get the ``"query: "`` prefix, are embedded together,
    and are ranked separately against the document embeddings by cosine
    similarity.

    Returns:
        A pair ``(query_indices, question_indices)``; each holds up to
        ``top_k`` indices, most similar first.
    """
    encoded = bi_encoder.encode(
        [f"query: {text}" for text in [query, question]],
        normalize_embeddings=True,
    )

    def _best_matches(vector):
        # Similarity of one embedding to every document, best first.
        similarities = cosine_similarity(vector.reshape(1, -1), doc_embedding)[0]
        return np.argsort(similarities)[::-1][:top_k]

    per_input = [_best_matches(vector) for vector in encoded]
    return per_input[0], per_input[1]