Spaces:

botconming
/

hcmut-rag-chatbot

Sleeping

App Files Files Community

hcmut-rag-chatbot / src /Retrieve /retriever.py

botconming

fix vector path

8f1c08f about 1 month ago

raw

history blame contribute delete

2.62 kB

	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer, CrossEncoder


	# Model name passed to SentenceTransformer in load_vector_database().
	BI_ENCODER_MODEL = "intfloat/multilingual-e5-large"
	# Model name passed to CrossEncoder in load_vector_database().
	CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

	# Default candidate count for retrieve_candidates(); hybrid_search() always
	# requests this many candidates before reranking.
	RETRIEVAL_TOP_K = 15
	# Default number of passages kept by rerank() and hybrid_search().
	RERANK_TOP_K = 3


	def load_vector_database():
	    """Load the stored vectors, their texts, and both encoder models.

	    The two ``.npy`` paths are relative, so they resolve against the
	    process's current working directory.

	    Returns:
	        A ``(bi_encoder, cross_encoder, texts, doc_embedding)`` tuple.
	    """
	    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
	    # only safe while these files are produced by this project itself.
	    embedding_matrix = np.load("src/VectorDatabase/embeddings.npy", allow_pickle=True)
	    passages = np.load("src/VectorDatabase/texts.npy", allow_pickle=True)
	    # Tuple elements are evaluated left to right, so the bi-encoder is still
	    # constructed before the cross-encoder, after both arrays are loaded.
	    return (
	        SentenceTransformer(BI_ENCODER_MODEL),
	        CrossEncoder(CROSS_ENCODER_MODEL),
	        passages,
	        embedding_matrix,
	    )

	# Runs at import time: loads both models and the stored arrays once. The
	# functions below read these four names as module-level globals.
	bi_encoder, cross_encoder, texts, doc_embedding = load_vector_database()

	def retrieve_candidates(
	    queries: list[str],
	    top_k: int = RETRIEVAL_TOP_K,
	    *,
	    rrf_offset: int = 30,
	) -> list[int]:
	    """Fuse the per-query dense rankings into one list of document indices.

	    Each query is prefixed with ``"query: "``, embedded with the bi-encoder
	    and compared against ``doc_embedding`` by cosine similarity. The
	    ``top_k`` best documents of every query each contribute
	    ``1 / (rrf_offset + rank)`` (rank starting at 0) to a running score per
	    document; documents are then ordered by that summed score.

	    Args:
	        queries: Query strings to search with.
	        top_k: Number of hits taken per query, and the maximum number of
	            indices returned.
	        rrf_offset: Constant added to the rank in the reciprocal-rank
	            score. Defaults to 30, the value previously hard-coded here.

	    Returns:
	        Up to ``top_k`` row indices into ``doc_embedding`` as plain Python
	        ints, best fused score first. Empty when ``queries`` is empty.
	    """
	    prefixed = [f"query: {q}" for q in queries]
	    query_embeddings = bi_encoder.encode(prefixed, normalize_embeddings=True)

	    fused_scores: dict[int, float] = {}
	    for q_emb in query_embeddings:
	        scores = cosine_similarity(q_emb.reshape(1, -1), doc_embedding)[0]
	        best = np.argsort(scores)[::-1][:top_k]

	        for rank, idx in enumerate(best):
	            # argsort yields numpy integers; store plain ints so the result
	            # really is the annotated list[int].
	            doc_id = int(idx)
	            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + 1 / (rrf_offset + rank)

	    # sorted() is stable, so equal scores keep first-seen (insertion) order.
	    ranked = sorted(fused_scores, key=fused_scores.get, reverse=True)
	    return ranked[:top_k]

	def rerank(question: str, candidate_indices: list[int], top_k: int = RERANK_TOP_K) -> list[str]:
	    """Re-score candidate passages against the question with the cross-encoder.

	    Args:
	        question: Text paired with every candidate passage for scoring.
	        candidate_indices: Indices into the module-level ``texts`` array.
	        top_k: Maximum number of passages to return.

	    Returns:
	        The ``top_k`` candidate texts with the highest cross-encoder score,
	        best first. An empty list when there are no candidates.
	    """
	    candidates = [texts[idx] for idx in candidate_indices]
	    if not candidates:
	        # Nothing to score: skip the model call rather than rely on
	        # predict() accepting an empty batch.
	        return []

	    pairs = [(question, text) for text in candidates]
	    scores = cross_encoder.predict(pairs)

	    # Stable sort: candidates with equal scores keep their retrieval order.
	    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
	    return [text for text, _ in ranked[:top_k]]

	def hybrid_search(query: str, question: str, top_k: int = RERANK_TOP_K) -> list[str]:
	    """Retrieve candidates for both query forms, then rerank against the question.

	    Args:
	        query: First search string passed to the retrieval stage.
	        question: Second search string; also the text the reranker scores
	            candidates against.
	        top_k: Number of passages to return after reranking.

	    Returns:
	        The reranked passage texts, best first.
	    """
	    # De-duplicate while keeping a fixed order. A set iterates in hash order,
	    # which for strings can differ between interpreter runs and would change
	    # how ties are broken in the fused ranking.
	    all_queries = list(dict.fromkeys((query, question)))

	    candidate_indices = retrieve_candidates(all_queries, top_k=RETRIEVAL_TOP_K)
	    return rerank(question, candidate_indices, top_k=top_k)


	def retrieve(query: str, question: str, top_k: int = 5):
	    """Return the top document indices for ``query`` and ``question`` separately.

	    Both strings are prefixed with ``"query: "``, embedded with the
	    bi-encoder, and ranked against ``doc_embedding`` by cosine similarity.

	    Args:
	        query: First search string.
	        question: Second search string.
	        top_k: Number of indices to keep for each string.

	    Returns:
	        A pair ``(query_hits, question_hits)``; each is an array of up to
	        ``top_k`` document indices, highest similarity first.
	    """
	    encoded = bi_encoder.encode(
	        [f"query: {text}" for text in [query, question]],
	        normalize_embeddings=True,
	    )

	    top_hits = []
	    for vector in encoded:
	        similarity = cosine_similarity(vector.reshape(1, -1), doc_embedding)[0]
	        descending = np.argsort(similarity)[::-1]
	        top_hits.append(descending[:top_k])

	    query_hits, question_hits = top_hits[0], top_hits[1]
	    return query_hits, question_hits