Spaces:
Sleeping
Sleeping
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sentence_transformers import SentenceTransformer, CrossEncoder | |
# Pool sizes for the two-stage search: how many candidates dense retrieval
# hands over, and how many passages survive reranking.
RETRIEVAL_TOP_K = 15
RERANK_TOP_K = 3

# Model identifiers: the sentence-embedding model used for retrieval and the
# pairwise scoring model used for reranking.
BI_ENCODER_MODEL = "intfloat/multilingual-e5-large"
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def load_vector_database():
    """Load the stored corpus arrays and instantiate both ranking models.

    Returns:
        A ``(bi_encoder, cross_encoder, texts, doc_embedding)`` tuple: the
        ``SentenceTransformer`` and ``CrossEncoder`` built from the
        module-level model names, followed by the arrays read from
        ``texts.npy`` and ``embeddings.npy``.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # that is only safe while these files are produced by this project.
    vectors = np.load("src/VectorDatabase/embeddings.npy", allow_pickle=True)
    passages = np.load("src/VectorDatabase/texts.npy", allow_pickle=True)
    return (
        SentenceTransformer(BI_ENCODER_MODEL),
        CrossEncoder(CROSS_ENCODER_MODEL),
        passages,
        vectors,
    )
# Loaded once when the module is imported; the search functions below read
# these module-level objects instead of reloading them per call.
(bi_encoder, cross_encoder, texts, doc_embedding) = load_vector_database()
def retrieve_candidates(
    queries: list[str],
    top_k: int = RETRIEVAL_TOP_K,
    rrf_k: int = 30,
) -> list[int]:
    """Retrieve document indices for several queries via reciprocal-rank fusion.

    Each query is embedded with the bi-encoder and its ``top_k`` most similar
    documents are ranked by cosine similarity. A document's fused score is
    the sum of ``1 / (rrf_k + rank)`` over the queries that retrieved it,
    where ``rank`` is the 0-based position in that query's ranking.

    Args:
        queries: Query strings; each is prefixed with ``"query: "`` before
            encoding.
        top_k: Per-query ranking depth and the maximum number of indices
            returned.
        rrf_k: Smoothing constant of the fusion formula.

    Returns:
        Up to ``top_k`` document indices as plain ``int`` values, highest
        fused score first. Empty when ``queries`` is empty.
    """
    if not queries:
        return []

    prefixed = [f"query: {q}" for q in queries]
    query_embeddings = bi_encoder.encode(prefixed, normalize_embeddings=True)

    # One similarity call for all queries, so the document matrix is
    # processed once instead of once per query.
    similarity = cosine_similarity(query_embeddings, doc_embedding)

    combined_scores: dict[int, float] = {}
    for scores in similarity:
        indices = np.argsort(scores)[::-1][:top_k]
        for rank, idx in enumerate(indices):
            # Store plain ints so the result matches the declared return
            # type rather than leaking NumPy integer scalars.
            doc_id = int(idx)
            combined_scores[doc_id] = (
                combined_scores.get(doc_id, 0.0) + 1 / (rrf_k + rank)
            )

    sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
    return sorted_indices[:top_k]
def rerank(question: str, candidate_indices: list[int], top_k: int = RERANK_TOP_K) -> list[str]:
    """Re-score candidate passages against ``question`` with the cross-encoder.

    Args:
        question: Text paired with every candidate passage for scoring.
        candidate_indices: Positions into the module-level ``texts`` array.
        top_k: Maximum number of passages to return.

    Returns:
        Up to ``top_k`` passages, highest cross-encoder score first. Empty
        when there are no candidates.
    """
    # len() rather than truthiness so a NumPy index array is accepted too;
    # nothing to score means the model is not called with an empty batch.
    if len(candidate_indices) == 0:
        return []

    candidates = [texts[idx] for idx in candidate_indices]
    pairs = [(question, text) for text in candidates]
    scores = cross_encoder.predict(pairs)
    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [text for text, _ in ranked[:top_k]]
def hybrid_search(query: str, question: str, top_k: int = RERANK_TOP_K) -> list[str]:
    """Run retrieval for both strings, then rerank against ``question``.

    Args:
        query: First search string.
        question: Second search string; also the text the reranker scores
            every candidate against.
        top_k: Number of passages to return after reranking.

    Returns:
        Up to ``top_k`` passages ordered by reranker score.
    """
    # dict.fromkeys drops a duplicate while keeping argument order. A set of
    # strings iterates in hash-dependent order, and the fusion step breaks
    # score ties by insertion order, so the candidate list could differ
    # between interpreter runs.
    all_queries = list(dict.fromkeys((query, question)))
    candidate_indices = retrieve_candidates(all_queries, top_k=RETRIEVAL_TOP_K)
    return rerank(question, candidate_indices, top_k=top_k)
def retrieve(query: str, question: str, top_k: int = 5):
    """Rank documents separately for ``query`` and for ``question``.

    Both strings are prefixed with ``"query: "``, embedded with the
    bi-encoder, and compared to the document embeddings by cosine similarity.

    Args:
        query: First search string.
        question: Second search string.
        top_k: Number of document indices kept per string.

    Returns:
        A pair ``(query_indices, question_indices)``; each holds the indices
        of the ``top_k`` most similar documents, most similar first.
    """
    encoded = bi_encoder.encode(
        [f"query: {text}" for text in [query, question]],
        normalize_embeddings=True,
    )
    rankings = [
        np.argsort(cosine_similarity(vector.reshape(1, -1), doc_embedding)[0])[::-1][:top_k]
        for vector in encoded
    ]
    return rankings[0], rankings[1]