import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder
 

# Model name handed to SentenceTransformer in load_vector_database(); the
# resulting encoder embeds the queries for first-stage retrieval.
BI_ENCODER_MODEL  = "intfloat/multilingual-e5-large"  
# Model name handed to CrossEncoder in load_vector_database(); the resulting
# model scores (question, text) pairs in rerank().
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 
# Default number of candidates kept by retrieve_candidates(), and the value
# hybrid_search() always passes to it.
RETRIEVAL_TOP_K = 15   
# Default number of texts returned by rerank() and hybrid_search().
RERANK_TOP_K    = 3    
 
 
def load_vector_database(db_dir: str = "src/VectorDatabase"):
    """Load the stored document vectors, their texts, and both encoder models.

    Args:
        db_dir: Directory that contains ``embeddings.npy`` and ``texts.npy``.
            The default is the location that used to be hard-coded, so
            existing zero-argument calls behave as before.

    Returns:
        A tuple ``(bi_encoder, cross_encoder, texts, doc_embedding)`` where
        ``texts`` and ``doc_embedding`` are the arrays read from disk.

    Raises:
        OSError: Propagated from ``np.load`` when a file is missing or
            cannot be read.
    """
    from pathlib import Path

    base = Path(db_dir)

    # SECURITY NOTE(review): allow_pickle=True makes np.load unpickle object
    # arrays, and unpickling can execute arbitrary code. Only point db_dir at
    # files this project produced itself. If embeddings.npy is a plain numeric
    # array it can probably be loaded with allow_pickle=False -- confirm
    # against the script that writes these files before changing it.
    doc_embedding = np.load(base / "embeddings.npy", allow_pickle=True)
    texts = np.load(base / "texts.npy", allow_pickle=True)

    bi_encoder = SentenceTransformer(BI_ENCODER_MODEL)
    cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
    return bi_encoder, cross_encoder, texts, doc_embedding
 
# Load the models and the stored arrays once, at import time; the search
# functions in this module read these module-level names directly.
bi_encoder, cross_encoder, texts, doc_embedding = load_vector_database()

def retrieve_candidates(
    queries: list[str],
    top_k: int = RETRIEVAL_TOP_K,
    rrf_k: int = 30,
) -> list[int]:
    """Retrieve document indices for several queries and fuse the rankings.

    Each query is embedded with the bi-encoder and compared against every row
    of ``doc_embedding`` by cosine similarity. The per-query rankings are then
    merged with reciprocal rank fusion: a document at 0-based position
    ``rank`` in a query's list contributes ``1 / (rrf_k + rank)`` to its
    combined score.

    Args:
        queries: Query strings; each one is prefixed with ``"query: "`` before
            encoding.
        top_k: Number of documents taken from each query's ranking, and also
            the maximum number of fused results returned.
        rrf_k: Smoothing constant of the fusion formula. Must be positive.

    Returns:
        Row indices into ``doc_embedding`` as plain ``int`` values, ordered
        from highest to lowest combined score. Ties keep the order in which
        the documents were first encountered (query order, then rank).

    Raises:
        ValueError: If ``rrf_k`` is not positive.
    """
    if rrf_k <= 0:
        raise ValueError("rrf_k must be positive")

    prefixed = [f"query: {q}" for q in queries]
    if not prefixed:
        # Nothing to search for; skip the encoder entirely.
        return []

    query_embeddings = bi_encoder.encode(prefixed, normalize_embeddings=True)

    # Score all queries in one call so the document matrix is processed once
    # rather than once per query. Row i holds the similarities for query i.
    score_matrix = cosine_similarity(query_embeddings, doc_embedding)

    combined_scores: dict[int, float] = {}
    for scores in score_matrix:
        indices = np.argsort(scores)[::-1][:top_k]
        for rank, idx in enumerate(indices):
            # np.argsort yields numpy integers; store builtin ints so the
            # result matches the annotated return type.
            doc_id = int(idx)
            combined_scores[doc_id] = combined_scores.get(doc_id, 0.0) + 1 / (rrf_k + rank)

    sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
    return sorted_indices[:top_k]

def rerank(question: str, candidate_indices: list[int], top_k: int = RERANK_TOP_K) -> list[str]:
    """Re-order candidate texts by cross-encoder relevance to ``question``.

    Args:
        question: Text paired with every candidate for scoring.
        candidate_indices: Indices into the module-level ``texts`` array.
        top_k: Maximum number of texts to return.

    Returns:
        Up to ``top_k`` candidate texts, highest cross-encoder score first.
        An empty list when there are no candidates.
    """
    candidates = [texts[idx] for idx in candidate_indices]
    if not candidates:
        # Avoid handing the cross-encoder an empty batch, which it is not
        # guaranteed to accept; there is nothing to rank anyway.
        return []

    pairs = [(question, text) for text in candidates]
    scores = cross_encoder.predict(pairs)

    # sorted() is stable, so equally scored texts keep their candidate order.
    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [text for text, _ in ranked[:top_k]]

def hybrid_search(query: str, question: str, top_k: int = RERANK_TOP_K) -> list[str]:
    """Retrieve candidates for ``query`` and ``question``, then rerank them.

    Args:
        query: First search string used for candidate retrieval.
        question: Second search string used for retrieval, and the text the
            candidates are reranked against.
        top_k: Maximum number of texts to return after reranking.

    Returns:
        Up to ``top_k`` texts ordered by their reranking score for
        ``question``.
    """
    # De-duplicate while keeping a fixed order. A set of strings iterates in
    # an order that can differ between interpreter runs (hash randomisation),
    # and the order of the queries decides how retrieve_candidates breaks
    # ties, so a set made the results non-reproducible.
    all_queries = list(dict.fromkeys((query, question)))

    candidate_indices = retrieve_candidates(all_queries, top_k=RETRIEVAL_TOP_K)
    return rerank(question, candidate_indices, top_k=top_k)
 
 
def retrieve(query: str, question: str, top_k: int = 5):
    """Rank documents separately for ``query`` and for ``question``.

    Both strings are prefixed with ``"query: "``, embedded with the
    bi-encoder, and compared against ``doc_embedding`` by cosine similarity.

    Args:
        query: First search string.
        question: Second search string.
        top_k: Number of document indices kept per search string.

    Returns:
        A pair ``(query_indices, question_indices)``; each is an array of
        indices into ``doc_embedding`` ordered from most to least similar.
    """

    def _best_matches(vector):
        # Similarity of this one embedding to every stored document.
        similarities = cosine_similarity(vector.reshape(1, -1), doc_embedding)[0]
        return np.argsort(similarities)[::-1][:top_k]

    encoded = bi_encoder.encode(
        [f"query: {text}" for text in [query, question]],
        normalize_embeddings=True,
    )
    per_input = [_best_matches(vector) for vector in encoded]
    return per_input[0], per_input[1]