File size: 2,353 Bytes
2068d15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import numpy as np
from src.retrieval.retriever import Retriever
from src.retrieval.reranker import HybridReranker
from src.evaluation.metrics import (
    precision_at_k as retrieval_precision_at_k,
    recall_at_k,
    mean_reciprocal_rank,
    bleu_score
)
from src.ingestion.document_loader import load_documents_from_dir
from src.ingestion.preprocessor import preprocess_documents
from src.ingestion.text_splitter import split_text


def run_evaluation(
    benchmark_path: str = "tests/benchmark.json",
    k: int = 3,
    top_k_dense: int = 10,
    top_k_final: int = 3,
    sparse_alpha: float = 0.5
):
    """Run the retrieval + rerank benchmark and report per-query metrics.

    Loads benchmark queries from *benchmark_path*, rebuilds the chunk corpus
    from ``data/raw``, retrieves and reranks each query with a
    ``HybridReranker``, prints Precision@k / Recall@k per query and the
    overall MRR, and returns the collected metrics.

    Args:
        benchmark_path: Path to a JSON list of entries, each with a
            ``query`` key and optionally ``relevant_idxs``.
        k: Cutoff used for Precision@k and Recall@k.
        top_k_dense: Number of candidates fetched by the dense retriever
            before reranking.
        top_k_final: Number of results kept after reranking.
        sparse_alpha: Weight of the sparse component in the hybrid reranker.

    Returns:
        dict: ``{"per_query": [...], "mrr": float}`` so callers can consume
        the results programmatically. (Previously the function returned
        ``None``; callers that ignore the return value are unaffected.)
    """
    with open(benchmark_path, encoding="utf-8") as f:
        benchmarks = json.load(f)

    # Rebuild the chunk corpus the same way it was built at indexing time,
    # so chunk indices line up with the stored embeddings file below.
    docs = load_documents_from_dir("data/raw")
    clean_docs = preprocess_documents(docs)
    chunks = split_text(clean_docs, chunk_size=300, chunk_overlap=50)
    texts = [chunk['content'] for chunk in chunks]

    retriever = Retriever("data/embeddings/batch_000.npy")
    reranker = HybridReranker(
        retriever=retriever,
        chunk_texts=texts,
        reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
        sparse_alpha=sparse_alpha
    )

    all_retrieved = []
    all_relevant = []
    per_query = []  # collected metrics, returned for programmatic use

    print(f"starting assessment reranker: Precision@{k}, Recall@{k}, MRR")
    print(f"dense top_k: {top_k_dense}, final top_k: {top_k_final}, sparse_alpha: {sparse_alpha}\n")

    for i, entry in enumerate(benchmarks, 1):
        query = entry['query']
        # Entries without ground-truth labels still run; metrics fall to 0.
        relevant_idxs = entry.get('relevant_idxs', [])

        idxs, scores = reranker.retrieve_and_rerank(
            query,
            top_k_dense=top_k_dense,
            top_k_final=top_k_final
        )

        p = retrieval_precision_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
        r = recall_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)

        all_retrieved.append(idxs)
        all_relevant.append(relevant_idxs)
        per_query.append({
            "query": query,
            "precision": p,
            "recall": r,
            "retrieved_idxs": idxs,
            "scores": scores,
        })

        print(f"{i}. Query: {query}")
        print(f"   Precision@{k}: {p:.2f}, Recall@{k}: {r:.2f}")
        print(f"   Retrieved idxs: {idxs}")
        print(f"   Rerank scores: {[f'{s:.4f}' for s in scores]}\n")

    mrr = mean_reciprocal_rank(retrieved_lists=all_retrieved, relevant_idxs_list=all_relevant)
    print(f"mean reciprocal rank (MRR): {mrr:.2f}\n")

    return {"per_query": per_query, "mrr": mrr}

if __name__ == "__main__":
    # Script entry point: run the benchmark with the default paths/parameters.
    run_evaluation()