File size: 2,718 Bytes
fe617ac
 
6ad997d
fe617ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import time
import pandas as pd
from src.core.rag.vector_db import VectorDB

def _extract_isbn(page_content):
    """Return the ISBN token found in a retrieved document's text.

    Prefers the first whitespace-delimited token after an ``ISBN:`` marker;
    otherwise falls back to the text before the first space of the stripped
    content.
    """
    marker = "ISBN:"
    if marker in page_content:
        tokens = page_content.split(marker)[1].split()
        # A marker with nothing after it must not abort the whole run.
        if tokens:
            return tokens[0]
    return page_content.strip().split(' ', 1)[0]


def _display_title(isbn_map, isbn, max_len=30):
    """Return a printable title for *isbn*, truncated to *max_len* characters.

    Falls back to ``ISBN:<isbn>`` when the ISBN is unknown or the stored
    title is not a string (e.g. a NaN from an empty CSV cell).
    """
    title = isbn_map.get(isbn)
    if not isinstance(title, str):
        title = f"ISBN:{isbn}"
    if len(title) > max_len:
        title = title[:max_len - 3] + "..."
    return title


def run_benchmark():
    """Run the reranked hybrid-search benchmark and save the results.

    For each test query this calls ``VectorDB.hybrid_search`` with ``k=5``
    and ``rerank=True``, records the call latency in milliseconds, resolves
    each returned document to a book title via ``data/books_processed.csv``,
    and writes one row per query to ``experiments/03_rerank_results.csv``.
    The results table is also printed to stdout.

    Loading the title mapping is best-effort: if the CSV cannot be read,
    results are labelled with their ISBN instead of a title.
    """
    # Local import so this function stays self-contained.
    from pathlib import Path

    print("🚀 Starting Reranked Retrieval Benchmark...")

    # Load Title Mapping
    try:
        # Read everything as text: numeric inference would drop leading zeros
        # from ISBN-10 values and turn columns with gaps into floats.
        books_df = pd.read_csv("data/books_processed.csv", dtype=str)
        isbn_column = 'isbn13' if 'isbn13' in books_df.columns else 'isbn'
        books_df['isbn'] = books_df[isbn_column].astype(str)
        isbn_map = books_df.set_index('isbn')['title'].to_dict()
    except Exception as e:
        # Deliberate best-effort: the benchmark is still useful without titles.
        print(f"⚠️ Failed to load books_processed.csv: {e}")
        isbn_map = {}

    db = VectorDB()

    # Same Test Cases
    test_queries = [
        # 1. Semantic (Reranker should bubble up best Semantic matches)
        {"type": "Semantic", "query": "books about finding love in unexpected places"},
        # Complex mood query
        {"type": "Complex", "query": "a dark sci-fi thriller with a female protagonist"},

        # 2. Keyword/Proper Noun (Reranker should confirm these are relevant)
        {"type": "Keyword", "query": "Harry Potter"},
        {"type": "Keyword", "query": "Jane Austen"},

        # 3. Exact Match (Should still work)
        {"type": "Exact", "query": "0060959479"},
    ]

    results = []

    for case in test_queries:
        q = case["query"]
        print(f"\nScanning: '{q}' ({case['type']})...")

        # perf_counter is monotonic and high-resolution, unlike the wall clock.
        start_time = time.perf_counter()
        # USE HYBRID WITH RERANK
        docs = db.hybrid_search(q, k=5, rerank=True)
        duration = (time.perf_counter() - start_time) * 1000

        # Capture results with scores
        top_results = []
        for doc in docs:
            isbn = _extract_isbn(doc.page_content)
            title = _display_title(isbn_map, isbn)
            score = doc.metadata.get("relevance_score", 0.0)
            top_results.append(f"{title} ({score:.4f})")

        print(f"  -> Found: {top_results}")
        results.append({
            "query": q,
            "type": case["type"],
            "latency_ms": round(duration, 2),
            "top_results": top_results,
        })

    # Save
    df = pd.DataFrame(results)
    path = "experiments/03_rerank_results.csv"
    # Create the output directory so a fresh checkout does not lose the run.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"\n💾 Results saved to {path}")

    print("\n## Reranked Search Results")
    print(df.to_string(index=False))

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_benchmark()