File size: 2,171 Bytes
f780124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Test the full retrieval pipeline: hybrid search + re-ranking + diversity.
Compare it against pure dense search to show the improvement.
"""

import time
from src.utils.logger import setup_logger, get_logger
from src.retrieval.retrieval_pipeline import RetrievalPipeline
from src.vectorstore.qdrant_store import QdrantStore
from src.embeddings.embedding_model import EmbeddingModel

# Initialise logging at import time through the project helper, then create
# this module's logger.
# NOTE(review): what setup_logger() actually configures lives in
# src.utils.logger and is not visible from this file.
setup_logger()
logger = get_logger(__name__)


def _format_score(value, spec: str = "") -> str:
    """Render a score with ``spec``, falling back to text for non-numbers.

    ``None`` (missing score) becomes ``"N/A"``; any value that does not
    support ``spec`` (e.g. a string given a float format) is returned via
    ``str()`` instead of raising.
    """
    if value is None:
        return "N/A"
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


def test_pipeline(pipeline: "RetrievalPipeline", query: str):
    """Run one query through ``pipeline`` and print the timed, ranked results.

    This is a manual smoke-test helper for the script, not a pytest test.

    Args:
        pipeline: Object exposing ``retrieve(query, top_k_final=...)`` that
            returns a sized iterable of dict-like results. The keys read
            here are ``ce_score``, ``rrf_score``, ``title`` and ``text``;
            all of them are optional.
        query: Free-text search query.

    Returns:
        None. Everything is written to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"QUERY: {query}")
    print(banner)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    results = pipeline.retrieve(query, top_k_final=5)
    elapsed = time.perf_counter() - start

    print(f"Retrieved {len(results)} results in {elapsed:.2f}s\n")

    for rank, hit in enumerate(results, start=1):
        # A missing or non-numeric score must not crash the report: applying
        # '.4f' directly to the 'N/A' placeholder raises ValueError.
        ce_score = _format_score(hit.get("ce_score"))
        rrf_score = _format_score(hit.get("rrf_score"), ".4f")
        title = (hit.get("title") or "")[:65]
        # Truncate first, then flatten newlines so the snippet stays on one line.
        snippet = (hit.get("text") or "")[:120].replace("\n", " ")

        print(f"[{rank}] CE Score: {ce_score:>7} | RRF: {rrf_score}")
        print(f"     {title}...")
        print(f"     {snippet}...")
        print()


# The name starts with "test_", so pytest would otherwise try to collect this
# helper and fail looking for "pipeline"/"query" fixtures.
test_pipeline.__test__ = False


def main():
    """Run the retrieval smoke tests and print their results.

    Builds a ``RetrievalPipeline`` with its default constructor arguments,
    sends three unfiltered queries through ``test_pipeline``, then runs one
    query restricted with ``filter_year_gte`` and prints a compact listing
    of the hits.
    """
    logger.info("Initializing full retrieval pipeline...")
    pipeline = RetrievalPipeline()

    sample_queries = (
        # Test 1: Conceptual query
        "how does self-attention mechanism work in transformers",
        # Test 2: Specific method query - tests BM25 keyword advantage
        "LoRA low-rank adaptation fine-tuning",
        # Test 3: Comparison query
        "reinforcement learning reward shaping techniques",
    )
    for query in sample_queries:
        test_pipeline(pipeline, query)

    # Test 4: With year filter
    banner = "=" * 60
    print(f"\n{banner}")
    # The filter is "greater than or equal", so the label says
    # "and later" rather than "only".
    print("FILTERED: 'graph neural networks' (2026 and later)")
    print(banner)

    results = pipeline.retrieve(
        "graph neural networks",
        filter_year_gte=2026,
        top_k_final=3,
    )

    # Make an empty result explicit instead of printing nothing at all.
    if not results:
        print("No results matched the filter.")

    for rank, hit in enumerate(results, start=1):
        print(
            f"[{rank}] {hit.get('published_date', 'N/A')} | "
            f"CE: {hit.get('ce_score', 'N/A'):>6} | "
            f"{hit.get('title', '')[:55]}..."
        )

    logger.info("\n✅ Retrieval pipeline test complete")


# Run the smoke tests only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()