File size: 5,507 Bytes
e1624f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
SOTA RAG Pipeline — Integration Test Suite.

Tests the full multi-stage retrieval pipeline:
  1. Bi-Encoder recall from ChromaDB
  2. Distance Gate filtering
  3. Cross-Encoder Re-ranking
  4. Token Trimming
  5. Collection stats
"""

import sys
import os

# Ensure project root is on path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from rag_engine.retriever import OncoRAGRetriever


def test_standard_query(
    query: str = "What is the recommended treatment for advanced HCC?",
) -> None:
    """
    Run one clinical question through the retriever and print each stage.

    Prints the collection stats, then every returned hit (source, page,
    header, bi-encoder distance, cross-encoder score and a 250-character
    excerpt), and finally the first 500 characters of the context string
    produced by ``format_context_for_llm``. Nothing is asserted; when the
    retriever returns no hits a notice is printed and the function returns.

    Args:
        query: A clinical question to search for in the guidelines.
    """
    banner = "=" * 70
    print(banner)
    print("πŸ§ͺ TEST 1: Standard SOTA Query Pipeline")
    print(banner)

    rag = OncoRAGRetriever()
    info = rag.get_collection_stats()
    print(f"\nπŸ“Š Collection: {info['name']} | Docs: {info['count']}")
    print(f"   Distance Threshold: {info['distance_threshold']}")
    print(f"   Context Budget: {info['max_context_chars']} chars")

    print(f"\n❓ Query: '{query}'")
    hits = rag.query(query, n_results=5, use_reranking=True)

    # Guard clause: an empty hit list is reported, not treated as an error.
    if not hits:
        print("\n⚠️  No results passed the distance gate!")
        print("   β†’ This means the query is likely outside guideline coverage.")
        print("   β†’ Anti-Hallucination policy: 'InformaciΓ³n no concluyente'")
        return

    print(f"\nπŸ” {len(hits)} results passed all stages:\n")
    for rank, hit in enumerate(hits, start=1):
        # Scores are optional keys; fall back to a placeholder when absent.
        rerank_score = hit.get("cross_encoder_score", "N/A")
        recall_distance = hit.get("bi_encoder_distance", "N/A")
        print(f"--- Result {rank} ---")
        print(f"  πŸ“„ Source: {hit['source']} (Page: {hit['page']})")
        print(f"  🏷️  Section: {hit['header']}")
        print(f"  πŸ“ Bi-Encoder Distance: {recall_distance}")
        print(f"  🎯 Cross-Encoder Score: {rerank_score}")
        print(f"  πŸ“ Excerpt: {hit['text'][:250]}...")
        print()

    # Preview of the context string that would be handed to the LLM.
    context = rag.format_context_for_llm(hits)
    print(f"\nπŸ“‹ Formatted LLM Context ({len(context)} chars):")
    print("-" * 50)
    if len(context) > 500:
        print(context[:500] + "...")
    else:
        print(context)


def test_distance_gate() -> None:
    """
    Check that an off-topic query yields no results from the retriever.

    A question about the common cold is sent to the oncology retriever.
    An empty result list is reported as PASS; otherwise a warning is
    printed together with the distance and header of every hit that got
    through. Nothing is asserted.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("πŸ§ͺ TEST 2: Distance Gate (Anti-Hallucination)")
    print(banner)

    rag = OncoRAGRetriever()

    off_topic = "How to treat a common cold with chicken soup"
    print(f"\n❓ Irrelevant Query: '{off_topic}'")

    leaked = rag.query(off_topic, use_reranking=True)

    # Anything returned here slipped past the gate: list it and stop.
    if leaked:
        print(f"⚠️  WARN β€” {len(leaked)} results passed (may need tighter threshold)")
        for hit in leaked:
            print(f"   Distance: {hit.get('bi_encoder_distance', '?')} | {hit['header']}")
        return

    print("βœ… PASS β€” Distance gate correctly rejected all results!")
    print("   β†’ Anti-hallucination defense is working.")


def test_cross_encoder_reranking() -> None:
    """
    Compare result order with and without cross-encoder re-ranking.

    The same query is run twice (``use_reranking=False`` then ``True``),
    both rankings are printed, and the two sequences of ``header`` values
    are compared. A differing order is reported as PASS, an identical one
    as INFO. Nothing is asserted.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("πŸ§ͺ TEST 3: Cross-Encoder Re-Ranking Effect")
    print(banner)

    rag = OncoRAGRetriever()

    question = "EGFR mutation non-small cell lung cancer targeted therapy"
    print(f"\n❓ Query: '{question}'")

    # Baseline first (bi-encoder order), then the re-ranked run.
    baseline = rag.query(question, n_results=5, use_reranking=False)
    reranked = rag.query(question, n_results=5, use_reranking=True)

    print("\nπŸ“Š Bi-Encoder Order (no re-rank):")
    for pos, hit in enumerate(baseline, start=1):
        print(f"  {pos}. [{hit.get('bi_encoder_distance', '?')}] {hit['header'][:60]}")

    print("\nπŸ“Š After Cross-Encoder Re-Rank:")
    for pos, hit in enumerate(reranked, start=1):
        print(f"  {pos}. [score={hit.get('cross_encoder_score', '?')}] {hit['header'][:60]}")

    # Order is judged by header sequence only.
    order_changed = [hit["header"] for hit in baseline] != [
        hit["header"] for hit in reranked
    ]
    verdict = (
        "\nβœ… PASS β€” Re-ranking changed the order (precision improvement)."
        if order_changed
        else "\n ℹ️  INFO β€” Same order (bi-encoder was already optimal for this query)."
    )
    print(verdict)


def test_token_trimming(budget: int = 2000) -> None:
    """
    Verify that the total context stays within the character budget.

    A retriever is built with ``max_context_chars=budget``, a query is run
    for up to 10 results, and the summed length of the returned ``text``
    fields is compared against the same budget. The outcome is printed as
    PASS or WARN; nothing is asserted.

    Args:
        budget: Character budget handed to the retriever and used as the
            pass/warn limit. Defaults to 2000 (a deliberately tight value),
            so calling the function with no arguments behaves as before.
    """
    print("\n" + "=" * 70)
    print("πŸ§ͺ TEST 4: Token Trimming (Context Budget)")
    print("=" * 70)

    # One source of truth for the budget: it configures the retriever and
    # drives both the report line and the comparison below.
    retriever = OncoRAGRetriever(max_context_chars=budget)

    query = "Breast cancer treatment recommendations"
    results = retriever.query(query, n_results=10)

    total_chars = sum(len(r["text"]) for r in results)
    print(f"\n   Budget: {budget} chars")
    print(f"   Actual: {total_chars} chars in {len(results)} results")

    if total_chars <= budget:
        print("βœ… PASS β€” Context fits within budget.")
    else:
        print("⚠️  WARN β€” Context exceeds budget!")


if __name__ == "__main__":
    # Run every stage check in pipeline order; each prints its own report.
    for run_check in (
        test_standard_query,
        test_distance_gate,
        test_cross_encoder_reranking,
        test_token_trimming,
    ):
        run_check()

    closing_rule = "=" * 70
    print("\n" + closing_rule)
    print("🏁 All SOTA RAG tests completed.")
    print(closing_rule)