Sync backend Docker context from GitHub main
Browse files- Dockerfile +2 -1
- backend/routes/predict.py +2 -1
- backend/routes/predict_stream.py +4 -1
- backend/services/startup.py +0 -1
- config.yaml +3 -5
- data/__init__.py +0 -0
- data/vector_db.py +0 -245
- main.py +304 -204
- requirements.txt +3 -0
- retriever/evaluator.py +5 -4
- retriever/generator.py +12 -11
- retriever/processor.py +60 -2
- retriever/retriever.py +136 -62
- test.py +153 -0
Dockerfile
CHANGED
|
@@ -17,7 +17,8 @@ RUN pip install --upgrade pip && pip install -r requirements.txt
|
|
| 17 |
|
| 18 |
COPY . .
|
| 19 |
|
| 20 |
-
# Fail fast during build if critical runtime
|
|
|
|
| 21 |
RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
|
| 22 |
|
| 23 |
# Hugging Face Spaces exposes apps on port 7860 by default.
|
|
|
|
| 17 |
|
| 18 |
COPY . .
|
| 19 |
|
| 20 |
+
# Fail fast during build if critical runtime code is missing from context.
|
| 21 |
+
# changed this cuz we no longer have monorep
|
| 22 |
RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
|
| 23 |
|
| 24 |
# Hugging Face Spaces exposes apps on port 7860 by default.
|
backend/routes/predict.py
CHANGED
|
@@ -39,7 +39,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
|
|
| 39 |
model_resolve_time = time.perf_counter() - model_resolve_start
|
| 40 |
|
| 41 |
retrieval_start = time.perf_counter()
|
| 42 |
-
contexts = retriever.search(
|
| 43 |
query,
|
| 44 |
index,
|
| 45 |
chunking_technique=payload.chunking_technique,
|
|
@@ -72,6 +72,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
|
|
| 72 |
f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
|
| 73 |
f"chunking={payload.chunking_technique} | "
|
| 74 |
f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
|
|
|
|
| 75 |
f"precheck={precheck_time:.3f}s | "
|
| 76 |
f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
|
| 77 |
f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
|
|
|
|
| 39 |
model_resolve_time = time.perf_counter() - model_resolve_start
|
| 40 |
|
| 41 |
retrieval_start = time.perf_counter()
|
| 42 |
+
contexts, chunk_score = retriever.search(
|
| 43 |
query,
|
| 44 |
index,
|
| 45 |
chunking_technique=payload.chunking_technique,
|
|
|
|
| 72 |
f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
|
| 73 |
f"chunking={payload.chunking_technique} | "
|
| 74 |
f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
|
| 75 |
+
f"chunk_score={chunk_score:.4f} | "
|
| 76 |
f"precheck={precheck_time:.3f}s | "
|
| 77 |
f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
|
| 78 |
f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
|
backend/routes/predict_stream.py
CHANGED
|
@@ -47,7 +47,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
|
|
| 47 |
model_resolve_time = time.perf_counter() - model_resolve_start
|
| 48 |
|
| 49 |
retrieval_start = time.perf_counter()
|
| 50 |
-
contexts = retriever.search(
|
| 51 |
query,
|
| 52 |
index,
|
| 53 |
chunking_technique=payload.chunking_technique,
|
|
@@ -80,6 +80,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
|
|
| 80 |
"requested_top_k": payload.top_k,
|
| 81 |
"requested_final_k": payload.final_k,
|
| 82 |
"returned_context_count": len(contexts),
|
|
|
|
| 83 |
"use_mmr": payload.use_mmr,
|
| 84 |
"lambda_param": payload.lambda_param,
|
| 85 |
},
|
|
@@ -114,6 +115,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
|
|
| 114 |
"requested_top_k": payload.top_k,
|
| 115 |
"requested_final_k": payload.final_k,
|
| 116 |
"returned_context_count": len(contexts),
|
|
|
|
| 117 |
"use_mmr": payload.use_mmr,
|
| 118 |
"lambda_param": payload.lambda_param,
|
| 119 |
},
|
|
@@ -127,6 +129,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
|
|
| 127 |
f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
|
| 128 |
f"chunking={payload.chunking_technique} | "
|
| 129 |
f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
|
|
|
|
| 130 |
f"precheck={precheck_time:.3f}s | "
|
| 131 |
f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
|
| 132 |
f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
|
|
|
|
| 47 |
model_resolve_time = time.perf_counter() - model_resolve_start
|
| 48 |
|
| 49 |
retrieval_start = time.perf_counter()
|
| 50 |
+
contexts, chunk_score = retriever.search(
|
| 51 |
query,
|
| 52 |
index,
|
| 53 |
chunking_technique=payload.chunking_technique,
|
|
|
|
| 80 |
"requested_top_k": payload.top_k,
|
| 81 |
"requested_final_k": payload.final_k,
|
| 82 |
"returned_context_count": len(contexts),
|
| 83 |
+
"chunk_score": chunk_score,
|
| 84 |
"use_mmr": payload.use_mmr,
|
| 85 |
"lambda_param": payload.lambda_param,
|
| 86 |
},
|
|
|
|
| 115 |
"requested_top_k": payload.top_k,
|
| 116 |
"requested_final_k": payload.final_k,
|
| 117 |
"returned_context_count": len(contexts),
|
| 118 |
+
"chunk_score": chunk_score,
|
| 119 |
"use_mmr": payload.use_mmr,
|
| 120 |
"lambda_param": payload.lambda_param,
|
| 121 |
},
|
|
|
|
| 129 |
f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
|
| 130 |
f"chunking={payload.chunking_technique} | "
|
| 131 |
f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
|
| 132 |
+
f"chunk_score={chunk_score:.4f} | "
|
| 133 |
f"precheck={precheck_time:.3f}s | "
|
| 134 |
f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
|
| 135 |
f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
|
backend/services/startup.py
CHANGED
|
@@ -67,7 +67,6 @@ def initialize_runtime_state(state: dict[str, Any]) -> None:
|
|
| 67 |
|
| 68 |
retriever_start = time.perf_counter()
|
| 69 |
retriever = HybridRetriever(
|
| 70 |
-
final_chunks,
|
| 71 |
proc.encoder,
|
| 72 |
rerank_model_name=rerank_model_name,
|
| 73 |
verbose=False,
|
|
|
|
| 67 |
|
| 68 |
retriever_start = time.perf_counter()
|
| 69 |
retriever = HybridRetriever(
|
|
|
|
| 70 |
proc.encoder,
|
| 71 |
rerank_model_name=rerank_model_name,
|
| 72 |
verbose=False,
|
config.yaml
CHANGED
|
@@ -27,20 +27,18 @@ retrieval:
|
|
| 27 |
mode: "hybrid"
|
| 28 |
# Options: cross-encoder, rrf
|
| 29 |
rerank_strategy: "cross-encoder"
|
| 30 |
-
use_mmr:
|
| 31 |
-
top_k:
|
| 32 |
final_k: 5
|
| 33 |
|
| 34 |
generation:
|
| 35 |
temperature: 0.
|
| 36 |
max_new_tokens: 512
|
| 37 |
# The model used to Judge the others (OpenRouter)
|
| 38 |
-
judge_model: "
|
| 39 |
|
| 40 |
# List of contestants in the tournament
|
| 41 |
models:
|
| 42 |
- "Llama-3-8B"
|
| 43 |
- "Mistral-7B"
|
| 44 |
-
- "Qwen-2.5"
|
| 45 |
-
- "DeepSeek-V3"
|
| 46 |
- "TinyAya"
|
|
|
|
| 27 |
mode: "hybrid"
|
| 28 |
# Options: cross-encoder, rrf
|
| 29 |
rerank_strategy: "cross-encoder"
|
| 30 |
+
use_mmr: False
|
| 31 |
+
top_k: 50
|
| 32 |
final_k: 5
|
| 33 |
|
| 34 |
generation:
|
| 35 |
temperature: 0.
|
| 36 |
max_new_tokens: 512
|
| 37 |
# The model used to Judge the others (OpenRouter)
|
| 38 |
+
judge_model: "deepseek/deepseek-v3.2"
|
| 39 |
|
| 40 |
# List of contestants in the tournament
|
| 41 |
models:
|
| 42 |
- "Llama-3-8B"
|
| 43 |
- "Mistral-7B"
|
|
|
|
|
|
|
| 44 |
- "TinyAya"
|
data/__init__.py
DELETED
|
File without changes
|
data/vector_db.py
DELETED
|
@@ -1,245 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
import re
|
| 3 |
-
import json
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Any, Dict, List
|
| 6 |
-
from pinecone import Pinecone, ServerlessSpec
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
# Added cacheing to reduce consecutive startup time
|
| 10 |
-
# --@Qamar
|
| 11 |
-
|
| 12 |
-
def slugify_technique(name):
|
| 13 |
-
"""Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
|
| 14 |
-
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
| 15 |
-
|
| 16 |
-
def get_index_by_name(api_key: str, index_name: str):
|
| 17 |
-
"""
|
| 18 |
-
Directly connects to a Pinecone index by its full string name.
|
| 19 |
-
Useful for the API/Production side where the name is already known.
|
| 20 |
-
"""
|
| 21 |
-
pc = Pinecone(api_key=api_key)
|
| 22 |
-
|
| 23 |
-
# Check if it exists first to avoid a 404 crash
|
| 24 |
-
existing_indexes = [idx.name for idx in pc.list_indexes()]
|
| 25 |
-
if index_name not in existing_indexes:
|
| 26 |
-
raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")
|
| 27 |
-
|
| 28 |
-
print(f" Connecting to Index: {index_name}")
|
| 29 |
-
return pc.Index(index_name)
|
| 30 |
-
|
| 31 |
-
def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
|
| 32 |
-
"""
|
| 33 |
-
Creates/Returns an index specifically for a technique.
|
| 34 |
-
Example: 'arxiv-index-token'
|
| 35 |
-
"""
|
| 36 |
-
pc = Pinecone(api_key=api_key)
|
| 37 |
-
tech_slug = slugify_technique(technique)
|
| 38 |
-
full_index_name = f"{base_name}-{tech_slug}"
|
| 39 |
-
|
| 40 |
-
existing_indexes = [idx.name for idx in pc.list_indexes()]
|
| 41 |
-
|
| 42 |
-
if full_index_name not in existing_indexes:
|
| 43 |
-
print(f" Creating specialized index: {full_index_name}...")
|
| 44 |
-
pc.create_index(
|
| 45 |
-
name=full_index_name,
|
| 46 |
-
dimension=dimension,
|
| 47 |
-
metric=metric,
|
| 48 |
-
spec=ServerlessSpec(cloud="aws", region="us-east-1")
|
| 49 |
-
)
|
| 50 |
-
# Wait for index to spin up
|
| 51 |
-
while not pc.describe_index(full_index_name).status['ready']:
|
| 52 |
-
time.sleep(1)
|
| 53 |
-
|
| 54 |
-
# Use our new helper to return the index object
|
| 55 |
-
return get_index_by_name(api_key, full_index_name)
|
| 56 |
-
|
| 57 |
-
def refresh_pinecone_index(index, final_chunks, batch_size=100):
|
| 58 |
-
"""
|
| 59 |
-
Refreshes the specific index. Since index is now technique-specific,
|
| 60 |
-
we just check if it's already populated.
|
| 61 |
-
"""
|
| 62 |
-
if not final_chunks:
|
| 63 |
-
print("No chunks provided to refresh.")
|
| 64 |
-
return False
|
| 65 |
-
|
| 66 |
-
try:
|
| 67 |
-
# Check current stats for this specific index
|
| 68 |
-
stats = index.describe_index_stats()
|
| 69 |
-
current_count = stats.get('total_vector_count', 0)
|
| 70 |
-
expected_count = len(final_chunks)
|
| 71 |
-
|
| 72 |
-
print(f" Index Stats -> Existing: {current_count} | New Chunks: {expected_count}")
|
| 73 |
-
|
| 74 |
-
if current_count == 0:
|
| 75 |
-
print(f"➕ Index is empty. Upserting {expected_count} vectors...")
|
| 76 |
-
vectors = prepare_vectors_for_upsert(final_chunks)
|
| 77 |
-
upsert_to_pinecone(index, vectors, batch_size)
|
| 78 |
-
return True
|
| 79 |
-
|
| 80 |
-
elif current_count < expected_count:
|
| 81 |
-
# Simple check to see if we need to top up or refresh
|
| 82 |
-
print(f" Vector count mismatch ({current_count} < {expected_count}). Updating index...")
|
| 83 |
-
vectors = prepare_vectors_for_upsert(final_chunks)
|
| 84 |
-
upsert_to_pinecone(index, vectors, batch_size)
|
| 85 |
-
return True
|
| 86 |
-
|
| 87 |
-
else:
|
| 88 |
-
print(f" Index is already populated with {current_count} vectors. Ready for search.")
|
| 89 |
-
return False
|
| 90 |
-
|
| 91 |
-
except Exception as e:
|
| 92 |
-
print(f" Error refreshing index: {e}")
|
| 93 |
-
return False
|
| 94 |
-
|
| 95 |
-
# Utility functions remain the same as previous version
|
| 96 |
-
def prepare_vectors_for_upsert(final_chunks):
|
| 97 |
-
vectors = []
|
| 98 |
-
for chunk in final_chunks:
|
| 99 |
-
meta = chunk.get('metadata', {})
|
| 100 |
-
metadata_payload = dict(meta) if isinstance(meta, dict) else {}
|
| 101 |
-
metadata_payload.setdefault('text', meta.get('text', "") if isinstance(meta, dict) else "")
|
| 102 |
-
metadata_payload.setdefault('title', meta.get('title', "") if isinstance(meta, dict) else "")
|
| 103 |
-
metadata_payload.setdefault('url', meta.get('url', "") if isinstance(meta, dict) else "")
|
| 104 |
-
metadata_payload.setdefault('chunk_index', meta.get('chunk_index', 0) if isinstance(meta, dict) else 0)
|
| 105 |
-
metadata_payload.setdefault('technique', meta.get('technique', "unknown") if isinstance(meta, dict) else "unknown")
|
| 106 |
-
metadata_payload.setdefault('chunking_technique', meta.get('chunking_technique', "unknown") if isinstance(meta, dict) else "unknown")
|
| 107 |
-
|
| 108 |
-
vectors.append({
|
| 109 |
-
'id': chunk['id'],
|
| 110 |
-
'values': chunk['values'],
|
| 111 |
-
'metadata': metadata_payload
|
| 112 |
-
})
|
| 113 |
-
return vectors
|
| 114 |
-
|
| 115 |
-
def upsert_to_pinecone(index, chunks, batch_size=100):
|
| 116 |
-
for i in range(0, len(chunks), batch_size):
|
| 117 |
-
batch = chunks[i : i + batch_size]
|
| 118 |
-
index.upsert(vectors=batch)
|
| 119 |
-
|
| 120 |
-
# Some methods for loading chunks back from Pinecone with local caching to speed up BM25 initialization
|
| 121 |
-
|
| 122 |
-
def _sanitize_index_name(index_name: str) -> str:
|
| 123 |
-
return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
|
| 127 |
-
cache_root = Path(cache_dir)
|
| 128 |
-
cache_root.mkdir(parents=True, exist_ok=True)
|
| 129 |
-
safe_name = _sanitize_index_name(index_name)
|
| 130 |
-
return cache_root / f"bm25_chunks_{safe_name}.json"
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
def _read_chunk_cache(path: Path) -> Dict[str, Any]:
|
| 134 |
-
with path.open("r", encoding="utf-8") as f:
|
| 135 |
-
return json.load(f)
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
|
| 139 |
-
with path.open("w", encoding="utf-8") as f:
|
| 140 |
-
json.dump(payload, f)
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
def load_chunks_with_local_cache(
|
| 144 |
-
index,
|
| 145 |
-
index_name: str,
|
| 146 |
-
cache_dir: str = ".cache",
|
| 147 |
-
batch_size: int = 100,
|
| 148 |
-
force_refresh: bool = False,
|
| 149 |
-
) -> tuple[List[Dict[str, Any]], str]:
|
| 150 |
-
|
| 151 |
-
cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
|
| 152 |
-
stats = index.describe_index_stats()
|
| 153 |
-
current_count = stats.get("total_vector_count", 0)
|
| 154 |
-
|
| 155 |
-
if not force_refresh and cache_file.exists():
|
| 156 |
-
try:
|
| 157 |
-
cached_payload = _read_chunk_cache(cache_file)
|
| 158 |
-
cached_meta = cached_payload.get("meta", {})
|
| 159 |
-
cached_count = cached_meta.get("vector_count", -1)
|
| 160 |
-
cached_chunks = cached_payload.get("chunks", [])
|
| 161 |
-
|
| 162 |
-
if cached_count == current_count and cached_chunks:
|
| 163 |
-
print(
|
| 164 |
-
f" Loaded BM25 chunk cache: {cache_file} "
|
| 165 |
-
f"(chunks={len(cached_chunks)}, vectors={cached_count})"
|
| 166 |
-
)
|
| 167 |
-
return cached_chunks, "cache"
|
| 168 |
-
|
| 169 |
-
print(
|
| 170 |
-
" BM25 cache stale or empty. "
|
| 171 |
-
f"cache_vectors={cached_count}, pinecone_vectors={current_count}. Refreshing..."
|
| 172 |
-
)
|
| 173 |
-
except Exception as e:
|
| 174 |
-
print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")
|
| 175 |
-
|
| 176 |
-
chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
|
| 177 |
-
payload = {
|
| 178 |
-
"meta": {
|
| 179 |
-
"index_name": index_name,
|
| 180 |
-
"vector_count": current_count,
|
| 181 |
-
"updated_at_epoch_s": int(time.time()),
|
| 182 |
-
},
|
| 183 |
-
"chunks": chunks,
|
| 184 |
-
}
|
| 185 |
-
|
| 186 |
-
try:
|
| 187 |
-
_write_chunk_cache(cache_file, payload)
|
| 188 |
-
print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
|
| 189 |
-
except Exception as e:
|
| 190 |
-
print(f" Failed to write BM25 cache ({cache_file}): {e}")
|
| 191 |
-
|
| 192 |
-
return chunks, "pinecone"
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, any]]:
|
| 196 |
-
"""
|
| 197 |
-
Scans the Pinecone index to retrieve all text metadata for the BM25 corpus.
|
| 198 |
-
"""
|
| 199 |
-
stats = index.describe_index_stats()
|
| 200 |
-
namespaces = list(stats.get('namespaces', {}).keys())
|
| 201 |
-
# If no namespaces are explicitly named, Pinecone uses an empty string for the default
|
| 202 |
-
if not namespaces:
|
| 203 |
-
namespaces = [""]
|
| 204 |
-
|
| 205 |
-
all_chunks: List[Dict[str, Any]] = []
|
| 206 |
-
seen_ids = set()
|
| 207 |
-
|
| 208 |
-
print(f"Loading vectors for BM25 from namespaces: {namespaces}")
|
| 209 |
-
|
| 210 |
-
for ns in namespaces:
|
| 211 |
-
# Pinecone's list() generator returns batches of IDs
|
| 212 |
-
for id_batch in index.list(namespace=ns, limit=batch_size):
|
| 213 |
-
if not id_batch:
|
| 214 |
-
continue
|
| 215 |
-
|
| 216 |
-
# Fetch the actual content (metadata) for this batch of IDs
|
| 217 |
-
fetched = index.fetch(ids=id_batch, namespace=ns)
|
| 218 |
-
vectors = getattr(fetched, "vectors", {})
|
| 219 |
-
|
| 220 |
-
for vector_id, vector_data in vectors.items():
|
| 221 |
-
if vector_id in seen_ids:
|
| 222 |
-
continue
|
| 223 |
-
seen_ids.add(vector_id)
|
| 224 |
-
|
| 225 |
-
# Safely extract metadata
|
| 226 |
-
metadata = getattr(vector_data, "metadata", {})
|
| 227 |
-
if metadata is None:
|
| 228 |
-
metadata = {}
|
| 229 |
-
if not isinstance(metadata, dict):
|
| 230 |
-
metadata = dict(metadata)
|
| 231 |
-
|
| 232 |
-
text = metadata.get("text")
|
| 233 |
-
|
| 234 |
-
if not text:
|
| 235 |
-
continue
|
| 236 |
-
|
| 237 |
-
all_chunks.append({
|
| 238 |
-
"id": vector_id,
|
| 239 |
-
"metadata": metadata
|
| 240 |
-
})
|
| 241 |
-
|
| 242 |
-
print(f" Finished namespace: '{ns if ns else 'default'}'")
|
| 243 |
-
|
| 244 |
-
print(f"Total chunks loaded into memory: {len(all_chunks)}")
|
| 245 |
-
return all_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
|
@@ -2,8 +2,6 @@ import os
|
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
from datetime import datetime
|
| 5 |
-
from multiprocessing import Pool, cpu_count
|
| 6 |
-
from functools import partial
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from config_loader import cfg
|
| 9 |
|
|
@@ -18,63 +16,47 @@ from data.ingest import ingest_data, CHUNKING_TECHNIQUES
|
|
| 18 |
# Import model fleet
|
| 19 |
from models.llama_3_8b import Llama3_8B
|
| 20 |
from models.mistral_7b import Mistral_7b
|
| 21 |
-
from models.qwen_2_5 import Qwen2_5
|
| 22 |
-
from models.deepseek_v3 import DeepSeek_V3
|
| 23 |
from models.tiny_aya import TinyAya
|
| 24 |
|
| 25 |
MODEL_MAP = {
|
| 26 |
"Llama-3-8B": Llama3_8B,
|
| 27 |
"Mistral-7B": Mistral_7b,
|
| 28 |
-
"Qwen-2.5": Qwen2_5,
|
| 29 |
-
"DeepSeek-V3": DeepSeek_V3,
|
| 30 |
"TinyAya": TinyAya
|
| 31 |
}
|
| 32 |
|
| 33 |
load_dotenv()
|
| 34 |
|
| 35 |
|
| 36 |
-
def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine):
|
| 37 |
-
"""Run RAG pipeline for a specific chunking technique."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
print(f"\n{'='*80}")
|
| 40 |
-
print(f"TECHNIQUE: {technique_name.upper()}")
|
| 41 |
print(f"{'='*80}")
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
all_candidates = []
|
| 56 |
-
chunk_urls = []
|
| 57 |
-
for match in res['matches']:
|
| 58 |
-
all_candidates.append(match['metadata']['text'])
|
| 59 |
-
chunk_urls.append(match['metadata'].get('url', ''))
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
if not all_candidates:
|
| 64 |
print(f"WARNING: No chunks found for technique '{technique_name}'")
|
| 65 |
return {}
|
| 66 |
|
| 67 |
-
# Apply cross-encoder reranking to get top 5
|
| 68 |
-
# Use global reranker loaded once per worker
|
| 69 |
-
global _worker_reranker
|
| 70 |
-
pairs = [[query, chunk] for chunk in all_candidates]
|
| 71 |
-
scores = _worker_reranker.predict(pairs)
|
| 72 |
-
ranked = sorted(zip(all_candidates, chunk_urls, scores), key=lambda x: x[2], reverse=True)
|
| 73 |
-
context_chunks = [chunk for chunk, _, _ in ranked[:5]]
|
| 74 |
-
context_urls = [url for _, url, _ in ranked[:5]]
|
| 75 |
-
|
| 76 |
-
print(f"After reranking: {len(context_chunks)} chunks (top 5)")
|
| 77 |
-
|
| 78 |
# Print the final RAG context being passed to models (only once)
|
| 79 |
print(f"\n{'='*80}")
|
| 80 |
print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
|
|
@@ -88,6 +70,8 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 88 |
|
| 89 |
# Run model tournament for this technique
|
| 90 |
tournament_results = {}
|
|
|
|
|
|
|
| 91 |
|
| 92 |
for name, model_inst in models.items():
|
| 93 |
print(f"\n{'-'*60}")
|
|
@@ -97,7 +81,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 97 |
# Generation
|
| 98 |
answer = rag_engine.get_answer(
|
| 99 |
model_inst, query, context_chunks,
|
| 100 |
-
context_urls=context_urls,
|
| 101 |
temperature=cfg.gen['temperature']
|
| 102 |
)
|
| 103 |
|
|
@@ -118,7 +101,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 118 |
"Relevancy": rel['score'],
|
| 119 |
"Claims": faith['details'],
|
| 120 |
"context_chunks": context_chunks,
|
| 121 |
-
"context_urls": context_urls
|
| 122 |
}
|
| 123 |
|
| 124 |
print(f"\n📊 EVALUATION SCORES:")
|
|
@@ -135,7 +117,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 135 |
"Claims": [],
|
| 136 |
"error": str(e),
|
| 137 |
"context_chunks": context_chunks,
|
| 138 |
-
"context_urls": context_urls
|
| 139 |
}
|
| 140 |
|
| 141 |
return tournament_results
|
|
@@ -185,13 +166,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 185 |
|
| 186 |
# Aggregate results across all queries
|
| 187 |
aggregated_results = {}
|
|
|
|
| 188 |
|
| 189 |
for query_idx, query_results in all_query_results.items():
|
| 190 |
for technique_name, model_results in query_results.items():
|
| 191 |
if technique_name not in aggregated_results:
|
| 192 |
aggregated_results[technique_name] = {}
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
for model_name, results in model_results.items():
|
|
|
|
|
|
|
|
|
|
| 195 |
if model_name not in aggregated_results[technique_name]:
|
| 196 |
aggregated_results[technique_name][model_name] = {
|
| 197 |
'Faithfulness': [],
|
|
@@ -213,6 +202,15 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 213 |
content += "No results available for this technique.\n\n"
|
| 214 |
continue
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
# Create results table with averaged scores
|
| 217 |
content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
|
| 218 |
content += "|-------|------------------|---------------|--------------|\n"
|
|
@@ -348,8 +346,8 @@ Based on the ablation study results:
|
|
| 348 |
- *Embedding Model:* Jina embeddings (512 dimensions)
|
| 349 |
- *Vector Database:* Pinecone (serverless, AWS us-east-1)
|
| 350 |
- *Judge Model:* Openrouter Free models
|
| 351 |
-
- *Retrieval:* Top
|
| 352 |
-
- *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing)
|
| 353 |
|
| 354 |
---
|
| 355 |
|
|
@@ -364,81 +362,100 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
|
|
| 364 |
return output_file
|
| 365 |
|
| 366 |
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
"
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
"Mistral-7B": Mistral_7b,
|
| 391 |
-
"Qwen-2.5": Qwen2_5,
|
| 392 |
-
"DeepSeek-V3": DeepSeek_V3,
|
| 393 |
-
"TinyAya": TinyAya
|
| 394 |
-
}
|
| 395 |
-
|
| 396 |
-
# Load embedding model once
|
| 397 |
-
_worker_proc = ChunkProcessor(model_name=model_name, verbose=False)
|
| 398 |
-
|
| 399 |
-
# Initialize evaluator
|
| 400 |
-
_worker_evaluator = RAGEvaluator(
|
| 401 |
-
judge_model=evaluator_config['judge_model'],
|
| 402 |
-
embedding_model=_worker_proc.encoder,
|
| 403 |
-
api_key=evaluator_config['api_key']
|
| 404 |
)
|
| 405 |
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
|
|
|
|
|
|
| 409 |
|
| 410 |
-
#
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
|
| 421 |
-
|
| 422 |
-
try:
|
| 423 |
-
# Create new connection in worker process
|
| 424 |
-
from data.vector_db import get_index_by_name
|
| 425 |
-
index = get_index_by_name(pinecone_key, index_name)
|
| 426 |
-
|
| 427 |
-
return technique['name'], run_rag_for_technique(
|
| 428 |
-
technique_name=technique['name'],
|
| 429 |
-
query=query,
|
| 430 |
-
index=index,
|
| 431 |
-
encoder=_worker_proc.encoder,
|
| 432 |
-
models=_worker_models,
|
| 433 |
-
evaluator=_worker_evaluator,
|
| 434 |
-
rag_engine=_worker_rag_engine
|
| 435 |
-
)
|
| 436 |
-
except Exception as e:
|
| 437 |
-
import traceback
|
| 438 |
-
print(f"\n✗ Error processing technique {technique['name']}: {e}")
|
| 439 |
-
print(f"Full traceback:")
|
| 440 |
-
traceback.print_exc()
|
| 441 |
-
return technique['name'], {}
|
| 442 |
|
| 443 |
|
| 444 |
def main():
|
|
@@ -458,8 +475,9 @@ def main():
|
|
| 458 |
# Test queries
|
| 459 |
test_queries = [
|
| 460 |
"What is cognitive behavior therapy and how does it work?",
|
| 461 |
-
"
|
| 462 |
-
"
|
|
|
|
| 463 |
]
|
| 464 |
|
| 465 |
print("=" * 80)
|
|
@@ -478,122 +496,186 @@ def main():
|
|
| 478 |
from data.vector_db import get_index_by_name
|
| 479 |
index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
|
| 480 |
|
| 481 |
-
print(f"\
|
| 482 |
|
| 483 |
try:
|
| 484 |
# Try to connect to existing index
|
| 485 |
-
print("Connecting to Pinecone...")
|
| 486 |
existing_index = get_index_by_name(pinecone_key, index_name)
|
| 487 |
-
print("Getting index stats...")
|
| 488 |
stats = existing_index.describe_index_stats()
|
| 489 |
existing_count = stats.get('total_vector_count', 0)
|
| 490 |
|
| 491 |
if existing_count > 0:
|
| 492 |
-
print(f"\n✓ Found existing index with {existing_count} vectors")
|
| 493 |
-
print("Skipping ingestion - using existing data")
|
| 494 |
|
| 495 |
# Initialize processor (this loads the embedding model)
|
| 496 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
from retriever.processor import ChunkProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
|
|
|
|
|
|
|
|
|
|
| 499 |
index = existing_index
|
| 500 |
all_chunks = [] # Empty since we're using existing data
|
| 501 |
final_chunks = []
|
| 502 |
-
print("✓ Processor initialized")
|
| 503 |
else:
|
| 504 |
-
print("\
|
| 505 |
all_chunks, final_chunks, proc, index = ingest_data()
|
| 506 |
except Exception as e:
|
| 507 |
-
print(f"\
|
| 508 |
-
|
|
|
|
|
|
|
| 509 |
all_chunks, final_chunks, proc, index = ingest_data()
|
| 510 |
|
| 511 |
print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
|
| 512 |
|
| 513 |
-
# Step 2:
|
| 514 |
print("\n" + "=" * 80)
|
| 515 |
-
print("STEP 2:
|
| 516 |
print("=" * 80)
|
|
|
|
|
|
|
| 517 |
|
| 518 |
-
#
|
| 519 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
rag_engine = RAGGenerator()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
print("Initializing evaluator...")
|
| 525 |
-
if not openrouter_key:
|
| 526 |
-
raise RuntimeError("OPENROUTER_API_KEY not found in environment variables")
|
| 527 |
|
|
|
|
|
|
|
| 528 |
evaluator = RAGEvaluator(
|
| 529 |
judge_model=cfg.gen['judge_model'],
|
| 530 |
embedding_model=proc.encoder,
|
| 531 |
api_key=openrouter_key
|
| 532 |
)
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
print("
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
'model_list': cfg.model_list
|
| 548 |
-
}
|
| 549 |
|
| 550 |
all_query_results = {}
|
| 551 |
|
| 552 |
for query_idx, query in enumerate(test_queries):
|
| 553 |
print(f"\n{'='*80}")
|
| 554 |
-
print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
|
| 555 |
-
print(f"Query: {query}")
|
| 556 |
print(f"{'='*80}")
|
|
|
|
|
|
|
| 557 |
|
| 558 |
-
|
| 559 |
-
processes=num_processes,
|
| 560 |
-
initializer=init_worker,
|
| 561 |
-
initargs=(cfg.processing['embedding_model'], evaluator_config)
|
| 562 |
-
) as pool:
|
| 563 |
-
args_list = [
|
| 564 |
-
(technique, query, index_name, pinecone_key)
|
| 565 |
-
for technique in CHUNKING_TECHNIQUES
|
| 566 |
-
]
|
| 567 |
-
results_list = pool.map(run_rag_for_technique_wrapper, args_list)
|
| 568 |
-
|
| 569 |
-
# Convert results to dictionary and store
|
| 570 |
-
query_results = {name: results for name, results in results_list}
|
| 571 |
-
all_query_results[query_idx] = query_results
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
# Print quick summary for this query
|
| 574 |
print(f"\n{'='*80}")
|
| 575 |
print(f"QUERY {query_idx + 1} SUMMARY")
|
| 576 |
print(f"{'='*80}")
|
| 577 |
-
print(f"\n{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
|
| 578 |
-
print("-" *
|
| 579 |
|
| 580 |
-
for
|
| 581 |
if model_results:
|
| 582 |
-
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
# Find best model
|
| 586 |
best_model = max(
|
| 587 |
-
|
| 588 |
key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
|
| 589 |
)
|
| 590 |
best_name = best_model[0]
|
| 591 |
|
| 592 |
-
print(f"{
|
| 593 |
else:
|
| 594 |
-
print(f"{
|
| 595 |
|
| 596 |
-
print("-" *
|
| 597 |
|
| 598 |
# Step 4: Generate findings document from all queries
|
| 599 |
print("\n" + "=" * 80)
|
|
@@ -608,45 +690,63 @@ def main():
|
|
| 608 |
print("=" * 80)
|
| 609 |
|
| 610 |
print(f"\nQueries processed: {len(test_queries)}")
|
| 611 |
-
print(f"Techniques evaluated: {len(
|
| 612 |
print(f"Models tested: {len(cfg.model_list)}")
|
| 613 |
print(f"\nFindings document: {findings_file}")
|
| 614 |
|
| 615 |
# Print final summary across all queries
|
| 616 |
-
print("\n" + "-" *
|
| 617 |
-
print(f"{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
|
| 618 |
-
print("-" *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
|
| 620 |
-
# Calculate averages across all queries
|
| 621 |
-
for tech_config in
|
| 622 |
tech_name = tech_config['name']
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
all_rel.append(rel)
|
| 637 |
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
|
| 649 |
-
print("-" *
|
| 650 |
|
| 651 |
print("\n✓ Ablation study complete!")
|
| 652 |
print(f"✓ Results saved to: {findings_file}")
|
|
|
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
from datetime import datetime
|
|
|
|
|
|
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
from config_loader import cfg
|
| 7 |
|
|
|
|
| 16 |
# Import model fleet
|
| 17 |
from models.llama_3_8b import Llama3_8B
|
| 18 |
from models.mistral_7b import Mistral_7b
|
|
|
|
|
|
|
| 19 |
from models.tiny_aya import TinyAya
|
| 20 |
|
| 21 |
MODEL_MAP = {
|
| 22 |
"Llama-3-8B": Llama3_8B,
|
| 23 |
"Mistral-7B": Mistral_7b,
|
|
|
|
|
|
|
| 24 |
"TinyAya": TinyAya
|
| 25 |
}
|
| 26 |
|
| 27 |
load_dotenv()
|
| 28 |
|
| 29 |
|
| 30 |
+
def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
|
| 31 |
+
"""Run RAG pipeline for a specific chunking technique and retrieval strategy."""
|
| 32 |
+
|
| 33 |
+
mode = retrieval_strategy['mode']
|
| 34 |
+
use_mmr = retrieval_strategy['use_mmr']
|
| 35 |
+
strategy_label = retrieval_strategy['label']
|
| 36 |
|
| 37 |
print(f"\n{'='*80}")
|
| 38 |
+
print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
|
| 39 |
print(f"{'='*80}")
|
| 40 |
|
| 41 |
+
# Use HybridRetriever to retrieve chunks
|
| 42 |
+
context_chunks, chunk_score = retriever.search(
|
| 43 |
+
query=query,
|
| 44 |
+
index=index,
|
| 45 |
+
mode=mode,
|
| 46 |
+
rerank_strategy="cross-encoder",
|
| 47 |
+
use_mmr=use_mmr,
|
| 48 |
+
top_k=50,
|
| 49 |
+
final_k=5,
|
| 50 |
+
technique_name=technique_name,
|
| 51 |
+
verbose=False
|
| 52 |
)
|
| 53 |
|
| 54 |
+
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
if not context_chunks:
|
|
|
|
|
|
|
| 57 |
print(f"WARNING: No chunks found for technique '{technique_name}'")
|
| 58 |
return {}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# Print the final RAG context being passed to models (only once)
|
| 61 |
print(f"\n{'='*80}")
|
| 62 |
print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
|
|
|
|
| 70 |
|
| 71 |
# Run model tournament for this technique
|
| 72 |
tournament_results = {}
|
| 73 |
+
tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
|
| 74 |
+
tournament_results["_Strategy"] = strategy_label
|
| 75 |
|
| 76 |
for name, model_inst in models.items():
|
| 77 |
print(f"\n{'-'*60}")
|
|
|
|
| 81 |
# Generation
|
| 82 |
answer = rag_engine.get_answer(
|
| 83 |
model_inst, query, context_chunks,
|
|
|
|
| 84 |
temperature=cfg.gen['temperature']
|
| 85 |
)
|
| 86 |
|
|
|
|
| 101 |
"Relevancy": rel['score'],
|
| 102 |
"Claims": faith['details'],
|
| 103 |
"context_chunks": context_chunks,
|
|
|
|
| 104 |
}
|
| 105 |
|
| 106 |
print(f"\n📊 EVALUATION SCORES:")
|
|
|
|
| 117 |
"Claims": [],
|
| 118 |
"error": str(e),
|
| 119 |
"context_chunks": context_chunks,
|
|
|
|
| 120 |
}
|
| 121 |
|
| 122 |
return tournament_results
|
|
|
|
| 166 |
|
| 167 |
# Aggregate results across all queries
|
| 168 |
aggregated_results = {}
|
| 169 |
+
chunk_scores_by_query_technique = {} # Store ChunkScore per query+technique
|
| 170 |
|
| 171 |
for query_idx, query_results in all_query_results.items():
|
| 172 |
for technique_name, model_results in query_results.items():
|
| 173 |
if technique_name not in aggregated_results:
|
| 174 |
aggregated_results[technique_name] = {}
|
| 175 |
|
| 176 |
+
# Extract ChunkScore (stored at technique level, not per model)
|
| 177 |
+
chunk_score = model_results.get('_ChunkScore', 0)
|
| 178 |
+
chunk_scores_by_query_technique[(query_idx, technique_name)] = chunk_score
|
| 179 |
+
|
| 180 |
for model_name, results in model_results.items():
|
| 181 |
+
if model_name.startswith('_'):
|
| 182 |
+
continue # Skip metadata keys like _ChunkScore
|
| 183 |
+
|
| 184 |
if model_name not in aggregated_results[technique_name]:
|
| 185 |
aggregated_results[technique_name][model_name] = {
|
| 186 |
'Faithfulness': [],
|
|
|
|
| 202 |
content += "No results available for this technique.\n\n"
|
| 203 |
continue
|
| 204 |
|
| 205 |
+
# Show ChunkScore per query for this technique
|
| 206 |
+
content += "#### Chunk Retrieval Scores (ChunkScore)\n\n"
|
| 207 |
+
content += "| Query | Avg ChunkScore |\n"
|
| 208 |
+
content += "|-------|---------------|\n"
|
| 209 |
+
for q_idx in range(len(queries)):
|
| 210 |
+
score = chunk_scores_by_query_technique.get((q_idx, technique_name), 0)
|
| 211 |
+
content += f"| {q_idx + 1} | {score:.4f} |\n"
|
| 212 |
+
content += "\n"
|
| 213 |
+
|
| 214 |
# Create results table with averaged scores
|
| 215 |
content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
|
| 216 |
content += "|-------|------------------|---------------|--------------|\n"
|
|
|
|
| 346 |
- *Embedding Model:* Jina embeddings (512 dimensions)
|
| 347 |
- *Vector Database:* Pinecone (serverless, AWS us-east-1)
|
| 348 |
- *Judge Model:* Openrouter Free models
|
| 349 |
+
- *Retrieval:* Top 4 chunks per technique
|
| 350 |
+
- *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing), ChunkScore (reranker confidence)
|
| 351 |
|
| 352 |
---
|
| 353 |
|
|
|
|
| 362 |
return output_file
|
| 363 |
|
| 364 |
|
| 365 |
+
def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
|
| 366 |
+
"""Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
|
| 367 |
+
|
| 368 |
+
mode = retrieval_strategy['mode']
|
| 369 |
+
use_mmr = retrieval_strategy['use_mmr']
|
| 370 |
+
strategy_label = retrieval_strategy['label']
|
| 371 |
+
|
| 372 |
+
print(f"\n{'='*80}")
|
| 373 |
+
print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
|
| 374 |
+
print(f"{'='*80}")
|
| 375 |
+
|
| 376 |
+
# Use HybridRetriever to retrieve chunks
|
| 377 |
+
context_chunks, chunk_score = retriever.search(
|
| 378 |
+
query=query,
|
| 379 |
+
index=index,
|
| 380 |
+
mode=mode,
|
| 381 |
+
rerank_strategy="cross-encoder",
|
| 382 |
+
use_mmr=use_mmr,
|
| 383 |
+
top_k=50,
|
| 384 |
+
final_k=5,
|
| 385 |
+
technique_name=technique_name,
|
| 386 |
+
verbose=False,
|
| 387 |
+
test=True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
)
|
| 389 |
|
| 390 |
+
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
| 391 |
+
|
| 392 |
+
if not context_chunks:
|
| 393 |
+
print(f"WARNING: No chunks found for technique '{technique_name}'")
|
| 394 |
+
return {}
|
| 395 |
|
| 396 |
+
# Print the final RAG context being passed to models (only once)
|
| 397 |
+
print(f"\n{'='*80}")
|
| 398 |
+
print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
|
| 399 |
+
print(f"{'='*80}")
|
| 400 |
+
for i, chunk in enumerate(context_chunks, 1):
|
| 401 |
+
print(f"\n[Chunk {i}] ({len(chunk)} chars):")
|
| 402 |
+
print(f"{'─'*60}")
|
| 403 |
+
print(chunk)
|
| 404 |
+
print(f"{'─'*60}")
|
| 405 |
+
print(f"\n{'='*80}")
|
| 406 |
+
|
| 407 |
+
# Run model tournament for this technique
|
| 408 |
+
tournament_results = {}
|
| 409 |
+
tournament_results["_ChunkScore"] = chunk_score
|
| 410 |
+
tournament_results["_Strategy"] = strategy_label
|
| 411 |
|
| 412 |
+
for name, model_inst in models.items():
|
| 413 |
+
print(f"\n{'-'*60}")
|
| 414 |
+
print(f"Model: {name}")
|
| 415 |
+
print(f"{'-'*60}")
|
| 416 |
+
try:
|
| 417 |
+
# Generation
|
| 418 |
+
answer = rag_engine.get_answer(
|
| 419 |
+
model_inst, query, context_chunks,
|
| 420 |
+
temperature=cfg.gen['temperature']
|
| 421 |
+
)
|
| 422 |
+
|
| 423 |
+
print(f"\n{'─'*60}")
|
| 424 |
+
print(f"📝 FULL ANSWER from {name}:")
|
| 425 |
+
print(f"{'─'*60}")
|
| 426 |
+
print(answer)
|
| 427 |
+
print(f"{'─'*60}")
|
| 428 |
+
|
| 429 |
+
# Faithfulness Evaluation (strict=False reduces API calls from ~22 to ~3 per eval)
|
| 430 |
+
faith = evaluator.evaluate_faithfulness(answer, context_chunks, strict=False)
|
| 431 |
+
# Relevancy Evaluation
|
| 432 |
+
rel = evaluator.evaluate_relevancy(query, answer)
|
| 433 |
+
|
| 434 |
+
tournament_results[name] = {
|
| 435 |
+
"answer": answer,
|
| 436 |
+
"Faithfulness": faith['score'],
|
| 437 |
+
"Relevancy": rel['score'],
|
| 438 |
+
"Claims": faith['details'],
|
| 439 |
+
"context_chunks": context_chunks,
|
| 440 |
+
}
|
| 441 |
|
| 442 |
+
print(f"\n📊 EVALUATION SCORES:")
|
| 443 |
+
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 444 |
+
print(f" Relevancy: {rel['score']:.3f}")
|
| 445 |
+
print(f" Combined: {faith['score'] + rel['score']:.3f}")
|
| 446 |
|
| 447 |
+
except Exception as e:
|
| 448 |
+
print(f" Error evaluating {name}: {e}")
|
| 449 |
+
tournament_results[name] = {
|
| 450 |
+
"answer": "",
|
| 451 |
+
"Faithfulness": 0,
|
| 452 |
+
"Relevancy": 0,
|
| 453 |
+
"Claims": [],
|
| 454 |
+
"error": str(e),
|
| 455 |
+
"context_chunks": context_chunks,
|
| 456 |
+
}
|
| 457 |
|
| 458 |
+
return tournament_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
|
| 461 |
def main():
|
|
|
|
| 475 |
# Test queries
|
| 476 |
test_queries = [
|
| 477 |
"What is cognitive behavior therapy and how does it work?",
|
| 478 |
+
"I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
|
| 479 |
+
"No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
|
| 480 |
+
"I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
|
| 481 |
]
|
| 482 |
|
| 483 |
print("=" * 80)
|
|
|
|
| 496 |
from data.vector_db import get_index_by_name
|
| 497 |
index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
|
| 498 |
|
| 499 |
+
print(f"\n[DEBUG] Checking for existing index: {index_name}")
|
| 500 |
|
| 501 |
try:
|
| 502 |
# Try to connect to existing index
|
| 503 |
+
print("[DEBUG] Connecting to Pinecone...")
|
| 504 |
existing_index = get_index_by_name(pinecone_key, index_name)
|
| 505 |
+
print("[DEBUG] Getting index stats...")
|
| 506 |
stats = existing_index.describe_index_stats()
|
| 507 |
existing_count = stats.get('total_vector_count', 0)
|
| 508 |
|
| 509 |
if existing_count > 0:
|
| 510 |
+
print(f"\n[DEBUG] ✓ Found existing index with {existing_count} vectors")
|
| 511 |
+
print("[DEBUG] Skipping ingestion - using existing data")
|
| 512 |
|
| 513 |
# Initialize processor (this loads the embedding model)
|
| 514 |
+
print("[DEBUG] About to load embedding model...")
|
| 515 |
+
print(f"[DEBUG] Model: {cfg.processing['embedding_model']}")
|
| 516 |
+
import sys
|
| 517 |
+
sys.stdout.flush()
|
| 518 |
+
|
| 519 |
from retriever.processor import ChunkProcessor
|
| 520 |
+
print("[DEBUG] ChunkProcessor imported successfully")
|
| 521 |
+
sys.stdout.flush()
|
| 522 |
+
|
| 523 |
+
print("[DEBUG] Creating ChunkProcessor instance...")
|
| 524 |
+
sys.stdout.flush()
|
| 525 |
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
|
| 526 |
+
print("[DEBUG] ChunkProcessor created successfully")
|
| 527 |
+
sys.stdout.flush()
|
| 528 |
+
|
| 529 |
index = existing_index
|
| 530 |
all_chunks = [] # Empty since we're using existing data
|
| 531 |
final_chunks = []
|
| 532 |
+
print("[DEBUG] ✓ Processor initialized")
|
| 533 |
else:
|
| 534 |
+
print("\n[DEBUG] Index exists but is empty. Running full ingestion...")
|
| 535 |
all_chunks, final_chunks, proc, index = ingest_data()
|
| 536 |
except Exception as e:
|
| 537 |
+
print(f"\n[DEBUG] Index check failed: {e}")
|
| 538 |
+
import traceback
|
| 539 |
+
traceback.print_exc()
|
| 540 |
+
print("[DEBUG] Running full ingestion...")
|
| 541 |
all_chunks, final_chunks, proc, index = ingest_data()
|
| 542 |
|
| 543 |
print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
|
| 544 |
|
| 545 |
+
# Step 2: Components will be initialized in Step 3 (shared across all sequential runs)
|
| 546 |
print("\n" + "=" * 80)
|
| 547 |
+
print("[DEBUG] STEP 2: PREPARING FOR SEQUENTIAL EXECUTION")
|
| 548 |
print("=" * 80)
|
| 549 |
+
print(f"[DEBUG] Techniques to evaluate: {[t['name'] for t in CHUNKING_TECHNIQUES]}")
|
| 550 |
+
# print(f"[DEBUG] Filtered techniques: {TECHNIQUES_TO_EVALUATE}")
|
| 551 |
|
| 552 |
+
# Define retrieval strategies to test
|
| 553 |
+
RETRIEVAL_STRATEGIES = [
|
| 554 |
+
{"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
|
| 555 |
+
]
|
| 556 |
+
|
| 557 |
+
# Filter to only 4 techniques to reduce memory usage
|
| 558 |
+
TECHNIQUES_TO_EVALUATE = ["markdown", "recursive", "paragraph"]
|
| 559 |
+
CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
|
| 560 |
+
|
| 561 |
+
# Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
|
| 562 |
+
print("\n" + "=" * 80)
|
| 563 |
+
print(f"STEP 3: RUNNING RAG FOR {len(CHUNKING_TECHNIQUES_FILTERED)} TECHNIQUES x {len(RETRIEVAL_STRATEGIES)} STRATEGIES (SEQUENTIAL)")
|
| 564 |
+
print("=" * 80)
|
| 565 |
+
print(f"\nTechniques: {TECHNIQUES_TO_EVALUATE}")
|
| 566 |
+
print(f"\nRetrieval Strategies:")
|
| 567 |
+
for i, strat in enumerate(RETRIEVAL_STRATEGIES, 1):
|
| 568 |
+
mmr_status = "with MMR" if strat['use_mmr'] else "no MMR"
|
| 569 |
+
print(f" {i}. {strat['label']}: mode={strat['mode']}, {mmr_status}")
|
| 570 |
+
|
| 571 |
+
# Initialize components once (shared across all sequential runs)
|
| 572 |
+
print("\n[DEBUG] Initializing components...")
|
| 573 |
+
import sys
|
| 574 |
+
sys.stdout.flush()
|
| 575 |
+
|
| 576 |
+
print("[DEBUG] Creating RAGGenerator...")
|
| 577 |
+
sys.stdout.flush()
|
| 578 |
rag_engine = RAGGenerator()
|
| 579 |
+
print("[DEBUG] RAGGenerator created")
|
| 580 |
+
sys.stdout.flush()
|
| 581 |
+
|
| 582 |
+
print(f"[DEBUG] Loading models: {cfg.model_list}")
|
| 583 |
+
sys.stdout.flush()
|
| 584 |
models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
|
| 585 |
+
print("[DEBUG] Models loaded successfully")
|
| 586 |
+
sys.stdout.flush()
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
+
print("[DEBUG] Creating RAGEvaluator...")
|
| 589 |
+
sys.stdout.flush()
|
| 590 |
evaluator = RAGEvaluator(
|
| 591 |
judge_model=cfg.gen['judge_model'],
|
| 592 |
embedding_model=proc.encoder,
|
| 593 |
api_key=openrouter_key
|
| 594 |
)
|
| 595 |
+
print("[DEBUG] RAGEvaluator created")
|
| 596 |
+
sys.stdout.flush()
|
| 597 |
+
|
| 598 |
+
print("[DEBUG] Creating HybridRetriever...")
|
| 599 |
+
sys.stdout.flush()
|
| 600 |
+
retriever = HybridRetriever(
|
| 601 |
+
embed_model=proc.encoder,
|
| 602 |
+
rerank_model_name='rerank-2.5',
|
| 603 |
+
verbose=False
|
| 604 |
+
)
|
| 605 |
+
print("[DEBUG] HybridRetriever created")
|
| 606 |
+
sys.stdout.flush()
|
| 607 |
+
|
| 608 |
+
print("[DEBUG] All components initialized successfully.\n")
|
|
|
|
|
|
|
| 609 |
|
| 610 |
all_query_results = {}
|
| 611 |
|
| 612 |
for query_idx, query in enumerate(test_queries):
|
| 613 |
print(f"\n{'='*80}")
|
| 614 |
+
print(f"[DEBUG] PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
|
| 615 |
+
print(f"[DEBUG] Query: {query}")
|
| 616 |
print(f"{'='*80}")
|
| 617 |
+
import sys
|
| 618 |
+
sys.stdout.flush()
|
| 619 |
|
| 620 |
+
query_results = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
+
for technique in CHUNKING_TECHNIQUES_FILTERED:
|
| 623 |
+
for strategy in RETRIEVAL_STRATEGIES:
|
| 624 |
+
result_key = f"{technique['name']}__{strategy['label']}"
|
| 625 |
+
print(f"\n[DEBUG] Processing: {result_key}")
|
| 626 |
+
sys.stdout.flush()
|
| 627 |
+
|
| 628 |
+
try:
|
| 629 |
+
result = run_rag_for_technique_sequential(
|
| 630 |
+
technique_name=technique['name'],
|
| 631 |
+
query=query,
|
| 632 |
+
index=index,
|
| 633 |
+
encoder=proc.encoder,
|
| 634 |
+
models=models,
|
| 635 |
+
evaluator=evaluator,
|
| 636 |
+
rag_engine=rag_engine,
|
| 637 |
+
retriever=retriever,
|
| 638 |
+
retrieval_strategy=strategy
|
| 639 |
+
)
|
| 640 |
+
print(f"[DEBUG] Result for {result_key}: {len(result)} keys")
|
| 641 |
+
query_results[result_key] = result
|
| 642 |
+
except Exception as e:
|
| 643 |
+
import traceback
|
| 644 |
+
print(f"\n[DEBUG] ✗ Error processing {result_key}: {e}")
|
| 645 |
+
traceback.print_exc()
|
| 646 |
+
sys.stdout.flush()
|
| 647 |
+
query_results[result_key] = {}
|
| 648 |
+
|
| 649 |
+
all_query_results[query_idx] = query_results
|
| 650 |
+
|
| 651 |
# Print quick summary for this query
|
| 652 |
print(f"\n{'='*80}")
|
| 653 |
print(f"QUERY {query_idx + 1} SUMMARY")
|
| 654 |
print(f"{'='*80}")
|
| 655 |
+
print(f"\n{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
|
| 656 |
+
print("-" * 92)
|
| 657 |
|
| 658 |
+
for result_key, model_results in query_results.items():
|
| 659 |
if model_results:
|
| 660 |
+
chunk_score = model_results.get('_ChunkScore', 0)
|
| 661 |
+
strategy = model_results.get('_Strategy', '')
|
| 662 |
+
# Exclude _ChunkScore and _Strategy from model averaging
|
| 663 |
+
model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
|
| 664 |
+
avg_faith = sum(r.get('Faithfulness', 0) for r in model_only.values()) / len(model_only) if model_only else 0
|
| 665 |
+
avg_rel = sum(r.get('Relevancy', 0) for r in model_only.values()) / len(model_only) if model_only else 0
|
| 666 |
|
| 667 |
# Find best model
|
| 668 |
best_model = max(
|
| 669 |
+
model_only.items(),
|
| 670 |
key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
|
| 671 |
)
|
| 672 |
best_name = best_model[0]
|
| 673 |
|
| 674 |
+
print(f"{result_key:<15} {strategy:<20} {chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_name:<20}")
|
| 675 |
else:
|
| 676 |
+
print(f"{result_key:<15} {'':<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
|
| 677 |
|
| 678 |
+
print("-" * 92)
|
| 679 |
|
| 680 |
# Step 4: Generate findings document from all queries
|
| 681 |
print("\n" + "=" * 80)
|
|
|
|
| 690 |
print("=" * 80)
|
| 691 |
|
| 692 |
print(f"\nQueries processed: {len(test_queries)}")
|
| 693 |
+
print(f"Techniques evaluated: {len(CHUNKING_TECHNIQUES_FILTERED)} ({TECHNIQUES_TO_EVALUATE})")
|
| 694 |
print(f"Models tested: {len(cfg.model_list)}")
|
| 695 |
print(f"\nFindings document: {findings_file}")
|
| 696 |
|
| 697 |
# Print final summary across all queries
|
| 698 |
+
print("\n" + "-" * 92)
|
| 699 |
+
print(f"{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
|
| 700 |
+
print("-" * 92)
|
| 701 |
+
|
| 702 |
+
# Define retrieval strategies (same as above)
|
| 703 |
+
RETRIEVAL_STRATEGIES = [
|
| 704 |
+
{"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
|
| 705 |
+
]
|
| 706 |
|
| 707 |
+
# Calculate averages across all queries for each technique x strategy
|
| 708 |
+
for tech_config in CHUNKING_TECHNIQUES_FILTERED:
|
| 709 |
tech_name = tech_config['name']
|
| 710 |
+
for strategy in RETRIEVAL_STRATEGIES:
|
| 711 |
+
strategy_label = strategy['label']
|
| 712 |
+
result_key = f"{tech_name}__{strategy_label}"
|
| 713 |
+
|
| 714 |
+
all_faith = []
|
| 715 |
+
all_rel = []
|
| 716 |
+
all_chunk_scores = []
|
| 717 |
+
best_model_name = None
|
| 718 |
+
best_combined = 0
|
| 719 |
+
|
| 720 |
+
for query_idx, query_results in all_query_results.items():
|
| 721 |
+
if result_key in query_results and query_results[result_key]:
|
| 722 |
+
model_results = query_results[result_key]
|
|
|
|
| 723 |
|
| 724 |
+
# Extract ChunkScore
|
| 725 |
+
chunk_score = model_results.get('_ChunkScore', 0)
|
| 726 |
+
all_chunk_scores.append(chunk_score)
|
| 727 |
+
|
| 728 |
+
# Exclude _ChunkScore and _Strategy from model averaging
|
| 729 |
+
model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
|
| 730 |
+
for model_name, results in model_only.items():
|
| 731 |
+
faith = results.get('Faithfulness', 0)
|
| 732 |
+
rel = results.get('Relevancy', 0)
|
| 733 |
+
combined = faith + rel
|
| 734 |
+
all_faith.append(faith)
|
| 735 |
+
all_rel.append(rel)
|
| 736 |
+
|
| 737 |
+
if combined > best_combined:
|
| 738 |
+
best_combined = combined
|
| 739 |
+
best_model_name = model_name
|
| 740 |
+
|
| 741 |
+
if all_faith:
|
| 742 |
+
avg_faith = sum(all_faith) / len(all_faith)
|
| 743 |
+
avg_rel = sum(all_rel) / len(all_rel)
|
| 744 |
+
avg_chunk_score = sum(all_chunk_scores) / len(all_chunk_scores) if all_chunk_scores else 0
|
| 745 |
+
print(f"{tech_name:<15} {strategy_label:<20} {avg_chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_model_name or 'N/A':<20}")
|
| 746 |
+
else:
|
| 747 |
+
print(f"{tech_name:<15} {strategy_label:<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
|
| 748 |
|
| 749 |
+
print("-" * 92)
|
| 750 |
|
| 751 |
print("\n✓ Ablation study complete!")
|
| 752 |
print(f"✓ Results saved to: {findings_file}")
|
requirements.txt
CHANGED
|
@@ -95,3 +95,6 @@ zstandard==0.25.0
|
|
| 95 |
groq==1.1.2
|
| 96 |
jiter==0.13.0
|
| 97 |
openai==2.30.0
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
groq==1.1.2
|
| 96 |
jiter==0.13.0
|
| 97 |
openai==2.30.0
|
| 98 |
+
pinecone-text>=0.11.0
|
| 99 |
+
voyageai==0.3.7
|
| 100 |
+
|
retriever/evaluator.py
CHANGED
|
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 10 |
# ------------------------------------------------------------------
|
| 11 |
|
| 12 |
class GroqJudge:
|
| 13 |
-
def __init__(self, api_key: str, model: str =
|
| 14 |
"""
|
| 15 |
Wraps OpenRouter's chat completions to match the .generate(prompt) interface
|
| 16 |
expected by RAGEvaluator.
|
|
@@ -27,8 +27,6 @@ class GroqJudge:
|
|
| 27 |
|
| 28 |
# Fallback models in order of preference (OpenRouter free models)
|
| 29 |
self.fallback_models = [
|
| 30 |
-
"deepseek/deepseek-v3.2",
|
| 31 |
-
"qwen/qwen3.6-plus-preview:free",
|
| 32 |
"stepfun/step-3.5-flash:free",
|
| 33 |
"nvidia/nemotron-3-super-120b-a12b:free",
|
| 34 |
"z-ai/glm-4.5-air:free",
|
|
@@ -228,7 +226,10 @@ class RAGEvaluator:
|
|
| 228 |
return {"score": 0, "queries": []}
|
| 229 |
|
| 230 |
# --- Step B: Similarity (single batched encode call) ---
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
| 232 |
original_vec = all_vecs[0:1]
|
| 233 |
generated_vecs = all_vecs[1:]
|
| 234 |
|
|
|
|
| 10 |
# ------------------------------------------------------------------
|
| 11 |
|
| 12 |
class GroqJudge:
|
| 13 |
+
def __init__(self, api_key: str, model: str = "stepfun/step-3.5-flash:free"):
|
| 14 |
"""
|
| 15 |
Wraps OpenRouter's chat completions to match the .generate(prompt) interface
|
| 16 |
expected by RAGEvaluator.
|
|
|
|
| 27 |
|
| 28 |
# Fallback models in order of preference (OpenRouter free models)
|
| 29 |
self.fallback_models = [
|
|
|
|
|
|
|
| 30 |
"stepfun/step-3.5-flash:free",
|
| 31 |
"nvidia/nemotron-3-super-120b-a12b:free",
|
| 32 |
"z-ai/glm-4.5-air:free",
|
|
|
|
| 226 |
return {"score": 0, "queries": []}
|
| 227 |
|
| 228 |
# --- Step B: Similarity (single batched encode call) ---
|
| 229 |
+
try:
|
| 230 |
+
all_vecs = self.encoder.encode([query] + gen_queries)
|
| 231 |
+
except AttributeError:
|
| 232 |
+
all_vecs = np.array([self.encoder.encode(text) for text in [query] + gen_queries])
|
| 233 |
original_vec = all_vecs[0:1]
|
| 234 |
generated_vecs = all_vecs[1:]
|
| 235 |
|
retriever/generator.py
CHANGED
|
@@ -8,21 +8,22 @@ class RAGGenerator:
|
|
| 8 |
else:
|
| 9 |
context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
|
| 10 |
|
| 11 |
-
return f"""You are
|
| 12 |
|
| 13 |
INSTRUCTIONS:
|
| 14 |
-
1.
|
| 15 |
-
2.
|
| 16 |
-
3.
|
| 17 |
-
4.
|
| 18 |
-
5.
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
{context_text}
|
| 22 |
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
|
| 27 |
def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
|
| 28 |
"""Uses a specific model instance to generate the final answer."""
|
|
@@ -42,4 +43,4 @@ ACADEMIC ANSWER (WITH CITATIONS):"""
|
|
| 42 |
# Fallback for model wrappers that only expose sync generation.
|
| 43 |
answer = model_instance.generate(prompt, **kwargs)
|
| 44 |
if answer:
|
| 45 |
-
yield answer
|
|
|
|
| 8 |
else:
|
| 9 |
context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
|
| 10 |
|
| 11 |
+
return f"""You are an empathetic Cognitive Behavioral Therapy (CBT) therapist speaking directly to a client. **Your task is to provide a therapeutic, helpful response based ONLY on the provided clinical documents and excerpts**.
|
| 12 |
|
| 13 |
INSTRUCTIONS:
|
| 14 |
+
1. THERAPEUTIC DIALOGUE: Respond directly to the user as your client. Start by briefly validating their feelings, then gently apply CBT concepts, psychoeducation, or interventions found STRICTLY in the provided documents.
|
| 15 |
+
2. PATIENT EXAMPLES & NAMES (CRITICAL): The provided documents contain transcripts and examples of other patients and therapists (e.g., Abe, Judith, Joseph). These are illustrative case studies ONLY. DO NOT assume the user is "Abe" or any other person mentioned in the text. NEVER address or refer to the user by these names. Extract the CBT concepts/techniques demonstrated in these transcripts and apply them to the current user's unique situation.
|
| 16 |
+
3. GROUNDING (NO OPINIONS): Do not give your own opinions, general life advice, or use outside knowledge. Every therapeutic concept, identified cognitive distortion, or suggested exercise must come directly from the provided text.
|
| 17 |
+
4. CITATIONS: You must cite the sources used in your response to show where the clinical guidance comes from (e.g., "It sounds like you might be experiencing what is known as 'all-or-nothing thinking' [Source 1]").
|
| 18 |
+
5. FORMAT: Use clear Markdown formatting. Use paragraphs for conversational tone, and bullet points if you are breaking down specific steps, questions, or exercises found in the text.
|
| 19 |
+
6. MISSING INFO: If the provided excerpts do not contain relevant CBT concepts to address the client's specific statement, explicitly state: "While I hear how difficult this is for you, the clinical materials I have right now do not contain specific steps to address this." Do not invent therapeutic advice.
|
| 20 |
+
|
| 21 |
+
RETRIEVED CLINICAL CONTEXT:
|
| 22 |
{context_text}
|
| 23 |
|
| 24 |
+
CLIENT STATEMENT: {query}
|
| 25 |
|
| 26 |
+
THERAPEUTIC RESPONSE (GROUNDED IN SOURCES):"""
|
| 27 |
|
| 28 |
def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
|
| 29 |
"""Uses a specific model instance to generate the final answer."""
|
|
|
|
| 43 |
# Fallback for model wrappers that only expose sync generation.
|
| 44 |
answer = model_instance.generate(prompt, **kwargs)
|
| 45 |
if answer:
|
| 46 |
+
yield answer
|
retriever/processor.py
CHANGED
|
@@ -88,14 +88,72 @@ class MarkdownTextSplitter:
|
|
| 88 |
|
| 89 |
|
| 90 |
class ChunkProcessor:
|
| 91 |
-
def __init__(self, model_name='
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
self.model_name = model_name
|
|
|
|
| 93 |
self._use_remote_code = self._requires_remote_code(model_name)
|
|
|
|
| 94 |
st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
self.verbose = verbose
|
| 97 |
hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
|
| 98 |
self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
|
|
|
|
| 99 |
|
| 100 |
def _requires_remote_code(self, model_name: str) -> bool:
|
| 101 |
normalized = (model_name or "").strip().lower()
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
class ChunkProcessor:
|
| 91 |
+
def __init__(self, model_name='jinaai/jina-embeddings-v2-small-en', verbose: bool = True, load_hf_embeddings: bool = False):
|
| 92 |
+
import sys
|
| 93 |
+
import os
|
| 94 |
+
|
| 95 |
+
# Set environment variables to limit memory usage BEFORE importing torch
|
| 96 |
+
os.environ["OMP_NUM_THREADS"] = "2"
|
| 97 |
+
os.environ["MKL_NUM_THREADS"] = "2"
|
| 98 |
+
os.environ["OPENBLAS_NUM_THREADS"] = "2"
|
| 99 |
+
|
| 100 |
+
print(f"[DEBUG-ChunkProcessor] Starting init with model: {model_name}", flush=True)
|
| 101 |
self.model_name = model_name
|
| 102 |
+
print(f"[DEBUG-ChunkProcessor] Checking if remote code needed...", flush=True)
|
| 103 |
self._use_remote_code = self._requires_remote_code(model_name)
|
| 104 |
+
print(f"[DEBUG-ChunkProcessor] Remote code needed: {self._use_remote_code}", flush=True)
|
| 105 |
st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
|
| 106 |
+
|
| 107 |
+
# Set torch threads to limit parallelism
|
| 108 |
+
import torch
|
| 109 |
+
torch.set_num_threads(2)
|
| 110 |
+
torch.set_num_interop_threads(2)
|
| 111 |
+
print(f"[DEBUG-ChunkProcessor] Torch threads set to 2", flush=True)
|
| 112 |
+
|
| 113 |
+
print(f"[DEBUG-ChunkProcessor] Loading SentenceTransformer with kwargs: {st_kwargs}", flush=True)
|
| 114 |
+
sys.stdout.flush()
|
| 115 |
+
|
| 116 |
+
# Do not explicitly force cpu if cuda is available, let SentenceTransformer handle it or specify explicit map.
|
| 117 |
+
import torch
|
| 118 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 119 |
+
import numpy as np
|
| 120 |
+
try:
|
| 121 |
+
if self._use_remote_code:
|
| 122 |
+
print("[DEBUG-ChunkProcessor] Using HuggingFaceEmbeddings-based encoder for remote model", flush=True)
|
| 123 |
+
hf_kwargs = {"model_kwargs": {"trust_remote_code": True}}
|
| 124 |
+
hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
|
| 125 |
+
|
| 126 |
+
class HFEncoderShim:
|
| 127 |
+
def __init__(self, hf_client):
|
| 128 |
+
self._hf = hf_client
|
| 129 |
+
def encode(self, text: str):
|
| 130 |
+
vecs = self._hf.embed_documents([text])
|
| 131 |
+
return np.array(vecs[0], dtype=float)
|
| 132 |
+
|
| 133 |
+
self.encoder = HFEncoderShim(hf)
|
| 134 |
+
self.hf_embeddings = hf
|
| 135 |
+
else:
|
| 136 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 137 |
+
self.encoder = SentenceTransformer(model_name, device=device, **st_kwargs)
|
| 138 |
+
print("[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
|
| 139 |
+
except Exception as e:
|
| 140 |
+
print(f"[DEBUG-ChunkProcessor] encoder init failed: {e}. Falling back to HuggingFaceEmbeddings.", flush=True)
|
| 141 |
+
hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
|
| 142 |
+
hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
|
| 143 |
+
class HFEncoderShim:
|
| 144 |
+
def __init__(self, hf_client):
|
| 145 |
+
self._hf = hf_client
|
| 146 |
+
def encode(self, text: str):
|
| 147 |
+
vecs = self._hf.embed_documents([text])
|
| 148 |
+
return np.array(vecs[0], dtype=float)
|
| 149 |
+
self.encoder = HFEncoderShim(hf)
|
| 150 |
+
self.hf_embeddings = hf
|
| 151 |
+
print(f"[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
|
| 152 |
+
sys.stdout.flush()
|
| 153 |
self.verbose = verbose
|
| 154 |
hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
|
| 155 |
self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
|
| 156 |
+
print(f"[DEBUG-ChunkProcessor] ChunkProcessor init complete", flush=True)
|
| 157 |
|
| 158 |
def _requires_remote_code(self, model_name: str) -> bool:
|
| 159 |
normalized = (model_name or "").strip().lower()
|
retriever/retriever.py
CHANGED
|
@@ -5,41 +5,58 @@ from rank_bm25 import BM25Okapi
|
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from typing import Optional, List
|
| 7 |
|
|
|
|
|
|
|
| 8 |
# changed mmr to return final k, as a param, prev was hardcoded to 3
|
| 9 |
# --@Qamare
|
| 10 |
|
| 11 |
# Try to import FlashRank for CPU optimization, fallback to sentence-transformers
|
| 12 |
-
try:
|
| 13 |
-
from flashrank import Ranker, RerankRequest
|
| 14 |
-
FLASHRANK_AVAILABLE = True
|
| 15 |
-
except ImportError:
|
| 16 |
-
from sentence_transformers import CrossEncoder
|
| 17 |
-
FLASHRANK_AVAILABLE = False
|
| 18 |
|
| 19 |
class HybridRetriever:
|
| 20 |
-
def __init__(self,
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
self.embed_model = embed_model
|
| 23 |
self.verbose = verbose
|
| 24 |
self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
try:
|
| 29 |
-
|
| 30 |
-
self.
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
self.bm25 = BM25Okapi(self.tokenized_corpus)
|
| 42 |
-
self.technique_to_indices = self._build_chunking_index_map()
|
| 43 |
|
| 44 |
def _normalize_rerank_model_name(self, model_name: str) -> str:
|
| 45 |
normalized = (model_name or "").strip()
|
|
@@ -76,32 +93,70 @@ class HybridRetriever:
|
|
| 76 |
# Retrieval
|
| 77 |
# ------------------------------------------------------------------
|
| 78 |
|
| 79 |
-
def _semantic_search(self, query, index, top_k,
|
| 80 |
query_vector = self.embed_model.encode(query)
|
| 81 |
query_kwargs = {
|
| 82 |
"vector": query_vector.tolist(),
|
| 83 |
"top_k": top_k,
|
| 84 |
"include_metadata": True,
|
| 85 |
}
|
| 86 |
-
if
|
| 87 |
-
query_kwargs["filter"] = {"chunking_technique": {"$eq":
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
| 89 |
chunks = [match['metadata']['text'] for match in res['matches']]
|
| 90 |
return query_vector, chunks
|
| 91 |
|
| 92 |
-
def _bm25_search(self, query, top_k,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
tokenized_query = self._tokenize(query)
|
| 94 |
-
scores =
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
candidate_indices = self.technique_to_indices.get(chunking_technique, [])
|
| 98 |
-
if not candidate_indices:
|
| 99 |
-
return []
|
| 100 |
-
top_indices = sorted(candidate_indices, key=lambda i: scores[i], reverse=True)[:top_k]
|
| 101 |
-
else:
|
| 102 |
-
top_indices = np.argsort(scores)[::-1][:top_k]
|
| 103 |
-
|
| 104 |
-
return [self.final_chunks[i]['metadata']['text'] for i in top_indices]
|
| 105 |
|
| 106 |
# ------------------------------------------------------------------
|
| 107 |
# Fusion
|
|
@@ -119,20 +174,22 @@ class HybridRetriever:
|
|
| 119 |
# Reranking
|
| 120 |
# ------------------------------------------------------------------
|
| 121 |
|
| 122 |
-
def _cross_encoder_rerank(self, query, chunks, final_k) -> List[str]:
|
| 123 |
-
if
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
ranked_chunks = [
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
|
| 137 |
# ------------------------------------------------------------------
|
| 138 |
# MMR (applied after reranking as a diversity filter)
|
|
@@ -155,7 +212,7 @@ class HybridRetriever:
|
|
| 155 |
# STEP 1: Encode chunks to get embeddings
|
| 156 |
print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
|
| 157 |
try:
|
| 158 |
-
chunk_embeddings = self.embed_model.encode(chunks)
|
| 159 |
print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
|
| 160 |
except Exception as e:
|
| 161 |
print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
|
|
@@ -246,24 +303,30 @@ class HybridRetriever:
|
|
| 246 |
# Main search
|
| 247 |
# ------------------------------------------------------------------
|
| 248 |
|
| 249 |
-
def search(self, query, index, top_k=
|
| 250 |
-
chunking_technique: Optional[str] = None,
|
| 251 |
rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
|
| 252 |
-
|
|
|
|
|
|
|
| 253 |
"""
|
| 254 |
:param mode: "semantic", "bm25", or "hybrid"
|
| 255 |
:param rerank_strategy: "cross-encoder", "rrf", or "none"
|
| 256 |
:param use_mmr: Whether to apply MMR diversity filter after reranking
|
| 257 |
:param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
|
|
|
|
|
|
|
| 258 |
"""
|
| 259 |
should_print = verbose if verbose is not None else self.verbose
|
| 260 |
-
requested_technique = self._normalize_chunking_technique(chunking_technique)
|
| 261 |
total_start = time.perf_counter()
|
| 262 |
semantic_time = 0.0
|
| 263 |
bm25_time = 0.0
|
| 264 |
rerank_time = 0.0
|
| 265 |
mmr_time = 0.0
|
| 266 |
|
|
|
|
|
|
|
|
|
|
| 267 |
if should_print:
|
| 268 |
self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
|
| 269 |
if requested_technique:
|
|
@@ -283,25 +346,32 @@ class HybridRetriever:
|
|
| 283 |
|
| 284 |
if mode in ["bm25", "hybrid"]:
|
| 285 |
bm25_start = time.perf_counter()
|
| 286 |
-
bm25_chunks = self._bm25_search(query, top_k, requested_technique)
|
| 287 |
bm25_time = time.perf_counter() - bm25_start
|
| 288 |
if should_print:
|
| 289 |
self._print_candidates("BM25 Search", bm25_chunks)
|
| 290 |
print(f"BM25 time: {bm25_time:.3f}s")
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
# 2. Fuse / rerank
|
| 293 |
rerank_start = time.perf_counter()
|
|
|
|
| 294 |
if rerank_strategy == "rrf":
|
| 295 |
candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
|
| 296 |
label = "RRF"
|
| 297 |
elif rerank_strategy == "cross-encoder":
|
| 298 |
combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
|
| 299 |
-
candidates = self._cross_encoder_rerank(query, combined, final_k)
|
| 300 |
label = "Cross-Encoder"
|
| 301 |
else: # "none"
|
| 302 |
candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
|
| 303 |
label = "No Reranking"
|
| 304 |
rerank_time = time.perf_counter() - rerank_start
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
# 3. MMR diversity filter (applied after reranking)
|
| 307 |
if use_mmr and candidates:
|
|
@@ -313,13 +383,17 @@ class HybridRetriever:
|
|
| 313 |
label += " + MMR"
|
| 314 |
mmr_time = time.perf_counter() - mmr_start
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
total_time = time.perf_counter() - total_start
|
| 317 |
|
| 318 |
if should_print:
|
| 319 |
self._print_final_results(candidates, label)
|
| 320 |
self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
|
| 321 |
|
| 322 |
-
return candidates
|
| 323 |
|
| 324 |
# ------------------------------------------------------------------
|
| 325 |
# Printing
|
|
|
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from typing import Optional, List
|
| 7 |
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
# changed mmr to return final k, as a param, prev was hardcoded to 3
|
| 11 |
# --@Qamare
|
| 12 |
|
| 13 |
# Try to import FlashRank for CPU optimization, fallback to sentence-transformers
|
| 14 |
+
# try:
|
| 15 |
+
# from flashrank import Ranker, RerankRequest
|
| 16 |
+
# FLASHRANK_AVAILABLE = True
|
| 17 |
+
# except ImportError:
|
| 18 |
+
# from sentence_transformers import CrossEncoder
|
| 19 |
+
# FLASHRANK_AVAILABLE = False
|
| 20 |
|
| 21 |
class HybridRetriever:
|
| 22 |
+
def __init__(self, embed_model, rerank_model_name='jinaai/jina-reranker-v1-tiny-en', verbose: bool = True):
|
| 23 |
+
import sys
|
| 24 |
+
import os
|
| 25 |
+
print(f"[DEBUG-HybridRetriever] Starting init", flush=True)
|
| 26 |
self.embed_model = embed_model
|
| 27 |
self.verbose = verbose
|
| 28 |
self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
|
| 29 |
+
print(f"[DEBUG-HybridRetriever] Rerank model name: {self.rerank_model_name}", flush=True)
|
| 30 |
+
|
| 31 |
+
self.vo_client = None
|
| 32 |
+
self.ce_reranker = None
|
| 33 |
+
self.reranker_backend = "cross-encoder"
|
| 34 |
+
|
| 35 |
+
voyage_api_key = os.getenv("VOYAGE_API_KEY")
|
| 36 |
+
if voyage_api_key:
|
| 37 |
try:
|
| 38 |
+
import voyageai
|
| 39 |
+
self.vo_client = voyageai.Client(api_key=voyage_api_key)
|
| 40 |
+
self.reranker_backend = "voyageai"
|
| 41 |
+
# Voyage uses model IDs like rerank-2.5; keep a safe default.
|
| 42 |
+
if not self.rerank_model_name.startswith("rerank-"):
|
| 43 |
+
self.rerank_model_name = "rerank-2.5"
|
| 44 |
+
print(f"[DEBUG-HybridRetriever] Voyage AI client initialized", flush=True)
|
| 45 |
+
except Exception as exc:
|
| 46 |
+
print(f"[DEBUG-HybridRetriever] Voyage unavailable ({exc}); falling back to cross-encoder", flush=True)
|
| 47 |
+
|
| 48 |
+
if self.vo_client is None:
|
| 49 |
+
from sentence_transformers import CrossEncoder
|
| 50 |
+
ce_model_name = self.rerank_model_name
|
| 51 |
+
if not ce_model_name.startswith("cross-encoder/"):
|
| 52 |
+
ce_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 53 |
+
self.ce_reranker = CrossEncoder(ce_model_name)
|
| 54 |
+
self.rerank_model_name = ce_model_name
|
| 55 |
+
self.reranker_backend = "cross-encoder"
|
| 56 |
+
print(f"[DEBUG-HybridRetriever] Cross-encoder reranker initialized: {ce_model_name}", flush=True)
|
| 57 |
|
| 58 |
+
sys.stdout.flush()
|
| 59 |
+
print(f"[DEBUG-HybridRetriever] Init complete", flush=True)
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def _normalize_rerank_model_name(self, model_name: str) -> str:
|
| 62 |
normalized = (model_name or "").strip()
|
|
|
|
| 93 |
# Retrieval
|
| 94 |
# ------------------------------------------------------------------
|
| 95 |
|
| 96 |
+
def _semantic_search(self, query, index, top_k, technique_name: Optional[str] = None) -> tuple[np.ndarray, List[str]]:
|
| 97 |
query_vector = self.embed_model.encode(query)
|
| 98 |
query_kwargs = {
|
| 99 |
"vector": query_vector.tolist(),
|
| 100 |
"top_k": top_k,
|
| 101 |
"include_metadata": True,
|
| 102 |
}
|
| 103 |
+
if technique_name:
|
| 104 |
+
query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
|
| 105 |
+
|
| 106 |
+
res = index.query(
|
| 107 |
+
**query_kwargs
|
| 108 |
+
)
|
| 109 |
chunks = [match['metadata']['text'] for match in res['matches']]
|
| 110 |
return query_vector, chunks
|
| 111 |
|
| 112 |
+
def _bm25_search(self, query, index, top_k=50, technique_name: Optional[str] = None) -> List[str]:
|
| 113 |
+
try:
|
| 114 |
+
import os
|
| 115 |
+
from pinecone import Pinecone
|
| 116 |
+
from pinecone_text.sparse import BM25Encoder
|
| 117 |
+
encoder = BM25Encoder().default()
|
| 118 |
+
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
| 119 |
+
sparse_index = pc.Index("cbt-book-sparse")
|
| 120 |
+
sparse_vector = encoder.encode_queries(query)
|
| 121 |
+
query_kwargs = {
|
| 122 |
+
"sparse_vector": sparse_vector,
|
| 123 |
+
"top_k": top_k,
|
| 124 |
+
"include_metadata": True,
|
| 125 |
+
}
|
| 126 |
+
if technique_name:
|
| 127 |
+
query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
|
| 128 |
+
|
| 129 |
+
res = sparse_index.query(**query_kwargs)
|
| 130 |
+
return [match["metadata"]["text"] for match in res["matches"]]
|
| 131 |
+
except Exception as e:
|
| 132 |
+
print(f"Error in BM25 search against Pinecone: {e}")
|
| 133 |
+
return []
|
| 134 |
+
|
| 135 |
+
"""Fetch chunks from Pinecone and perform BM25 ranking locally."""
|
| 136 |
+
# Fetch more candidates than needed for BM25 to rank against
|
| 137 |
+
# Use a reasonable multiplier to get enough candidates without over-fetching
|
| 138 |
+
fetch_limit = min(top_k * 4,25) # e.g., 4*4=16, capped at 50
|
| 139 |
+
res = index.query(
|
| 140 |
+
vector=[0.0] * 512, # Dummy vector (BM25 doesn't use embeddings)
|
| 141 |
+
top_k=fetch_limit,
|
| 142 |
+
include_metadata=True,
|
| 143 |
+
filter={"chunking_technique": {"$eq": technique_name}}
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
# Extract chunks
|
| 147 |
+
chunks = [match['metadata']['text'] for match in res['matches']]
|
| 148 |
+
if not chunks:
|
| 149 |
+
return []
|
| 150 |
+
|
| 151 |
+
# Build BM25 index on these chunks
|
| 152 |
+
tokenized_corpus = [self._tokenize(chunk) for chunk in chunks]
|
| 153 |
+
bm25 = BM25Okapi(tokenized_corpus)
|
| 154 |
+
|
| 155 |
+
# Score query against chunks
|
| 156 |
tokenized_query = self._tokenize(query)
|
| 157 |
+
scores = bm25.get_scores(tokenized_query)
|
| 158 |
+
top_indices = np.argsort(scores)[::-1][:top_k]
|
| 159 |
+
return [chunks[i] for i in top_indices]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
# ------------------------------------------------------------------
|
| 162 |
# Fusion
|
|
|
|
| 174 |
# Reranking
|
| 175 |
# ------------------------------------------------------------------
|
| 176 |
|
| 177 |
+
def _cross_encoder_rerank(self, query, chunks, final_k) -> tuple[List[str], List[float]]:
|
| 178 |
+
if not chunks:
|
| 179 |
+
return [], []
|
| 180 |
+
|
| 181 |
+
if self.vo_client is not None:
|
| 182 |
+
reranking = self.vo_client.rerank(query, chunks, model=self.rerank_model_name, top_k=final_k)
|
| 183 |
+
ranked_chunks = [result.document for result in reranking.results]
|
| 184 |
+
ranked_scores = [result.relevance_score for result in reranking.results]
|
| 185 |
+
return ranked_chunks, ranked_scores
|
| 186 |
+
|
| 187 |
+
pairs = [[query, chunk] for chunk in chunks]
|
| 188 |
+
scores = self.ce_reranker.predict(pairs)
|
| 189 |
+
ranked_indices = np.argsort(scores)[::-1][:final_k]
|
| 190 |
+
ranked_chunks = [chunks[i] for i in ranked_indices]
|
| 191 |
+
ranked_scores = [float(scores[i]) for i in ranked_indices]
|
| 192 |
+
return ranked_chunks, ranked_scores
|
| 193 |
|
| 194 |
# ------------------------------------------------------------------
|
| 195 |
# MMR (applied after reranking as a diversity filter)
|
|
|
|
| 212 |
# STEP 1: Encode chunks to get embeddings
|
| 213 |
print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
|
| 214 |
try:
|
| 215 |
+
chunk_embeddings = np.array([self.embed_model.encode(c) for c in chunks])
|
| 216 |
print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
|
| 217 |
except Exception as e:
|
| 218 |
print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
|
|
|
|
| 303 |
# Main search
|
| 304 |
# ------------------------------------------------------------------
|
| 305 |
|
| 306 |
+
def search(self, query, index, top_k=50, final_k=5, mode="hybrid",
|
|
|
|
| 307 |
rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
|
| 308 |
+
technique_name: Optional[str] = None,
|
| 309 |
+
chunking_technique: Optional[str] = None,
|
| 310 |
+
verbose: Optional[bool] = None, test: bool = False) -> tuple[List[str], float]:
|
| 311 |
"""
|
| 312 |
:param mode: "semantic", "bm25", or "hybrid"
|
| 313 |
:param rerank_strategy: "cross-encoder", "rrf", or "none"
|
| 314 |
:param use_mmr: Whether to apply MMR diversity filter after reranking
|
| 315 |
:param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
|
| 316 |
+
:param technique_name: Chunking technique to filter by (default: "markdown")
|
| 317 |
+
:returns: Tuple of (ranked_chunks, avg_chunk_score)
|
| 318 |
"""
|
| 319 |
should_print = verbose if verbose is not None else self.verbose
|
| 320 |
+
requested_technique = self._normalize_chunking_technique(chunking_technique or technique_name)
|
| 321 |
total_start = time.perf_counter()
|
| 322 |
semantic_time = 0.0
|
| 323 |
bm25_time = 0.0
|
| 324 |
rerank_time = 0.0
|
| 325 |
mmr_time = 0.0
|
| 326 |
|
| 327 |
+
if use_mmr:
|
| 328 |
+
final_k = 10
|
| 329 |
+
|
| 330 |
if should_print:
|
| 331 |
self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
|
| 332 |
if requested_technique:
|
|
|
|
| 346 |
|
| 347 |
if mode in ["bm25", "hybrid"]:
|
| 348 |
bm25_start = time.perf_counter()
|
| 349 |
+
bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
|
| 350 |
bm25_time = time.perf_counter() - bm25_start
|
| 351 |
if should_print:
|
| 352 |
self._print_candidates("BM25 Search", bm25_chunks)
|
| 353 |
print(f"BM25 time: {bm25_time:.3f}s")
|
| 354 |
+
print("All BM25 results:")
|
| 355 |
+
for i, chunk in enumerate(bm25_chunks):
|
| 356 |
+
print(f" [{i}] {chunk[:200]}..." if len(chunk) > 200 else f" [{i}] {chunk}")
|
| 357 |
|
| 358 |
# 2. Fuse / rerank
|
| 359 |
rerank_start = time.perf_counter()
|
| 360 |
+
chunk_scores = []
|
| 361 |
if rerank_strategy == "rrf":
|
| 362 |
candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
|
| 363 |
label = "RRF"
|
| 364 |
elif rerank_strategy == "cross-encoder":
|
| 365 |
combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
|
| 366 |
+
candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
|
| 367 |
label = "Cross-Encoder"
|
| 368 |
else: # "none"
|
| 369 |
candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
|
| 370 |
label = "No Reranking"
|
| 371 |
rerank_time = time.perf_counter() - rerank_start
|
| 372 |
+
|
| 373 |
+
# Compute average chunk score
|
| 374 |
+
avg_chunk_score = float(np.mean(chunk_scores)) if chunk_scores else 0.0
|
| 375 |
|
| 376 |
# 3. MMR diversity filter (applied after reranking)
|
| 377 |
if use_mmr and candidates:
|
|
|
|
| 383 |
label += " + MMR"
|
| 384 |
mmr_time = time.perf_counter() - mmr_start
|
| 385 |
|
| 386 |
+
if test and rerank_strategy != "cross-encoder" and candidates:
|
| 387 |
+
_, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
|
| 388 |
+
avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0
|
| 389 |
+
|
| 390 |
total_time = time.perf_counter() - total_start
|
| 391 |
|
| 392 |
if should_print:
|
| 393 |
self._print_final_results(candidates, label)
|
| 394 |
self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
|
| 395 |
|
| 396 |
+
return candidates, avg_chunk_score
|
| 397 |
|
| 398 |
# ------------------------------------------------------------------
|
| 399 |
# Printing
|
test.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
| 3 |
+
os.environ["MKL_NUM_THREADS"] = "1"
|
| 4 |
+
import sys
|
| 5 |
+
import traceback
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
from config_loader import cfg
|
| 10 |
+
from data.vector_db import get_index_by_name
|
| 11 |
+
from retriever.retriever import HybridRetriever
|
| 12 |
+
from retriever.processor import ChunkProcessor
|
| 13 |
+
from data.ingest import CHUNKING_TECHNIQUES
|
| 14 |
+
|
| 15 |
+
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
|
| 16 |
+
"""
|
| 17 |
+
Generates a Markdown document summarizing the retrieved chunks
|
| 18 |
+
for each query, chunking technique, and retrieval strategy.
|
| 19 |
+
"""
|
| 20 |
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 21 |
+
|
| 22 |
+
content = f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"
|
| 23 |
+
content += "## Test Queries\n\n"
|
| 24 |
+
for i, q in enumerate(queries, 1):
|
| 25 |
+
content += f"{i}. {q}\n"
|
| 26 |
+
|
| 27 |
+
content += "\n## Retrieval Results by Query\n\n"
|
| 28 |
+
|
| 29 |
+
for q_idx, q_results in all_results.items():
|
| 30 |
+
content += f"### Query {q_idx + 1}: {queries[q_idx]}\n\n"
|
| 31 |
+
|
| 32 |
+
for tech_strat_key, chunks_data in q_results.items():
|
| 33 |
+
content += f"#### Strategy & Technique: {tech_strat_key}\n\n"
|
| 34 |
+
|
| 35 |
+
chunks = chunks_data.get('chunks', [])
|
| 36 |
+
score = chunks_data.get('score', 0)
|
| 37 |
+
|
| 38 |
+
content += f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n"
|
| 39 |
+
|
| 40 |
+
if not chunks:
|
| 41 |
+
content += "*No chunks retrieved.*\n\n"
|
| 42 |
+
else:
|
| 43 |
+
for i, chunk in enumerate(chunks, 1):
|
| 44 |
+
content += f"**[Chunk {i}]** ({len(chunk)} chars):\n"
|
| 45 |
+
content += f"```text\n{chunk}\n```\n\n"
|
| 46 |
+
|
| 47 |
+
content += "---\n\n"
|
| 48 |
+
|
| 49 |
+
with open(output_file, 'w', encoding='utf-8') as f:
|
| 50 |
+
f.write(content)
|
| 51 |
+
|
| 52 |
+
print(f"\nRetrieval report saved to: {output_file}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def main():
|
| 56 |
+
# Load environment variables
|
| 57 |
+
load_dotenv()
|
| 58 |
+
|
| 59 |
+
pinecone_key = os.getenv("PINECONE_API_KEY")
|
| 60 |
+
if not pinecone_key:
|
| 61 |
+
raise RuntimeError("PINECONE_API_KEY not found in environment variables")
|
| 62 |
+
|
| 63 |
+
test_queries = [
|
| 64 |
+
"What is cognitive behavior therapy and how does it work?",
|
| 65 |
+
"I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
|
| 66 |
+
"No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
|
| 67 |
+
"I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
|
| 68 |
+
]
|
| 69 |
+
|
| 70 |
+
# TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
|
| 71 |
+
# Use all 7 chunking techniques from ingest.py
|
| 72 |
+
CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
|
| 73 |
+
print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
|
| 74 |
+
for tech in CHUNKING_TECHNIQUES_FILTERED:
|
| 75 |
+
print(f" - {tech['name']}: {tech['description']}")
|
| 76 |
+
|
| 77 |
+
RETRIEVAL_STRATEGIES = [
|
| 78 |
+
{"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
|
| 79 |
+
{"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr"},
|
| 80 |
+
{"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
|
| 81 |
+
{"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr"},
|
| 82 |
+
{"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr"},
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
+
print("Initializing ChunkProcessor to load Embedding Model...")
|
| 86 |
+
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
print("Initializing HybridRetriever...")
|
| 90 |
+
retriever = HybridRetriever(
|
| 91 |
+
embed_model=proc.encoder,
|
| 92 |
+
rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
|
| 93 |
+
verbose=False
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
all_query_results = {}
|
| 97 |
+
|
| 98 |
+
for query_idx, query in enumerate(test_queries):
|
| 99 |
+
print(f"\n{'='*80}")
|
| 100 |
+
print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
|
| 101 |
+
print(f"{'='*80}")
|
| 102 |
+
|
| 103 |
+
query_results = {}
|
| 104 |
+
|
| 105 |
+
# Connect to the single index where all techniques are stored with metadata differentiation
|
| 106 |
+
index_name = "cbt-book-recursive"
|
| 107 |
+
try:
|
| 108 |
+
index = get_index_by_name(pinecone_key, index_name)
|
| 109 |
+
stats = index.describe_index_stats()
|
| 110 |
+
if stats.get('total_vector_count', 0) == 0:
|
| 111 |
+
print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
|
| 112 |
+
except Exception as e:
|
| 113 |
+
print(f" [X] Failed to connect to index {index_name}: {e}")
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
for technique in CHUNKING_TECHNIQUES_FILTERED:
|
| 117 |
+
technique_name = technique['name']
|
| 118 |
+
|
| 119 |
+
for strategy in RETRIEVAL_STRATEGIES:
|
| 120 |
+
result_key = f"{technique_name} + {strategy['label']}"
|
| 121 |
+
print(f"\nEvaluating: {result_key}")
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
context_chunks, chunk_score = retriever.search(
|
| 125 |
+
query=query,
|
| 126 |
+
index=index,
|
| 127 |
+
mode=strategy['mode'],
|
| 128 |
+
rerank_strategy="cross-encoder",
|
| 129 |
+
use_mmr=strategy['use_mmr'],
|
| 130 |
+
top_k=25,
|
| 131 |
+
final_k=4,
|
| 132 |
+
technique_name=technique_name,
|
| 133 |
+
verbose=False,
|
| 134 |
+
test=True
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
query_results[result_key] = {
|
| 138 |
+
'chunks': context_chunks,
|
| 139 |
+
'score': chunk_score
|
| 140 |
+
}
|
| 141 |
+
print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
|
| 142 |
+
|
| 143 |
+
except Exception as e:
|
| 144 |
+
print(f" -> Error retrieving for {result_key}: {e}")
|
| 145 |
+
|
| 146 |
+
all_query_results[query_idx] = query_results
|
| 147 |
+
|
| 148 |
+
# Generate isolated retrieval test report
|
| 149 |
+
generate_retrieval_report(all_query_results, test_queries)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
if __name__ == '__main__':
|
| 153 |
+
main()
|