Qar-Raz commited on
Commit
c27a4e3
·
verified ·
1 Parent(s): f5ff6c4

Sync backend Docker context from GitHub main

Browse files
Dockerfile CHANGED
@@ -17,7 +17,8 @@ RUN pip install --upgrade pip && pip install -r requirements.txt
17
 
18
  COPY . .
19
 
20
- # Fail fast during build if critical runtime modules are missing from context.
 
21
  RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
22
 
23
  # Hugging Face Spaces exposes apps on port 7860 by default.
 
17
 
18
  COPY . .
19
 
20
+ # Fail fast during build if critical runtime code is missing from context.
21
+ # Changed because we no longer have a monorepo.
22
  RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
23
 
24
  # Hugging Face Spaces exposes apps on port 7860 by default.
backend/routes/predict.py CHANGED
@@ -39,7 +39,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
39
  model_resolve_time = time.perf_counter() - model_resolve_start
40
 
41
  retrieval_start = time.perf_counter()
42
- contexts = retriever.search(
43
  query,
44
  index,
45
  chunking_technique=payload.chunking_technique,
@@ -72,6 +72,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
72
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
73
  f"chunking={payload.chunking_technique} | "
74
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
 
75
  f"precheck={precheck_time:.3f}s | "
76
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
77
  f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
 
39
  model_resolve_time = time.perf_counter() - model_resolve_start
40
 
41
  retrieval_start = time.perf_counter()
42
+ contexts, chunk_score = retriever.search(
43
  query,
44
  index,
45
  chunking_technique=payload.chunking_technique,
 
72
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
73
  f"chunking={payload.chunking_technique} | "
74
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
75
+ f"chunk_score={chunk_score:.4f} | "
76
  f"precheck={precheck_time:.3f}s | "
77
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
78
  f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
backend/routes/predict_stream.py CHANGED
@@ -47,7 +47,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
47
  model_resolve_time = time.perf_counter() - model_resolve_start
48
 
49
  retrieval_start = time.perf_counter()
50
- contexts = retriever.search(
51
  query,
52
  index,
53
  chunking_technique=payload.chunking_technique,
@@ -80,6 +80,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
80
  "requested_top_k": payload.top_k,
81
  "requested_final_k": payload.final_k,
82
  "returned_context_count": len(contexts),
 
83
  "use_mmr": payload.use_mmr,
84
  "lambda_param": payload.lambda_param,
85
  },
@@ -114,6 +115,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
114
  "requested_top_k": payload.top_k,
115
  "requested_final_k": payload.final_k,
116
  "returned_context_count": len(contexts),
 
117
  "use_mmr": payload.use_mmr,
118
  "lambda_param": payload.lambda_param,
119
  },
@@ -127,6 +129,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
127
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
128
  f"chunking={payload.chunking_technique} | "
129
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
 
130
  f"precheck={precheck_time:.3f}s | "
131
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
132
  f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
 
47
  model_resolve_time = time.perf_counter() - model_resolve_start
48
 
49
  retrieval_start = time.perf_counter()
50
+ contexts, chunk_score = retriever.search(
51
  query,
52
  index,
53
  chunking_technique=payload.chunking_technique,
 
80
  "requested_top_k": payload.top_k,
81
  "requested_final_k": payload.final_k,
82
  "returned_context_count": len(contexts),
83
+ "chunk_score": chunk_score,
84
  "use_mmr": payload.use_mmr,
85
  "lambda_param": payload.lambda_param,
86
  },
 
115
  "requested_top_k": payload.top_k,
116
  "requested_final_k": payload.final_k,
117
  "returned_context_count": len(contexts),
118
+ "chunk_score": chunk_score,
119
  "use_mmr": payload.use_mmr,
120
  "lambda_param": payload.lambda_param,
121
  },
 
129
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
130
  f"chunking={payload.chunking_technique} | "
131
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
132
+ f"chunk_score={chunk_score:.4f} | "
133
  f"precheck={precheck_time:.3f}s | "
134
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
135
  f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
backend/services/startup.py CHANGED
@@ -67,7 +67,6 @@ def initialize_runtime_state(state: dict[str, Any]) -> None:
67
 
68
  retriever_start = time.perf_counter()
69
  retriever = HybridRetriever(
70
- final_chunks,
71
  proc.encoder,
72
  rerank_model_name=rerank_model_name,
73
  verbose=False,
 
67
 
68
  retriever_start = time.perf_counter()
69
  retriever = HybridRetriever(
 
70
  proc.encoder,
71
  rerank_model_name=rerank_model_name,
72
  verbose=False,
config.yaml CHANGED
@@ -27,20 +27,18 @@ retrieval:
27
  mode: "hybrid"
28
  # Options: cross-encoder, rrf
29
  rerank_strategy: "cross-encoder"
30
- use_mmr: true
31
- top_k: 10
32
  final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
  max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
- judge_model: "stepfun/step-3.5-flash:free"
39
 
40
  # List of contestants in the tournament
41
  models:
42
  - "Llama-3-8B"
43
  - "Mistral-7B"
44
- - "Qwen-2.5"
45
- - "DeepSeek-V3"
46
  - "TinyAya"
 
27
  mode: "hybrid"
28
  # Options: cross-encoder, rrf
29
  rerank_strategy: "cross-encoder"
30
+ use_mmr: False
31
+ top_k: 50
32
  final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
  max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
+ judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
42
  - "Llama-3-8B"
43
  - "Mistral-7B"
 
 
44
  - "TinyAya"
data/__init__.py DELETED
File without changes
data/vector_db.py DELETED
@@ -1,245 +0,0 @@
1
- import time
2
- import re
3
- import json
4
- from pathlib import Path
5
- from typing import Any, Dict, List
6
- from pinecone import Pinecone, ServerlessSpec
7
-
8
-
9
- # Added caching to reduce consecutive startup time
10
- # --@Qamar
11
-
12
def slugify_technique(name):
    """Lower-case *name* and collapse non-alphanumeric runs into hyphens.

    Produces a string safe for Pinecone index naming,
    e.g. 'Sentence Splitter' -> 'sentence-splitter'.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
15
-
16
def get_index_by_name(api_key: str, index_name: str):
    """
    Connect directly to a Pinecone index by its full string name.

    Useful on the API/production side where the name is already known.
    Raises ValueError when the index does not exist, so callers get a
    clear error instead of a 404 crash from Pinecone.
    """
    client = Pinecone(api_key=api_key)

    # Verify the name is known before connecting.
    known_names = {idx.name for idx in client.list_indexes()}
    if index_name not in known_names:
        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")

    print(f" Connecting to Index: {index_name}")
    return client.Index(index_name)
30
-
31
def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
    """
    Create (if needed) and return a technique-specific Pinecone index.

    The index name is '<base_name>-<slugified technique>',
    e.g. 'arxiv-index-token'.
    """
    client = Pinecone(api_key=api_key)
    full_index_name = f"{base_name}-{slugify_technique(technique)}"

    already_there = {idx.name for idx in client.list_indexes()}
    if full_index_name not in already_there:
        print(f" Creating specialized index: {full_index_name}...")
        client.create_index(
            name=full_index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        # Block until the freshly created index reports ready.
        while not client.describe_index(full_index_name).status['ready']:
            time.sleep(1)

    # Reuse the lookup helper so the returned object is built one way.
    return get_index_by_name(api_key, full_index_name)
56
-
57
def refresh_pinecone_index(index, final_chunks, batch_size=100):
    """
    Ensure the technique-specific index contains the provided chunks.

    Upserts when the index is empty or holds fewer vectors than expected;
    otherwise leaves it untouched. Returns True when an upsert happened,
    False otherwise (including on error — this is best-effort).
    """
    if not final_chunks:
        print("No chunks provided to refresh.")
        return False

    try:
        # Compare what's already stored against what we were given.
        current_count = index.describe_index_stats().get('total_vector_count', 0)
        expected_count = len(final_chunks)
        print(f" Index Stats -> Existing: {current_count} | New Chunks: {expected_count}")

        if current_count == 0:
            print(f"➕ Index is empty. Upserting {expected_count} vectors...")
        elif current_count < expected_count:
            # Simple check to see if we need to top up or refresh.
            print(f" Vector count mismatch ({current_count} < {expected_count}). Updating index...")
        else:
            print(f" Index is already populated with {current_count} vectors. Ready for search.")
            return False

        upsert_to_pinecone(index, prepare_vectors_for_upsert(final_chunks), batch_size)
        return True

    except Exception as e:
        print(f" Error refreshing index: {e}")
        return False
94
-
95
- # Utility functions remain the same as previous version
96
def prepare_vectors_for_upsert(final_chunks):
    """
    Convert processed chunks into Pinecone upsert payloads.

    Each chunk must provide 'id' and 'values'; its 'metadata' dict is
    copied and back-filled with defaults so every vector carries the
    fields the retrieval side expects. Non-dict metadata (e.g. None) is
    treated as empty.

    Fix: the original chained six setdefault calls that each re-fetched
    the same key from the source dict with the same default — a no-op
    re-lookup, since dict(meta) had already copied every existing key.
    setdefault(key, default) alone is exactly equivalent.
    """
    # Defaults applied for any metadata field a chunk is missing.
    metadata_defaults = (
        ('text', ""),
        ('title', ""),
        ('url', ""),
        ('chunk_index', 0),
        ('technique', "unknown"),
        ('chunking_technique', "unknown"),
    )

    vectors = []
    for chunk in final_chunks:
        meta = chunk.get('metadata', {})
        metadata_payload = dict(meta) if isinstance(meta, dict) else {}
        for key, default in metadata_defaults:
            # setdefault keeps any value already present (even falsy ones),
            # matching the original behavior exactly.
            metadata_payload.setdefault(key, default)

        vectors.append({
            'id': chunk['id'],
            'values': chunk['values'],
            'metadata': metadata_payload,
        })
    return vectors
114
-
115
def upsert_to_pinecone(index, chunks, batch_size=100):
    """Upsert *chunks* into *index* in batches of at most *batch_size*."""
    total = len(chunks)
    start = 0
    while start < total:
        index.upsert(vectors=chunks[start:start + batch_size])
        start += batch_size
119
-
120
- # Some methods for loading chunks back from Pinecone with local caching to speed up BM25 initialization
121
-
122
- def _sanitize_index_name(index_name: str) -> str:
123
- return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
124
-
125
-
126
- def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
127
- cache_root = Path(cache_dir)
128
- cache_root.mkdir(parents=True, exist_ok=True)
129
- safe_name = _sanitize_index_name(index_name)
130
- return cache_root / f"bm25_chunks_{safe_name}.json"
131
-
132
-
133
- def _read_chunk_cache(path: Path) -> Dict[str, Any]:
134
- with path.open("r", encoding="utf-8") as f:
135
- return json.load(f)
136
-
137
-
138
- def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
139
- with path.open("w", encoding="utf-8") as f:
140
- json.dump(payload, f)
141
-
142
-
143
def load_chunks_with_local_cache(
    index,
    index_name: str,
    cache_dir: str = ".cache",
    batch_size: int = 100,
    force_refresh: bool = False,
) -> tuple[List[Dict[str, Any]], str]:
    """
    Load the BM25 chunk corpus, preferring a local JSON cache.

    The cache is valid when its recorded vector count matches the live
    Pinecone count and it holds at least one chunk; otherwise the corpus
    is re-pulled from Pinecone and the cache rewritten. Returns the
    chunks plus the source they came from: "cache" or "pinecone".
    """
    cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
    live_count = index.describe_index_stats().get("total_vector_count", 0)

    if cache_file.exists() and not force_refresh:
        try:
            payload = _read_chunk_cache(cache_file)
            cached_count = payload.get("meta", {}).get("vector_count", -1)
            cached_chunks = payload.get("chunks", [])

            if cached_chunks and cached_count == live_count:
                print(
                    f" Loaded BM25 chunk cache: {cache_file} "
                    f"(chunks={len(cached_chunks)}, vectors={cached_count})"
                )
                return cached_chunks, "cache"

            print(
                " BM25 cache stale or empty. "
                f"cache_vectors={cached_count}, pinecone_vectors={live_count}. Refreshing..."
            )
        except Exception as e:
            print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")

    # Cache miss, stale cache, or forced refresh: pull everything from Pinecone.
    chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
    fresh_payload = {
        "meta": {
            "index_name": index_name,
            "vector_count": live_count,
            "updated_at_epoch_s": int(time.time()),
        },
        "chunks": chunks,
    }

    # Best effort: a failed cache write must not break retrieval.
    try:
        _write_chunk_cache(cache_file, fresh_payload)
        print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
    except Exception as e:
        print(f" Failed to write BM25 cache ({cache_file}): {e}")

    return chunks, "pinecone"
193
-
194
-
195
def load_chunks_from_pinecone(index, batch_size: int = 100) -> List[Dict[str, Any]]:
    """
    Scan the entire Pinecone index and collect text metadata for BM25.

    Iterates every namespace, fetches vectors in batches of *batch_size*,
    de-duplicates by vector id, and keeps only vectors whose metadata has
    a non-empty 'text'. Returns [{'id': ..., 'metadata': {...}}, ...].

    Fix: the return annotation used the builtin function ``any`` instead
    of ``typing.Any`` (``list[dict[str, any]]``) — a wrong type hint,
    now consistent with the module's other ``List[Dict[str, Any]]`` usage.
    """
    stats = index.describe_index_stats()
    namespaces = list(stats.get('namespaces', {}).keys())
    # If no namespaces are explicitly named, Pinecone uses an empty string
    # for the default namespace.
    if not namespaces:
        namespaces = [""]

    all_chunks: List[Dict[str, Any]] = []
    seen_ids = set()

    print(f"Loading vectors for BM25 from namespaces: {namespaces}")

    for ns in namespaces:
        # Pinecone's list() generator yields batches of vector ids.
        for id_batch in index.list(namespace=ns, limit=batch_size):
            if not id_batch:
                continue

            # Fetch the actual content (metadata) for this batch of ids.
            fetched = index.fetch(ids=id_batch, namespace=ns)
            vectors = getattr(fetched, "vectors", {})

            for vector_id, vector_data in vectors.items():
                if vector_id in seen_ids:
                    continue
                seen_ids.add(vector_id)

                # Metadata may be absent, None, or a non-dict mapping object.
                metadata = getattr(vector_data, "metadata", {}) or {}
                if not isinstance(metadata, dict):
                    metadata = dict(metadata)

                # Vectors without text are useless for the BM25 corpus.
                if not metadata.get("text"):
                    continue

                all_chunks.append({
                    "id": vector_id,
                    "metadata": metadata,
                })

        print(f" Finished namespace: '{ns if ns else 'default'}'")

    print(f"Total chunks loaded into memory: {len(all_chunks)}")
    return all_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -2,8 +2,6 @@ import os
2
  import json
3
  import time
4
  from datetime import datetime
5
- from multiprocessing import Pool, cpu_count
6
- from functools import partial
7
  from dotenv import load_dotenv
8
  from config_loader import cfg
9
 
@@ -18,63 +16,47 @@ from data.ingest import ingest_data, CHUNKING_TECHNIQUES
18
  # Import model fleet
19
  from models.llama_3_8b import Llama3_8B
20
  from models.mistral_7b import Mistral_7b
21
- from models.qwen_2_5 import Qwen2_5
22
- from models.deepseek_v3 import DeepSeek_V3
23
  from models.tiny_aya import TinyAya
24
 
25
  MODEL_MAP = {
26
  "Llama-3-8B": Llama3_8B,
27
  "Mistral-7B": Mistral_7b,
28
- "Qwen-2.5": Qwen2_5,
29
- "DeepSeek-V3": DeepSeek_V3,
30
  "TinyAya": TinyAya
31
  }
32
 
33
  load_dotenv()
34
 
35
 
36
- def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine):
37
- """Run RAG pipeline for a specific chunking technique."""
 
 
 
 
38
 
39
  print(f"\n{'='*80}")
40
- print(f"TECHNIQUE: {technique_name.upper()}")
41
  print(f"{'='*80}")
42
 
43
- # Filter chunks by technique metadata
44
- query_vector = encoder.encode(query).tolist()
45
-
46
- # Query with metadata filter for this technique - get more candidates for reranking
47
- res = index.query(
48
- vector=query_vector,
49
- top_k=25,
50
- include_metadata=True,
51
- filter={"technique": {"$eq": technique_name}}
 
 
52
  )
53
 
54
- # Extract context chunks with URLs
55
- all_candidates = []
56
- chunk_urls = []
57
- for match in res['matches']:
58
- all_candidates.append(match['metadata']['text'])
59
- chunk_urls.append(match['metadata'].get('url', ''))
60
 
61
- print(f"\nRetrieved {len(all_candidates)} candidate chunks for technique '{technique_name}'")
62
-
63
- if not all_candidates:
64
  print(f"WARNING: No chunks found for technique '{technique_name}'")
65
  return {}
66
 
67
- # Apply cross-encoder reranking to get top 5
68
- # Use global reranker loaded once per worker
69
- global _worker_reranker
70
- pairs = [[query, chunk] for chunk in all_candidates]
71
- scores = _worker_reranker.predict(pairs)
72
- ranked = sorted(zip(all_candidates, chunk_urls, scores), key=lambda x: x[2], reverse=True)
73
- context_chunks = [chunk for chunk, _, _ in ranked[:5]]
74
- context_urls = [url for _, url, _ in ranked[:5]]
75
-
76
- print(f"After reranking: {len(context_chunks)} chunks (top 5)")
77
-
78
  # Print the final RAG context being passed to models (only once)
79
  print(f"\n{'='*80}")
80
  print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
@@ -88,6 +70,8 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
88
 
89
  # Run model tournament for this technique
90
  tournament_results = {}
 
 
91
 
92
  for name, model_inst in models.items():
93
  print(f"\n{'-'*60}")
@@ -97,7 +81,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
97
  # Generation
98
  answer = rag_engine.get_answer(
99
  model_inst, query, context_chunks,
100
- context_urls=context_urls,
101
  temperature=cfg.gen['temperature']
102
  )
103
 
@@ -118,7 +101,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
118
  "Relevancy": rel['score'],
119
  "Claims": faith['details'],
120
  "context_chunks": context_chunks,
121
- "context_urls": context_urls
122
  }
123
 
124
  print(f"\n📊 EVALUATION SCORES:")
@@ -135,7 +117,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
135
  "Claims": [],
136
  "error": str(e),
137
  "context_chunks": context_chunks,
138
- "context_urls": context_urls
139
  }
140
 
141
  return tournament_results
@@ -185,13 +166,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
185
 
186
  # Aggregate results across all queries
187
  aggregated_results = {}
 
188
 
189
  for query_idx, query_results in all_query_results.items():
190
  for technique_name, model_results in query_results.items():
191
  if technique_name not in aggregated_results:
192
  aggregated_results[technique_name] = {}
193
 
 
 
 
 
194
  for model_name, results in model_results.items():
 
 
 
195
  if model_name not in aggregated_results[technique_name]:
196
  aggregated_results[technique_name][model_name] = {
197
  'Faithfulness': [],
@@ -213,6 +202,15 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
213
  content += "No results available for this technique.\n\n"
214
  continue
215
 
 
 
 
 
 
 
 
 
 
216
  # Create results table with averaged scores
217
  content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
218
  content += "|-------|------------------|---------------|--------------|\n"
@@ -348,8 +346,8 @@ Based on the ablation study results:
348
  - *Embedding Model:* Jina embeddings (512 dimensions)
349
  - *Vector Database:* Pinecone (serverless, AWS us-east-1)
350
  - *Judge Model:* Openrouter Free models
351
- - *Retrieval:* Top 5 chunks per technique
352
- - *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing)
353
 
354
  ---
355
 
@@ -364,81 +362,100 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
364
  return output_file
365
 
366
 
367
- # Global variables for worker processes
368
- _worker_proc = None
369
- _worker_evaluator = None
370
- _worker_models = None
371
- _worker_rag_engine = None
372
- _worker_reranker = None
373
-
374
- def init_worker(model_name, evaluator_config):
375
- """Initialize models once per worker process."""
376
- global _worker_proc, _worker_evaluator, _worker_models, _worker_rag_engine, _worker_reranker
377
-
378
- from retriever.processor import ChunkProcessor
379
- from retriever.evaluator import RAGEvaluator
380
- from retriever.generator import RAGGenerator
381
- from sentence_transformers import CrossEncoder
382
- from models.llama_3_8b import Llama3_8B
383
- from models.mistral_7b import Mistral_7b
384
- from models.qwen_2_5 import Qwen2_5
385
- from models.deepseek_v3 import DeepSeek_V3
386
- from models.tiny_aya import TinyAya
387
-
388
- MODEL_MAP = {
389
- "Llama-3-8B": Llama3_8B,
390
- "Mistral-7B": Mistral_7b,
391
- "Qwen-2.5": Qwen2_5,
392
- "DeepSeek-V3": DeepSeek_V3,
393
- "TinyAya": TinyAya
394
- }
395
-
396
- # Load embedding model once
397
- _worker_proc = ChunkProcessor(model_name=model_name, verbose=False)
398
-
399
- # Initialize evaluator
400
- _worker_evaluator = RAGEvaluator(
401
- judge_model=evaluator_config['judge_model'],
402
- embedding_model=_worker_proc.encoder,
403
- api_key=evaluator_config['api_key']
404
  )
405
 
406
- # Initialize models
407
- hf_token = os.getenv("HF_TOKEN")
408
- _worker_models = {name: MODEL_MAP[name](token=hf_token) for name in evaluator_config['model_list']}
 
 
409
 
410
- # Initialize RAG engine
411
- _worker_rag_engine = RAGGenerator()
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
- # Load reranker once per worker
414
- _worker_reranker = CrossEncoder('jinaai/jina-reranker-v1-tiny-en')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
 
 
 
 
416
 
417
- def run_rag_for_technique_wrapper(args):
418
- """Wrapper function for parallel execution."""
419
- global _worker_proc, _worker_evaluator, _worker_models, _worker_rag_engine
 
 
 
 
 
 
 
420
 
421
- technique, query, index_name, pinecone_key = args
422
- try:
423
- # Create new connection in worker process
424
- from data.vector_db import get_index_by_name
425
- index = get_index_by_name(pinecone_key, index_name)
426
-
427
- return technique['name'], run_rag_for_technique(
428
- technique_name=technique['name'],
429
- query=query,
430
- index=index,
431
- encoder=_worker_proc.encoder,
432
- models=_worker_models,
433
- evaluator=_worker_evaluator,
434
- rag_engine=_worker_rag_engine
435
- )
436
- except Exception as e:
437
- import traceback
438
- print(f"\n✗ Error processing technique {technique['name']}: {e}")
439
- print(f"Full traceback:")
440
- traceback.print_exc()
441
- return technique['name'], {}
442
 
443
 
444
  def main():
@@ -458,8 +475,9 @@ def main():
458
  # Test queries
459
  test_queries = [
460
  "What is cognitive behavior therapy and how does it work?",
461
- "What are the common cognitive distortions in CBT?",
462
- "How does CBT help with anxiety and depression?"
 
463
  ]
464
 
465
  print("=" * 80)
@@ -478,122 +496,186 @@ def main():
478
  from data.vector_db import get_index_by_name
479
  index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
480
 
481
- print(f"\nChecking for existing index: {index_name}")
482
 
483
  try:
484
  # Try to connect to existing index
485
- print("Connecting to Pinecone...")
486
  existing_index = get_index_by_name(pinecone_key, index_name)
487
- print("Getting index stats...")
488
  stats = existing_index.describe_index_stats()
489
  existing_count = stats.get('total_vector_count', 0)
490
 
491
  if existing_count > 0:
492
- print(f"\n✓ Found existing index with {existing_count} vectors")
493
- print("Skipping ingestion - using existing data")
494
 
495
  # Initialize processor (this loads the embedding model)
496
- print("Loading embedding model for retrieval...")
 
 
 
 
497
  from retriever.processor import ChunkProcessor
 
 
 
 
 
498
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
 
 
 
499
  index = existing_index
500
  all_chunks = [] # Empty since we're using existing data
501
  final_chunks = []
502
- print("✓ Processor initialized")
503
  else:
504
- print("\nIndex exists but is empty. Running full ingestion...")
505
  all_chunks, final_chunks, proc, index = ingest_data()
506
  except Exception as e:
507
- print(f"\nIndex check failed: {e}")
508
- print("Running full ingestion...")
 
 
509
  all_chunks, final_chunks, proc, index = ingest_data()
510
 
511
  print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
512
 
513
- # Step 2: Initialize components
514
  print("\n" + "=" * 80)
515
- print("STEP 2: INITIALIZING COMPONENTS")
516
  print("=" * 80)
 
 
517
 
518
- # Initialize models
519
- print("\nInitializing models...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  rag_engine = RAGGenerator()
 
 
 
 
 
521
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
522
-
523
- # Initialize evaluator
524
- print("Initializing evaluator...")
525
- if not openrouter_key:
526
- raise RuntimeError("OPENROUTER_API_KEY not found in environment variables")
527
 
 
 
528
  evaluator = RAGEvaluator(
529
  judge_model=cfg.gen['judge_model'],
530
  embedding_model=proc.encoder,
531
  api_key=openrouter_key
532
  )
533
-
534
- # Step 3: Run RAG for all techniques in parallel for all queries
535
- print("\n" + "=" * 80)
536
- print("STEP 3: RUNNING RAG FOR ALL 6 TECHNIQUES (IN PARALLEL)")
537
- print("=" * 80)
538
-
539
- # Prepare arguments for parallel execution
540
- num_processes = min(cpu_count(), len(CHUNKING_TECHNIQUES))
541
- print(f"\nUsing {num_processes} parallel processes for {len(CHUNKING_TECHNIQUES)} techniques")
542
-
543
- # Run techniques in parallel for all queries
544
- evaluator_config = {
545
- 'judge_model': cfg.gen['judge_model'],
546
- 'api_key': openrouter_key,
547
- 'model_list': cfg.model_list
548
- }
549
 
550
  all_query_results = {}
551
 
552
  for query_idx, query in enumerate(test_queries):
553
  print(f"\n{'='*80}")
554
- print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
555
- print(f"Query: {query}")
556
  print(f"{'='*80}")
 
 
557
 
558
- with Pool(
559
- processes=num_processes,
560
- initializer=init_worker,
561
- initargs=(cfg.processing['embedding_model'], evaluator_config)
562
- ) as pool:
563
- args_list = [
564
- (technique, query, index_name, pinecone_key)
565
- for technique in CHUNKING_TECHNIQUES
566
- ]
567
- results_list = pool.map(run_rag_for_technique_wrapper, args_list)
568
-
569
- # Convert results to dictionary and store
570
- query_results = {name: results for name, results in results_list}
571
- all_query_results[query_idx] = query_results
572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  # Print quick summary for this query
574
  print(f"\n{'='*80}")
575
  print(f"QUERY {query_idx + 1} SUMMARY")
576
  print(f"{'='*80}")
577
- print(f"\n{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
578
- print("-" * 60)
579
 
580
- for technique_name, model_results in query_results.items():
581
  if model_results:
582
- avg_faith = sum(r.get('Faithfulness', 0) for r in model_results.values()) / len(model_results)
583
- avg_rel = sum(r.get('Relevancy', 0) for r in model_results.values()) / len(model_results)
 
 
 
 
584
 
585
  # Find best model
586
  best_model = max(
587
- model_results.items(),
588
  key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
589
  )
590
  best_name = best_model[0]
591
 
592
- print(f"{technique_name:<15} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_name:<20}")
593
  else:
594
- print(f"{technique_name:<15} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
595
 
596
- print("-" * 60)
597
 
598
  # Step 4: Generate findings document from all queries
599
  print("\n" + "=" * 80)
@@ -608,45 +690,63 @@ def main():
608
  print("=" * 80)
609
 
610
  print(f"\nQueries processed: {len(test_queries)}")
611
- print(f"Techniques evaluated: {len(CHUNKING_TECHNIQUES)}")
612
  print(f"Models tested: {len(cfg.model_list)}")
613
  print(f"\nFindings document: {findings_file}")
614
 
615
  # Print final summary across all queries
616
- print("\n" + "-" * 60)
617
- print(f"{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
618
- print("-" * 60)
 
 
 
 
 
619
 
620
- # Calculate averages across all queries
621
- for tech_config in CHUNKING_TECHNIQUES:
622
  tech_name = tech_config['name']
623
- all_faith = []
624
- all_rel = []
625
- best_model_name = None
626
- best_combined = 0
627
-
628
- for query_idx, query_results in all_query_results.items():
629
- if tech_name in query_results and query_results[tech_name]:
630
- model_results = query_results[tech_name]
631
- for model_name, results in model_results.items():
632
- faith = results.get('Faithfulness', 0)
633
- rel = results.get('Relevancy', 0)
634
- combined = faith + rel
635
- all_faith.append(faith)
636
- all_rel.append(rel)
637
 
638
- if combined > best_combined:
639
- best_combined = combined
640
- best_model_name = model_name
641
-
642
- if all_faith:
643
- avg_faith = sum(all_faith) / len(all_faith)
644
- avg_rel = sum(all_rel) / len(all_rel)
645
- print(f"{tech_name:<15} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_model_name or 'N/A':<20}")
646
- else:
647
- print(f"{tech_name:<15} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
 
649
- print("-" * 60)
650
 
651
  print("\n✓ Ablation study complete!")
652
  print(f"✓ Results saved to: {findings_file}")
 
2
  import json
3
  import time
4
  from datetime import datetime
 
 
5
  from dotenv import load_dotenv
6
  from config_loader import cfg
7
 
 
16
  # Import model fleet
17
  from models.llama_3_8b import Llama3_8B
18
  from models.mistral_7b import Mistral_7b
 
 
19
  from models.tiny_aya import TinyAya
20
 
21
  MODEL_MAP = {
22
  "Llama-3-8B": Llama3_8B,
23
  "Mistral-7B": Mistral_7b,
 
 
24
  "TinyAya": TinyAya
25
  }
26
 
27
  load_dotenv()
28
 
29
 
30
+ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
31
+ """Run RAG pipeline for a specific chunking technique and retrieval strategy."""
32
+
33
+ mode = retrieval_strategy['mode']
34
+ use_mmr = retrieval_strategy['use_mmr']
35
+ strategy_label = retrieval_strategy['label']
36
 
37
  print(f"\n{'='*80}")
38
+ print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
39
  print(f"{'='*80}")
40
 
41
+ # Use HybridRetriever to retrieve chunks
42
+ context_chunks, chunk_score = retriever.search(
43
+ query=query,
44
+ index=index,
45
+ mode=mode,
46
+ rerank_strategy="cross-encoder",
47
+ use_mmr=use_mmr,
48
+ top_k=50,
49
+ final_k=5,
50
+ technique_name=technique_name,
51
+ verbose=False
52
  )
53
 
54
+ print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
 
 
 
 
 
55
 
56
+ if not context_chunks:
 
 
57
  print(f"WARNING: No chunks found for technique '{technique_name}'")
58
  return {}
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Print the final RAG context being passed to models (only once)
61
  print(f"\n{'='*80}")
62
  print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
 
70
 
71
  # Run model tournament for this technique
72
  tournament_results = {}
73
+ tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
74
+ tournament_results["_Strategy"] = strategy_label
75
 
76
  for name, model_inst in models.items():
77
  print(f"\n{'-'*60}")
 
81
  # Generation
82
  answer = rag_engine.get_answer(
83
  model_inst, query, context_chunks,
 
84
  temperature=cfg.gen['temperature']
85
  )
86
 
 
101
  "Relevancy": rel['score'],
102
  "Claims": faith['details'],
103
  "context_chunks": context_chunks,
 
104
  }
105
 
106
  print(f"\n📊 EVALUATION SCORES:")
 
117
  "Claims": [],
118
  "error": str(e),
119
  "context_chunks": context_chunks,
 
120
  }
121
 
122
  return tournament_results
 
166
 
167
  # Aggregate results across all queries
168
  aggregated_results = {}
169
+ chunk_scores_by_query_technique = {} # Store ChunkScore per query+technique
170
 
171
  for query_idx, query_results in all_query_results.items():
172
  for technique_name, model_results in query_results.items():
173
  if technique_name not in aggregated_results:
174
  aggregated_results[technique_name] = {}
175
 
176
+ # Extract ChunkScore (stored at technique level, not per model)
177
+ chunk_score = model_results.get('_ChunkScore', 0)
178
+ chunk_scores_by_query_technique[(query_idx, technique_name)] = chunk_score
179
+
180
  for model_name, results in model_results.items():
181
+ if model_name.startswith('_'):
182
+ continue # Skip metadata keys like _ChunkScore
183
+
184
  if model_name not in aggregated_results[technique_name]:
185
  aggregated_results[technique_name][model_name] = {
186
  'Faithfulness': [],
 
202
  content += "No results available for this technique.\n\n"
203
  continue
204
 
205
+ # Show ChunkScore per query for this technique
206
+ content += "#### Chunk Retrieval Scores (ChunkScore)\n\n"
207
+ content += "| Query | Avg ChunkScore |\n"
208
+ content += "|-------|---------------|\n"
209
+ for q_idx in range(len(queries)):
210
+ score = chunk_scores_by_query_technique.get((q_idx, technique_name), 0)
211
+ content += f"| {q_idx + 1} | {score:.4f} |\n"
212
+ content += "\n"
213
+
214
  # Create results table with averaged scores
215
  content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
216
  content += "|-------|------------------|---------------|--------------|\n"
 
346
  - *Embedding Model:* Jina embeddings (512 dimensions)
347
  - *Vector Database:* Pinecone (serverless, AWS us-east-1)
348
  - *Judge Model:* Openrouter Free models
349
+ - *Retrieval:* Top 5 chunks per technique (final_k=5)
350
+ - *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing), ChunkScore (reranker confidence)
351
 
352
  ---
353
 
 
362
  return output_file
363
 
364
 
365
+ def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
366
+ """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
367
+
368
+ mode = retrieval_strategy['mode']
369
+ use_mmr = retrieval_strategy['use_mmr']
370
+ strategy_label = retrieval_strategy['label']
371
+
372
+ print(f"\n{'='*80}")
373
+ print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
374
+ print(f"{'='*80}")
375
+
376
+ # Use HybridRetriever to retrieve chunks
377
+ context_chunks, chunk_score = retriever.search(
378
+ query=query,
379
+ index=index,
380
+ mode=mode,
381
+ rerank_strategy="cross-encoder",
382
+ use_mmr=use_mmr,
383
+ top_k=50,
384
+ final_k=5,
385
+ technique_name=technique_name,
386
+ verbose=False,
387
+ test=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  )
389
 
390
+ print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
391
+
392
+ if not context_chunks:
393
+ print(f"WARNING: No chunks found for technique '{technique_name}'")
394
+ return {}
395
 
396
+ # Print the final RAG context being passed to models (only once)
397
+ print(f"\n{'='*80}")
398
+ print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
399
+ print(f"{'='*80}")
400
+ for i, chunk in enumerate(context_chunks, 1):
401
+ print(f"\n[Chunk {i}] ({len(chunk)} chars):")
402
+ print(f"{'─'*60}")
403
+ print(chunk)
404
+ print(f"{'─'*60}")
405
+ print(f"\n{'='*80}")
406
+
407
+ # Run model tournament for this technique
408
+ tournament_results = {}
409
+ tournament_results["_ChunkScore"] = chunk_score
410
+ tournament_results["_Strategy"] = strategy_label
411
 
412
+ for name, model_inst in models.items():
413
+ print(f"\n{'-'*60}")
414
+ print(f"Model: {name}")
415
+ print(f"{'-'*60}")
416
+ try:
417
+ # Generation
418
+ answer = rag_engine.get_answer(
419
+ model_inst, query, context_chunks,
420
+ temperature=cfg.gen['temperature']
421
+ )
422
+
423
+ print(f"\n{'─'*60}")
424
+ print(f"📝 FULL ANSWER from {name}:")
425
+ print(f"{'─'*60}")
426
+ print(answer)
427
+ print(f"{'─'*60}")
428
+
429
+ # Faithfulness Evaluation (strict=False reduces API calls from ~22 to ~3 per eval)
430
+ faith = evaluator.evaluate_faithfulness(answer, context_chunks, strict=False)
431
+ # Relevancy Evaluation
432
+ rel = evaluator.evaluate_relevancy(query, answer)
433
+
434
+ tournament_results[name] = {
435
+ "answer": answer,
436
+ "Faithfulness": faith['score'],
437
+ "Relevancy": rel['score'],
438
+ "Claims": faith['details'],
439
+ "context_chunks": context_chunks,
440
+ }
441
 
442
+ print(f"\n📊 EVALUATION SCORES:")
443
+ print(f" Faithfulness: {faith['score']:.1f}%")
444
+ print(f" Relevancy: {rel['score']:.3f}")
445
+ print(f" Combined: {faith['score'] + rel['score']:.3f}")
446
 
447
+ except Exception as e:
448
+ print(f" Error evaluating {name}: {e}")
449
+ tournament_results[name] = {
450
+ "answer": "",
451
+ "Faithfulness": 0,
452
+ "Relevancy": 0,
453
+ "Claims": [],
454
+ "error": str(e),
455
+ "context_chunks": context_chunks,
456
+ }
457
 
458
+ return tournament_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
 
461
  def main():
 
475
  # Test queries
476
  test_queries = [
477
  "What is cognitive behavior therapy and how does it work?",
478
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
479
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
480
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
481
  ]
482
 
483
  print("=" * 80)
 
496
  from data.vector_db import get_index_by_name
497
  index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
498
 
499
+ print(f"\n[DEBUG] Checking for existing index: {index_name}")
500
 
501
  try:
502
  # Try to connect to existing index
503
+ print("[DEBUG] Connecting to Pinecone...")
504
  existing_index = get_index_by_name(pinecone_key, index_name)
505
+ print("[DEBUG] Getting index stats...")
506
  stats = existing_index.describe_index_stats()
507
  existing_count = stats.get('total_vector_count', 0)
508
 
509
  if existing_count > 0:
510
+ print(f"\n[DEBUG] ✓ Found existing index with {existing_count} vectors")
511
+ print("[DEBUG] Skipping ingestion - using existing data")
512
 
513
  # Initialize processor (this loads the embedding model)
514
+ print("[DEBUG] About to load embedding model...")
515
+ print(f"[DEBUG] Model: {cfg.processing['embedding_model']}")
516
+ import sys
517
+ sys.stdout.flush()
518
+
519
  from retriever.processor import ChunkProcessor
520
+ print("[DEBUG] ChunkProcessor imported successfully")
521
+ sys.stdout.flush()
522
+
523
+ print("[DEBUG] Creating ChunkProcessor instance...")
524
+ sys.stdout.flush()
525
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
526
+ print("[DEBUG] ChunkProcessor created successfully")
527
+ sys.stdout.flush()
528
+
529
  index = existing_index
530
  all_chunks = [] # Empty since we're using existing data
531
  final_chunks = []
532
+ print("[DEBUG] ✓ Processor initialized")
533
  else:
534
+ print("\n[DEBUG] Index exists but is empty. Running full ingestion...")
535
  all_chunks, final_chunks, proc, index = ingest_data()
536
  except Exception as e:
537
+ print(f"\n[DEBUG] Index check failed: {e}")
538
+ import traceback
539
+ traceback.print_exc()
540
+ print("[DEBUG] Running full ingestion...")
541
  all_chunks, final_chunks, proc, index = ingest_data()
542
 
543
  print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
544
 
545
+ # Step 2: Components will be initialized in Step 3 (shared across all sequential runs)
546
  print("\n" + "=" * 80)
547
+ print("[DEBUG] STEP 2: PREPARING FOR SEQUENTIAL EXECUTION")
548
  print("=" * 80)
549
+ print(f"[DEBUG] Techniques to evaluate: {[t['name'] for t in CHUNKING_TECHNIQUES]}")
550
+ # print(f"[DEBUG] Filtered techniques: {TECHNIQUES_TO_EVALUATE}")
551
 
552
+ # Define retrieval strategies to test
553
+ RETRIEVAL_STRATEGIES = [
554
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
555
+ ]
556
+
557
+ # Filter to only 4 techniques to reduce memory usage
558
+ TECHNIQUES_TO_EVALUATE = ["markdown", "recursive", "paragraph"]
559
+ CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
560
+
561
+ # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
562
+ print("\n" + "=" * 80)
563
+ print(f"STEP 3: RUNNING RAG FOR {len(CHUNKING_TECHNIQUES_FILTERED)} TECHNIQUES x {len(RETRIEVAL_STRATEGIES)} STRATEGIES (SEQUENTIAL)")
564
+ print("=" * 80)
565
+ print(f"\nTechniques: {TECHNIQUES_TO_EVALUATE}")
566
+ print(f"\nRetrieval Strategies:")
567
+ for i, strat in enumerate(RETRIEVAL_STRATEGIES, 1):
568
+ mmr_status = "with MMR" if strat['use_mmr'] else "no MMR"
569
+ print(f" {i}. {strat['label']}: mode={strat['mode']}, {mmr_status}")
570
+
571
+ # Initialize components once (shared across all sequential runs)
572
+ print("\n[DEBUG] Initializing components...")
573
+ import sys
574
+ sys.stdout.flush()
575
+
576
+ print("[DEBUG] Creating RAGGenerator...")
577
+ sys.stdout.flush()
578
  rag_engine = RAGGenerator()
579
+ print("[DEBUG] RAGGenerator created")
580
+ sys.stdout.flush()
581
+
582
+ print(f"[DEBUG] Loading models: {cfg.model_list}")
583
+ sys.stdout.flush()
584
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
585
+ print("[DEBUG] Models loaded successfully")
586
+ sys.stdout.flush()
 
 
 
587
 
588
+ print("[DEBUG] Creating RAGEvaluator...")
589
+ sys.stdout.flush()
590
  evaluator = RAGEvaluator(
591
  judge_model=cfg.gen['judge_model'],
592
  embedding_model=proc.encoder,
593
  api_key=openrouter_key
594
  )
595
+ print("[DEBUG] RAGEvaluator created")
596
+ sys.stdout.flush()
597
+
598
+ print("[DEBUG] Creating HybridRetriever...")
599
+ sys.stdout.flush()
600
+ retriever = HybridRetriever(
601
+ embed_model=proc.encoder,
602
+ rerank_model_name='rerank-2.5',
603
+ verbose=False
604
+ )
605
+ print("[DEBUG] HybridRetriever created")
606
+ sys.stdout.flush()
607
+
608
+ print("[DEBUG] All components initialized successfully.\n")
 
 
609
 
610
  all_query_results = {}
611
 
612
  for query_idx, query in enumerate(test_queries):
613
  print(f"\n{'='*80}")
614
+ print(f"[DEBUG] PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
615
+ print(f"[DEBUG] Query: {query}")
616
  print(f"{'='*80}")
617
+ import sys
618
+ sys.stdout.flush()
619
 
620
+ query_results = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
+ for technique in CHUNKING_TECHNIQUES_FILTERED:
623
+ for strategy in RETRIEVAL_STRATEGIES:
624
+ result_key = f"{technique['name']}__{strategy['label']}"
625
+ print(f"\n[DEBUG] Processing: {result_key}")
626
+ sys.stdout.flush()
627
+
628
+ try:
629
+ result = run_rag_for_technique_sequential(
630
+ technique_name=technique['name'],
631
+ query=query,
632
+ index=index,
633
+ encoder=proc.encoder,
634
+ models=models,
635
+ evaluator=evaluator,
636
+ rag_engine=rag_engine,
637
+ retriever=retriever,
638
+ retrieval_strategy=strategy
639
+ )
640
+ print(f"[DEBUG] Result for {result_key}: {len(result)} keys")
641
+ query_results[result_key] = result
642
+ except Exception as e:
643
+ import traceback
644
+ print(f"\n[DEBUG] ✗ Error processing {result_key}: {e}")
645
+ traceback.print_exc()
646
+ sys.stdout.flush()
647
+ query_results[result_key] = {}
648
+
649
+ all_query_results[query_idx] = query_results
650
+
651
  # Print quick summary for this query
652
  print(f"\n{'='*80}")
653
  print(f"QUERY {query_idx + 1} SUMMARY")
654
  print(f"{'='*80}")
655
+ print(f"\n{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
656
+ print("-" * 92)
657
 
658
+ for result_key, model_results in query_results.items():
659
  if model_results:
660
+ chunk_score = model_results.get('_ChunkScore', 0)
661
+ strategy = model_results.get('_Strategy', '')
662
+ # Exclude _ChunkScore and _Strategy from model averaging
663
+ model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
664
+ avg_faith = sum(r.get('Faithfulness', 0) for r in model_only.values()) / len(model_only) if model_only else 0
665
+ avg_rel = sum(r.get('Relevancy', 0) for r in model_only.values()) / len(model_only) if model_only else 0
666
 
667
  # Find best model
668
  best_model = max(
669
+ model_only.items(),
670
  key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
671
  )
672
  best_name = best_model[0]
673
 
674
+ print(f"{result_key:<15} {strategy:<20} {chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_name:<20}")
675
  else:
676
+ print(f"{result_key:<15} {'':<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
677
 
678
+ print("-" * 92)
679
 
680
  # Step 4: Generate findings document from all queries
681
  print("\n" + "=" * 80)
 
690
  print("=" * 80)
691
 
692
  print(f"\nQueries processed: {len(test_queries)}")
693
+ print(f"Techniques evaluated: {len(CHUNKING_TECHNIQUES_FILTERED)} ({TECHNIQUES_TO_EVALUATE})")
694
  print(f"Models tested: {len(cfg.model_list)}")
695
  print(f"\nFindings document: {findings_file}")
696
 
697
  # Print final summary across all queries
698
+ print("\n" + "-" * 92)
699
+ print(f"{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
700
+ print("-" * 92)
701
+
702
+ # Define retrieval strategies (same as above)
703
+ RETRIEVAL_STRATEGIES = [
704
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
705
+ ]
706
 
707
+ # Calculate averages across all queries for each technique x strategy
708
+ for tech_config in CHUNKING_TECHNIQUES_FILTERED:
709
  tech_name = tech_config['name']
710
+ for strategy in RETRIEVAL_STRATEGIES:
711
+ strategy_label = strategy['label']
712
+ result_key = f"{tech_name}__{strategy_label}"
713
+
714
+ all_faith = []
715
+ all_rel = []
716
+ all_chunk_scores = []
717
+ best_model_name = None
718
+ best_combined = 0
719
+
720
+ for query_idx, query_results in all_query_results.items():
721
+ if result_key in query_results and query_results[result_key]:
722
+ model_results = query_results[result_key]
 
723
 
724
+ # Extract ChunkScore
725
+ chunk_score = model_results.get('_ChunkScore', 0)
726
+ all_chunk_scores.append(chunk_score)
727
+
728
+ # Exclude _ChunkScore and _Strategy from model averaging
729
+ model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
730
+ for model_name, results in model_only.items():
731
+ faith = results.get('Faithfulness', 0)
732
+ rel = results.get('Relevancy', 0)
733
+ combined = faith + rel
734
+ all_faith.append(faith)
735
+ all_rel.append(rel)
736
+
737
+ if combined > best_combined:
738
+ best_combined = combined
739
+ best_model_name = model_name
740
+
741
+ if all_faith:
742
+ avg_faith = sum(all_faith) / len(all_faith)
743
+ avg_rel = sum(all_rel) / len(all_rel)
744
+ avg_chunk_score = sum(all_chunk_scores) / len(all_chunk_scores) if all_chunk_scores else 0
745
+ print(f"{tech_name:<15} {strategy_label:<20} {avg_chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_model_name or 'N/A':<20}")
746
+ else:
747
+ print(f"{tech_name:<15} {strategy_label:<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
748
 
749
+ print("-" * 92)
750
 
751
  print("\n✓ Ablation study complete!")
752
  print(f"✓ Results saved to: {findings_file}")
requirements.txt CHANGED
@@ -95,3 +95,6 @@ zstandard==0.25.0
95
  groq==1.1.2
96
  jiter==0.13.0
97
  openai==2.30.0
 
 
 
 
95
  groq==1.1.2
96
  jiter==0.13.0
97
  openai==2.30.0
98
+ pinecone-text>=0.11.0
99
+ voyageai==0.3.7
100
+
retriever/evaluator.py CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
10
  # ------------------------------------------------------------------
11
 
12
  class GroqJudge:
13
- def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.2",):
14
  """
15
  Wraps OpenRouter's chat completions to match the .generate(prompt) interface
16
  expected by RAGEvaluator.
@@ -27,8 +27,6 @@ class GroqJudge:
27
 
28
  # Fallback models in order of preference (OpenRouter free models)
29
  self.fallback_models = [
30
- "deepseek/deepseek-v3.2",
31
- "qwen/qwen3.6-plus-preview:free",
32
  "stepfun/step-3.5-flash:free",
33
  "nvidia/nemotron-3-super-120b-a12b:free",
34
  "z-ai/glm-4.5-air:free",
@@ -228,7 +226,10 @@ class RAGEvaluator:
228
  return {"score": 0, "queries": []}
229
 
230
  # --- Step B: Similarity (single batched encode call) ---
231
- all_vecs = self.encoder.encode([query] + gen_queries)
 
 
 
232
  original_vec = all_vecs[0:1]
233
  generated_vecs = all_vecs[1:]
234
 
 
10
  # ------------------------------------------------------------------
11
 
12
  class GroqJudge:
13
+ def __init__(self, api_key: str, model: str = "stepfun/step-3.5-flash:free"):
14
  """
15
  Wraps OpenRouter's chat completions to match the .generate(prompt) interface
16
  expected by RAGEvaluator.
 
27
 
28
  # Fallback models in order of preference (OpenRouter free models)
29
  self.fallback_models = [
 
 
30
  "stepfun/step-3.5-flash:free",
31
  "nvidia/nemotron-3-super-120b-a12b:free",
32
  "z-ai/glm-4.5-air:free",
 
226
  return {"score": 0, "queries": []}
227
 
228
  # --- Step B: Similarity (single batched encode call) ---
229
+ try:
230
+ all_vecs = self.encoder.encode([query] + gen_queries)
231
+ except AttributeError:
232
+ all_vecs = np.array([self.encoder.encode(text) for text in [query] + gen_queries])
233
  original_vec = all_vecs[0:1]
234
  generated_vecs = all_vecs[1:]
235
 
retriever/generator.py CHANGED
@@ -8,21 +8,22 @@ class RAGGenerator:
8
  else:
9
  context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
10
 
11
- return f"""You are a specialized Cognitive Behavioral Therapy (CBT) assistant. Your task is to provide accurate, clinical, and structured answers based ONLY on the provided textbook excerpts.
12
 
13
  INSTRUCTIONS:
14
- 1. Use the provided Sources to answer the question.
15
- 2. CITATIONS: You must cite the sources used in your answer (e.g., "CBT is based on the cognitive model [Source 1]").
16
- 3. FORMAT: Use clear headers and bullet points for complex explanations.
17
- 4. GROUNDING: If the sources do not contain the answer, explicitly state: "The provided excerpts from the textbook do not contain information to answer this specific question." Do not use your own internal knowledge.
18
- 5. TONE: Maintain a professional, empathetic, and academic tone.
19
-
20
- RETRIVED TEXTBOOK CONTEXT:
 
21
  {context_text}
22
 
23
- USER QUESTION: {query}
24
 
25
- ACADEMIC ANSWER (WITH CITATIONS):"""
26
 
27
  def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
28
  """Uses a specific model instance to generate the final answer."""
@@ -42,4 +43,4 @@ ACADEMIC ANSWER (WITH CITATIONS):"""
42
  # Fallback for model wrappers that only expose sync generation.
43
  answer = model_instance.generate(prompt, **kwargs)
44
  if answer:
45
- yield answer
 
8
  else:
9
  context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
10
 
11
+ return f"""You are an empathetic Cognitive Behavioral Therapy (CBT) therapist speaking directly to a client. **Your task is to provide a therapeutic, helpful response based ONLY on the provided clinical documents and excerpts**.
12
 
13
  INSTRUCTIONS:
14
+ 1. THERAPEUTIC DIALOGUE: Respond directly to the user as your client. Start by briefly validating their feelings, then gently apply CBT concepts, psychoeducation, or interventions found STRICTLY in the provided documents.
15
+ 2. PATIENT EXAMPLES & NAMES (CRITICAL): The provided documents contain transcripts and examples of other patients and therapists (e.g., Abe, Judith, Joseph). These are illustrative case studies ONLY. DO NOT assume the user is "Abe" or any other person mentioned in the text. NEVER address or refer to the user by these names. Extract the CBT concepts/techniques demonstrated in these transcripts and apply them to the current user's unique situation.
16
+ 3. GROUNDING (NO OPINIONS): Do not give your own opinions, general life advice, or use outside knowledge. Every therapeutic concept, identified cognitive distortion, or suggested exercise must come directly from the provided text.
17
+ 4. CITATIONS: You must cite the sources used in your response to show where the clinical guidance comes from (e.g., "It sounds like you might be experiencing what is known as 'all-or-nothing thinking' [Source 1]").
18
+ 5. FORMAT: Use clear Markdown formatting. Use paragraphs for conversational tone, and bullet points if you are breaking down specific steps, questions, or exercises found in the text.
19
+ 6. MISSING INFO: If the provided excerpts do not contain relevant CBT concepts to address the client's specific statement, explicitly state: "While I hear how difficult this is for you, the clinical materials I have right now do not contain specific steps to address this." Do not invent therapeutic advice.
20
+
21
+ RETRIEVED CLINICAL CONTEXT:
22
  {context_text}
23
 
24
+ CLIENT STATEMENT: {query}
25
 
26
+ THERAPEUTIC RESPONSE (GROUNDED IN SOURCES):"""
27
 
28
  def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
29
  """Uses a specific model instance to generate the final answer."""
 
43
  # Fallback for model wrappers that only expose sync generation.
44
  answer = model_instance.generate(prompt, **kwargs)
45
  if answer:
46
+ yield answer
retriever/processor.py CHANGED
@@ -88,14 +88,72 @@ class MarkdownTextSplitter:
88
 
89
 
90
  class ChunkProcessor:
91
- def __init__(self, model_name='all-MiniLM-L6-v2', verbose: bool = True, load_hf_embeddings: bool = False):
 
 
 
 
 
 
 
 
 
92
  self.model_name = model_name
 
93
  self._use_remote_code = self._requires_remote_code(model_name)
 
94
  st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
95
- self.encoder = SentenceTransformer(model_name, **st_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  self.verbose = verbose
97
  hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
98
  self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
 
99
 
100
  def _requires_remote_code(self, model_name: str) -> bool:
101
  normalized = (model_name or "").strip().lower()
 
88
 
89
 
90
  class ChunkProcessor:
91
+ def __init__(self, model_name='jinaai/jina-embeddings-v2-small-en', verbose: bool = True, load_hf_embeddings: bool = False):
92
+ import sys
93
+ import os
94
+
95
+ # Set environment variables to limit memory usage BEFORE importing torch
96
+ os.environ["OMP_NUM_THREADS"] = "2"
97
+ os.environ["MKL_NUM_THREADS"] = "2"
98
+ os.environ["OPENBLAS_NUM_THREADS"] = "2"
99
+
100
+ print(f"[DEBUG-ChunkProcessor] Starting init with model: {model_name}", flush=True)
101
  self.model_name = model_name
102
+ print(f"[DEBUG-ChunkProcessor] Checking if remote code needed...", flush=True)
103
  self._use_remote_code = self._requires_remote_code(model_name)
104
+ print(f"[DEBUG-ChunkProcessor] Remote code needed: {self._use_remote_code}", flush=True)
105
  st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
106
+
107
+ # Set torch threads to limit parallelism
108
+ import torch
109
+ torch.set_num_threads(2)
110
+ torch.set_num_interop_threads(2)
111
+ print(f"[DEBUG-ChunkProcessor] Torch threads set to 2", flush=True)
112
+
113
+ print(f"[DEBUG-ChunkProcessor] Loading SentenceTransformer with kwargs: {st_kwargs}", flush=True)
114
+ sys.stdout.flush()
115
+
116
+ # Do not force CPU when CUDA is available; let SentenceTransformer choose the device (or pass an explicit device map).
117
+ import torch
118
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
119
+ import numpy as np
120
+ try:
121
+ if self._use_remote_code:
122
+ print("[DEBUG-ChunkProcessor] Using HuggingFaceEmbeddings-based encoder for remote model", flush=True)
123
+ hf_kwargs = {"model_kwargs": {"trust_remote_code": True}}
124
+ hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
125
+
126
+ class HFEncoderShim:
127
+ def __init__(self, hf_client):
128
+ self._hf = hf_client
129
+ def encode(self, text: str):
130
+ vecs = self._hf.embed_documents([text])
131
+ return np.array(vecs[0], dtype=float)
132
+
133
+ self.encoder = HFEncoderShim(hf)
134
+ self.hf_embeddings = hf
135
+ else:
136
+ device = "cuda" if torch.cuda.is_available() else "cpu"
137
+ self.encoder = SentenceTransformer(model_name, device=device, **st_kwargs)
138
+ print("[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
139
+ except Exception as e:
140
+ print(f"[DEBUG-ChunkProcessor] encoder init failed: {e}. Falling back to HuggingFaceEmbeddings.", flush=True)
141
+ hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
142
+ hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
143
+ class HFEncoderShim:
144
+ def __init__(self, hf_client):
145
+ self._hf = hf_client
146
+ def encode(self, text: str):
147
+ vecs = self._hf.embed_documents([text])
148
+ return np.array(vecs[0], dtype=float)
149
+ self.encoder = HFEncoderShim(hf)
150
+ self.hf_embeddings = hf
151
+ print(f"[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
152
+ sys.stdout.flush()
153
  self.verbose = verbose
154
  hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
155
  self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
156
+ print(f"[DEBUG-ChunkProcessor] ChunkProcessor init complete", flush=True)
157
 
158
  def _requires_remote_code(self, model_name: str) -> bool:
159
  normalized = (model_name or "").strip().lower()
retriever/retriever.py CHANGED
@@ -5,41 +5,58 @@ from rank_bm25 import BM25Okapi
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from typing import Optional, List
7
 
 
 
8
  # changed mmr to return final k, as a param, prev was hardcoded to 3
9
  # --@Qamare
10
 
11
  # Try to import FlashRank for CPU optimization, fallback to sentence-transformers
12
- try:
13
- from flashrank import Ranker, RerankRequest
14
- FLASHRANK_AVAILABLE = True
15
- except ImportError:
16
- from sentence_transformers import CrossEncoder
17
- FLASHRANK_AVAILABLE = False
18
 
19
  class HybridRetriever:
20
- def __init__(self, final_chunks, embed_model, rerank_model_name='jinaai/jina-reranker-v1-tiny-en', verbose: bool = True):
21
- self.final_chunks = final_chunks
 
 
22
  self.embed_model = embed_model
23
  self.verbose = verbose
24
  self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
25
-
26
- # Use FlashRank if available (faster on CPU), otherwise fallback to sentence-transformers
27
- if FLASHRANK_AVAILABLE:
 
 
 
 
 
28
  try:
29
- self.rerank_model = Ranker(model_name=self.rerank_model_name)
30
- self.use_flashrank = True
31
- except Exception:
32
- from sentence_transformers import CrossEncoder as STCrossEncoder
33
- self.rerank_model = STCrossEncoder(self.rerank_model_name)
34
- self.use_flashrank = False
35
- else:
36
- self.rerank_model = CrossEncoder(self.rerank_model_name)
37
- self.use_flashrank = False
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Better tokenization for BM25 (strips punctuation)
40
- self.tokenized_corpus = [self._tokenize(chunk['metadata']['text']) for chunk in final_chunks]
41
- self.bm25 = BM25Okapi(self.tokenized_corpus)
42
- self.technique_to_indices = self._build_chunking_index_map()
43
 
44
  def _normalize_rerank_model_name(self, model_name: str) -> str:
45
  normalized = (model_name or "").strip()
@@ -76,32 +93,70 @@ class HybridRetriever:
76
  # Retrieval
77
  # ------------------------------------------------------------------
78
 
79
- def _semantic_search(self, query, index, top_k, chunking_technique: Optional[str] = None) -> tuple[np.ndarray, List[str]]:
80
  query_vector = self.embed_model.encode(query)
81
  query_kwargs = {
82
  "vector": query_vector.tolist(),
83
  "top_k": top_k,
84
  "include_metadata": True,
85
  }
86
- if chunking_technique:
87
- query_kwargs["filter"] = {"chunking_technique": {"$eq": chunking_technique}}
88
- res = index.query(**query_kwargs)
 
 
 
89
  chunks = [match['metadata']['text'] for match in res['matches']]
90
  return query_vector, chunks
91
 
92
- def _bm25_search(self, query, top_k, chunking_technique: Optional[str] = None) -> List[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  tokenized_query = self._tokenize(query)
94
- scores = self.bm25.get_scores(tokenized_query)
95
-
96
- if chunking_technique:
97
- candidate_indices = self.technique_to_indices.get(chunking_technique, [])
98
- if not candidate_indices:
99
- return []
100
- top_indices = sorted(candidate_indices, key=lambda i: scores[i], reverse=True)[:top_k]
101
- else:
102
- top_indices = np.argsort(scores)[::-1][:top_k]
103
-
104
- return [self.final_chunks[i]['metadata']['text'] for i in top_indices]
105
 
106
  # ------------------------------------------------------------------
107
  # Fusion
@@ -119,20 +174,22 @@ class HybridRetriever:
119
  # Reranking
120
  # ------------------------------------------------------------------
121
 
122
- def _cross_encoder_rerank(self, query, chunks, final_k) -> List[str]:
123
- if self.use_flashrank:
124
- # Use FlashRank for CPU-optimized reranking
125
- passages = [{"id": i, "text": chunk} for i, chunk in enumerate(chunks)]
126
- rerank_request = RerankRequest(query=query, passages=passages)
127
- results = self.rerank_model.rerank(rerank_request)
128
- ranked_chunks = [res['text'] for res in results]
129
- return ranked_chunks[:final_k]
130
- else:
131
- # Fallback to sentence-transformers CrossEncoder
132
- pairs = [[query, chunk] for chunk in chunks]
133
- scores = self.rerank_model.predict(pairs)
134
- ranked = sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
135
- return [chunk for chunk, _ in ranked[:final_k]]
 
 
136
 
137
  # ------------------------------------------------------------------
138
  # MMR (applied after reranking as a diversity filter)
@@ -155,7 +212,7 @@ class HybridRetriever:
155
  # STEP 1: Encode chunks to get embeddings
156
  print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
157
  try:
158
- chunk_embeddings = self.embed_model.encode(chunks)
159
  print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
160
  except Exception as e:
161
  print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
@@ -246,24 +303,30 @@ class HybridRetriever:
246
  # Main search
247
  # ------------------------------------------------------------------
248
 
249
- def search(self, query, index, top_k=25, final_k=5, mode="hybrid",
250
- chunking_technique: Optional[str] = None,
251
  rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
252
- verbose: Optional[bool] = None) -> List[str]:
 
 
253
  """
254
  :param mode: "semantic", "bm25", or "hybrid"
255
  :param rerank_strategy: "cross-encoder", "rrf", or "none"
256
  :param use_mmr: Whether to apply MMR diversity filter after reranking
257
  :param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
 
 
258
  """
259
  should_print = verbose if verbose is not None else self.verbose
260
- requested_technique = self._normalize_chunking_technique(chunking_technique)
261
  total_start = time.perf_counter()
262
  semantic_time = 0.0
263
  bm25_time = 0.0
264
  rerank_time = 0.0
265
  mmr_time = 0.0
266
 
 
 
 
267
  if should_print:
268
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
269
  if requested_technique:
@@ -283,25 +346,32 @@ class HybridRetriever:
283
 
284
  if mode in ["bm25", "hybrid"]:
285
  bm25_start = time.perf_counter()
286
- bm25_chunks = self._bm25_search(query, top_k, requested_technique)
287
  bm25_time = time.perf_counter() - bm25_start
288
  if should_print:
289
  self._print_candidates("BM25 Search", bm25_chunks)
290
  print(f"BM25 time: {bm25_time:.3f}s")
 
 
 
291
 
292
  # 2. Fuse / rerank
293
  rerank_start = time.perf_counter()
 
294
  if rerank_strategy == "rrf":
295
  candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
296
  label = "RRF"
297
  elif rerank_strategy == "cross-encoder":
298
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
299
- candidates = self._cross_encoder_rerank(query, combined, final_k)
300
  label = "Cross-Encoder"
301
  else: # "none"
302
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
303
  label = "No Reranking"
304
  rerank_time = time.perf_counter() - rerank_start
 
 
 
305
 
306
  # 3. MMR diversity filter (applied after reranking)
307
  if use_mmr and candidates:
@@ -313,13 +383,17 @@ class HybridRetriever:
313
  label += " + MMR"
314
  mmr_time = time.perf_counter() - mmr_start
315
 
 
 
 
 
316
  total_time = time.perf_counter() - total_start
317
 
318
  if should_print:
319
  self._print_final_results(candidates, label)
320
  self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
321
 
322
- return candidates
323
 
324
  # ------------------------------------------------------------------
325
  # Printing
 
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from typing import Optional, List
7
 
8
+ #
9
+
10
  # changed mmr to return final k, as a param, prev was hardcoded to 3
11
  # --@Qamare
12
 
13
  # Try to import FlashRank for CPU optimization, fallback to sentence-transformers
14
+ # try:
15
+ # from flashrank import Ranker, RerankRequest
16
+ # FLASHRANK_AVAILABLE = True
17
+ # except ImportError:
18
+ # from sentence_transformers import CrossEncoder
19
+ # FLASHRANK_AVAILABLE = False
20
 
21
  class HybridRetriever:
22
def __init__(self, embed_model, rerank_model_name='jinaai/jina-reranker-v1-tiny-en', verbose: bool = True):
    """Hybrid (dense + sparse) retriever with a pluggable reranker backend.

    Prefers the hosted Voyage AI reranker when ``VOYAGE_API_KEY`` is set in
    the environment; otherwise falls back to a local sentence-transformers
    CrossEncoder.

    :param embed_model: Encoder exposing ``encode(text)`` (used by
        ``_semantic_search``); assumed to return a numpy vector — TODO confirm.
    :param rerank_model_name: Requested reranker model name; it is
        normalized and may be replaced by a backend-appropriate default.
    :param verbose: Default verbosity used by ``search`` when its
        ``verbose`` argument is None.
    """
    import sys
    import os
    print("[DEBUG-HybridRetriever] Starting init", flush=True)
    self.embed_model = embed_model
    self.verbose = verbose
    self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
    print(f"[DEBUG-HybridRetriever] Rerank model name: {self.rerank_model_name}", flush=True)

    # Reranker state: exactly one of vo_client / ce_reranker ends up set.
    self.vo_client = None
    self.ce_reranker = None
    self.reranker_backend = "cross-encoder"

    voyage_api_key = os.getenv("VOYAGE_API_KEY")
    if voyage_api_key:
        try:
            import voyageai
            self.vo_client = voyageai.Client(api_key=voyage_api_key)
            self.reranker_backend = "voyageai"
            # Voyage uses model IDs like rerank-2.5; keep a safe default.
            if not self.rerank_model_name.startswith("rerank-"):
                self.rerank_model_name = "rerank-2.5"
            print("[DEBUG-HybridRetriever] Voyage AI client initialized", flush=True)
        except Exception as exc:
            # Best-effort: any Voyage failure (missing package, bad key)
            # falls through to the local cross-encoder below.
            print(f"[DEBUG-HybridRetriever] Voyage unavailable ({exc}); falling back to cross-encoder", flush=True)

    if self.vo_client is None:
        from sentence_transformers import CrossEncoder
        ce_model_name = self.rerank_model_name
        # The configured name may be a Voyage/Jina id; swap in a known
        # CrossEncoder checkpoint when it is not a cross-encoder model.
        if not ce_model_name.startswith("cross-encoder/"):
            ce_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
        self.ce_reranker = CrossEncoder(ce_model_name)
        self.rerank_model_name = ce_model_name
        self.reranker_backend = "cross-encoder"
        print(f"[DEBUG-HybridRetriever] Cross-encoder reranker initialized: {ce_model_name}", flush=True)

    sys.stdout.flush()
    print("[DEBUG-HybridRetriever] Init complete", flush=True)
 
 
60
 
61
  def _normalize_rerank_model_name(self, model_name: str) -> str:
62
  normalized = (model_name or "").strip()
 
93
  # Retrieval
94
  # ------------------------------------------------------------------
95
 
96
+ def _semantic_search(self, query, index, top_k, technique_name: Optional[str] = None) -> tuple[np.ndarray, List[str]]:
97
  query_vector = self.embed_model.encode(query)
98
  query_kwargs = {
99
  "vector": query_vector.tolist(),
100
  "top_k": top_k,
101
  "include_metadata": True,
102
  }
103
+ if technique_name:
104
+ query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
105
+
106
+ res = index.query(
107
+ **query_kwargs
108
+ )
109
  chunks = [match['metadata']['text'] for match in res['matches']]
110
  return query_vector, chunks
111
 
112
+ def _bm25_search(self, query, index, top_k=50, technique_name: Optional[str] = None) -> List[str]:
113
+ try:
114
+ import os
115
+ from pinecone import Pinecone
116
+ from pinecone_text.sparse import BM25Encoder
117
+ encoder = BM25Encoder().default()
118
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
119
+ sparse_index = pc.Index("cbt-book-sparse")
120
+ sparse_vector = encoder.encode_queries(query)
121
+ query_kwargs = {
122
+ "sparse_vector": sparse_vector,
123
+ "top_k": top_k,
124
+ "include_metadata": True,
125
+ }
126
+ if technique_name:
127
+ query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
128
+
129
+ res = sparse_index.query(**query_kwargs)
130
+ return [match["metadata"]["text"] for match in res["matches"]]
131
+ except Exception as e:
132
+ print(f"Error in BM25 search against Pinecone: {e}")
133
+ return []
134
+
135
+ """Fetch chunks from Pinecone and perform BM25 ranking locally."""
136
+ # Fetch more candidates than needed for BM25 to rank against
137
+ # Use a reasonable multiplier to get enough candidates without over-fetching
138
+ fetch_limit = min(top_k * 4,25) # e.g., 4*4=16, capped at 50
139
+ res = index.query(
140
+ vector=[0.0] * 512, # Dummy vector (BM25 doesn't use embeddings)
141
+ top_k=fetch_limit,
142
+ include_metadata=True,
143
+ filter={"chunking_technique": {"$eq": technique_name}}
144
+ )
145
+
146
+ # Extract chunks
147
+ chunks = [match['metadata']['text'] for match in res['matches']]
148
+ if not chunks:
149
+ return []
150
+
151
+ # Build BM25 index on these chunks
152
+ tokenized_corpus = [self._tokenize(chunk) for chunk in chunks]
153
+ bm25 = BM25Okapi(tokenized_corpus)
154
+
155
+ # Score query against chunks
156
  tokenized_query = self._tokenize(query)
157
+ scores = bm25.get_scores(tokenized_query)
158
+ top_indices = np.argsort(scores)[::-1][:top_k]
159
+ return [chunks[i] for i in top_indices]
 
 
 
 
 
 
 
 
160
 
161
  # ------------------------------------------------------------------
162
  # Fusion
 
174
  # Reranking
175
  # ------------------------------------------------------------------
176
 
177
+ def _cross_encoder_rerank(self, query, chunks, final_k) -> tuple[List[str], List[float]]:
178
+ if not chunks:
179
+ return [], []
180
+
181
+ if self.vo_client is not None:
182
+ reranking = self.vo_client.rerank(query, chunks, model=self.rerank_model_name, top_k=final_k)
183
+ ranked_chunks = [result.document for result in reranking.results]
184
+ ranked_scores = [result.relevance_score for result in reranking.results]
185
+ return ranked_chunks, ranked_scores
186
+
187
+ pairs = [[query, chunk] for chunk in chunks]
188
+ scores = self.ce_reranker.predict(pairs)
189
+ ranked_indices = np.argsort(scores)[::-1][:final_k]
190
+ ranked_chunks = [chunks[i] for i in ranked_indices]
191
+ ranked_scores = [float(scores[i]) for i in ranked_indices]
192
+ return ranked_chunks, ranked_scores
193
 
194
  # ------------------------------------------------------------------
195
  # MMR (applied after reranking as a diversity filter)
 
212
  # STEP 1: Encode chunks to get embeddings
213
  print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
214
  try:
215
+ chunk_embeddings = np.array([self.embed_model.encode(c) for c in chunks])
216
  print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
217
  except Exception as e:
218
  print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
 
303
  # Main search
304
  # ------------------------------------------------------------------
305
 
306
+ def search(self, query, index, top_k=50, final_k=5, mode="hybrid",
 
307
  rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
308
+ technique_name: Optional[str] = None,
309
+ chunking_technique: Optional[str] = None,
310
+ verbose: Optional[bool] = None, test: bool = False) -> tuple[List[str], float]:
311
  """
312
  :param mode: "semantic", "bm25", or "hybrid"
313
  :param rerank_strategy: "cross-encoder", "rrf", or "none"
314
  :param use_mmr: Whether to apply MMR diversity filter after reranking
315
  :param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
316
+ :param technique_name: Chunking technique to filter by (default: "markdown")
317
+ :returns: Tuple of (ranked_chunks, avg_chunk_score)
318
  """
319
  should_print = verbose if verbose is not None else self.verbose
320
+ requested_technique = self._normalize_chunking_technique(chunking_technique or technique_name)
321
  total_start = time.perf_counter()
322
  semantic_time = 0.0
323
  bm25_time = 0.0
324
  rerank_time = 0.0
325
  mmr_time = 0.0
326
 
327
+ if use_mmr:
328
+ final_k = 10
329
+
330
  if should_print:
331
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
332
  if requested_technique:
 
346
 
347
  if mode in ["bm25", "hybrid"]:
348
  bm25_start = time.perf_counter()
349
+ bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
350
  bm25_time = time.perf_counter() - bm25_start
351
  if should_print:
352
  self._print_candidates("BM25 Search", bm25_chunks)
353
  print(f"BM25 time: {bm25_time:.3f}s")
354
+ print("All BM25 results:")
355
+ for i, chunk in enumerate(bm25_chunks):
356
+ print(f" [{i}] {chunk[:200]}..." if len(chunk) > 200 else f" [{i}] {chunk}")
357
 
358
  # 2. Fuse / rerank
359
  rerank_start = time.perf_counter()
360
+ chunk_scores = []
361
  if rerank_strategy == "rrf":
362
  candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
363
  label = "RRF"
364
  elif rerank_strategy == "cross-encoder":
365
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
366
+ candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
367
  label = "Cross-Encoder"
368
  else: # "none"
369
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
370
  label = "No Reranking"
371
  rerank_time = time.perf_counter() - rerank_start
372
+
373
+ # Compute average chunk score
374
+ avg_chunk_score = float(np.mean(chunk_scores)) if chunk_scores else 0.0
375
 
376
  # 3. MMR diversity filter (applied after reranking)
377
  if use_mmr and candidates:
 
383
  label += " + MMR"
384
  mmr_time = time.perf_counter() - mmr_start
385
 
386
+ if test and rerank_strategy != "cross-encoder" and candidates:
387
+ _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
388
+ avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0
389
+
390
  total_time = time.perf_counter() - total_start
391
 
392
  if should_print:
393
  self._print_final_results(candidates, label)
394
  self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
395
 
396
+ return candidates, avg_chunk_score
397
 
398
  # ------------------------------------------------------------------
399
  # Printing
test.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ os.environ["MKL_NUM_THREADS"] = "1"
4
+ import sys
5
+ import traceback
6
+ from datetime import datetime
7
+ from dotenv import load_dotenv
8
+
9
+ from config_loader import cfg
10
+ from data.vector_db import get_index_by_name
11
+ from retriever.retriever import HybridRetriever
12
+ from retriever.processor import ChunkProcessor
13
+ from data.ingest import CHUNKING_TECHNIQUES
14
+
15
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """Write a Markdown report of retrieved chunks per query/technique/strategy.

    :param all_results: Mapping of query index -> {result_key: {'chunks':
        [...], 'score': float}} as produced by the evaluation loop in ``main``.
    :param queries: Query strings, indexed by the keys of ``all_results``.
    :param output_file: Destination path for the Markdown report.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Accumulate fragments in a list and join once at the end: repeated
    # `content += ...` is quadratic when many large chunks are embedded.
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]
    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")

    parts.append("\n## Retrieval Results by Query\n\n")

    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")

        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")

            chunks = chunks_data.get('chunks', [])
            score = chunks_data.get('score', 0)

            parts.append(f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n")

            if not chunks:
                parts.append("*No chunks retrieved.*\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"**[Chunk {i}]** ({len(chunk)} chars):\n")
                    parts.append(f"```text\n{chunk}\n```\n\n")

            parts.append("---\n\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\nRetrieval report saved to: {output_file}")
53
+
54
+
55
+ def main():
56
+ # Load environment variables
57
+ load_dotenv()
58
+
59
+ pinecone_key = os.getenv("PINECONE_API_KEY")
60
+ if not pinecone_key:
61
+ raise RuntimeError("PINECONE_API_KEY not found in environment variables")
62
+
63
+ test_queries = [
64
+ "What is cognitive behavior therapy and how does it work?",
65
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
66
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
67
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
68
+ ]
69
+
70
+ # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
71
+ # Use all 7 chunking techniques from ingest.py
72
+ CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
73
+ print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
74
+ for tech in CHUNKING_TECHNIQUES_FILTERED:
75
+ print(f" - {tech['name']}: {tech['description']}")
76
+
77
+ RETRIEVAL_STRATEGIES = [
78
+ {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
79
+ {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr"},
80
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
81
+ {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr"},
82
+ {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr"},
83
+ ]
84
+
85
+ print("Initializing ChunkProcessor to load Embedding Model...")
86
+ proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
87
+
88
+
89
+ print("Initializing HybridRetriever...")
90
+ retriever = HybridRetriever(
91
+ embed_model=proc.encoder,
92
+ rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
93
+ verbose=False
94
+ )
95
+
96
+ all_query_results = {}
97
+
98
+ for query_idx, query in enumerate(test_queries):
99
+ print(f"\n{'='*80}")
100
+ print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
101
+ print(f"{'='*80}")
102
+
103
+ query_results = {}
104
+
105
+ # Connect to the single index where all techniques are stored with metadata differentiation
106
+ index_name = "cbt-book-recursive"
107
+ try:
108
+ index = get_index_by_name(pinecone_key, index_name)
109
+ stats = index.describe_index_stats()
110
+ if stats.get('total_vector_count', 0) == 0:
111
+ print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
112
+ except Exception as e:
113
+ print(f" [X] Failed to connect to index {index_name}: {e}")
114
+ continue
115
+
116
+ for technique in CHUNKING_TECHNIQUES_FILTERED:
117
+ technique_name = technique['name']
118
+
119
+ for strategy in RETRIEVAL_STRATEGIES:
120
+ result_key = f"{technique_name} + {strategy['label']}"
121
+ print(f"\nEvaluating: {result_key}")
122
+
123
+ try:
124
+ context_chunks, chunk_score = retriever.search(
125
+ query=query,
126
+ index=index,
127
+ mode=strategy['mode'],
128
+ rerank_strategy="cross-encoder",
129
+ use_mmr=strategy['use_mmr'],
130
+ top_k=25,
131
+ final_k=4,
132
+ technique_name=technique_name,
133
+ verbose=False,
134
+ test=True
135
+ )
136
+
137
+ query_results[result_key] = {
138
+ 'chunks': context_chunks,
139
+ 'score': chunk_score
140
+ }
141
+ print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
142
+
143
+ except Exception as e:
144
+ print(f" -> Error retrieving for {result_key}: {e}")
145
+
146
+ all_query_results[query_idx] = query_results
147
+
148
+ # Generate isolated retrieval test report
149
+ generate_retrieval_report(all_query_results, test_queries)
150
+
151
+
152
+ if __name__ == '__main__':
153
+ main()