Qar-Raz commited on
Commit
c27a4e3
·
verified ·
1 Parent(s): f5ff6c4

Sync backend Docker context from GitHub main

Browse files
Dockerfile CHANGED
@@ -17,7 +17,8 @@ RUN pip install --upgrade pip && pip install -r requirements.txt
17
 
18
  COPY . .
19
 
20
- # Fail fast during build if critical runtime modules are missing from context.
 
21
  RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
22
 
23
  # Hugging Face Spaces exposes apps on port 7860 by default.
 
17
 
18
  COPY . .
19
 
20
+ # Fail fast during build if critical runtime code is missing from context.
21
+ # Changed because we no longer have a monorepo.
22
  RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
23
 
24
  # Hugging Face Spaces exposes apps on port 7860 by default.
backend/routes/predict.py CHANGED
@@ -39,7 +39,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
39
  model_resolve_time = time.perf_counter() - model_resolve_start
40
 
41
  retrieval_start = time.perf_counter()
42
- contexts = retriever.search(
43
  query,
44
  index,
45
  chunking_technique=payload.chunking_technique,
@@ -72,6 +72,7 @@ def predict(payload: PredictRequest) -> PredictResponse:
72
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
73
  f"chunking={payload.chunking_technique} | "
74
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
 
75
  f"precheck={precheck_time:.3f}s | "
76
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
77
  f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
 
39
  model_resolve_time = time.perf_counter() - model_resolve_start
40
 
41
  retrieval_start = time.perf_counter()
42
+ contexts, chunk_score = retriever.search(
43
  query,
44
  index,
45
  chunking_technique=payload.chunking_technique,
 
72
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
73
  f"chunking={payload.chunking_technique} | "
74
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
75
+ f"chunk_score={chunk_score:.4f} | "
76
  f"precheck={precheck_time:.3f}s | "
77
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
78
  f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
backend/routes/predict_stream.py CHANGED
@@ -47,7 +47,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
47
  model_resolve_time = time.perf_counter() - model_resolve_start
48
 
49
  retrieval_start = time.perf_counter()
50
- contexts = retriever.search(
51
  query,
52
  index,
53
  chunking_technique=payload.chunking_technique,
@@ -80,6 +80,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
80
  "requested_top_k": payload.top_k,
81
  "requested_final_k": payload.final_k,
82
  "returned_context_count": len(contexts),
 
83
  "use_mmr": payload.use_mmr,
84
  "lambda_param": payload.lambda_param,
85
  },
@@ -114,6 +115,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
114
  "requested_top_k": payload.top_k,
115
  "requested_final_k": payload.final_k,
116
  "returned_context_count": len(contexts),
 
117
  "use_mmr": payload.use_mmr,
118
  "lambda_param": payload.lambda_param,
119
  },
@@ -127,6 +129,7 @@ def predict_stream(payload: PredictRequest) -> StreamingResponse:
127
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
128
  f"chunking={payload.chunking_technique} | "
129
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
 
130
  f"precheck={precheck_time:.3f}s | "
131
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
132
  f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
 
47
  model_resolve_time = time.perf_counter() - model_resolve_start
48
 
49
  retrieval_start = time.perf_counter()
50
+ contexts, chunk_score = retriever.search(
51
  query,
52
  index,
53
  chunking_technique=payload.chunking_technique,
 
80
  "requested_top_k": payload.top_k,
81
  "requested_final_k": payload.final_k,
82
  "returned_context_count": len(contexts),
83
+ "chunk_score": chunk_score,
84
  "use_mmr": payload.use_mmr,
85
  "lambda_param": payload.lambda_param,
86
  },
 
115
  "requested_top_k": payload.top_k,
116
  "requested_final_k": payload.final_k,
117
  "returned_context_count": len(contexts),
118
+ "chunk_score": chunk_score,
119
  "use_mmr": payload.use_mmr,
120
  "lambda_param": payload.lambda_param,
121
  },
 
129
  f"lambda={payload.lambda_param:.2f} | temp={payload.temperature:.2f} | "
130
  f"chunking={payload.chunking_technique} | "
131
  f"top_k={payload.top_k} | final_k={payload.final_k} | returned={len(contexts)} | "
132
+ f"chunk_score={chunk_score:.4f} | "
133
  f"precheck={precheck_time:.3f}s | "
134
  f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
135
  f"retrieval={retrieval_time:.3f}s | first_token={first_token_latency if first_token_latency is not None else -1:.3f}s | "
backend/services/startup.py CHANGED
@@ -67,7 +67,6 @@ def initialize_runtime_state(state: dict[str, Any]) -> None:
67
 
68
  retriever_start = time.perf_counter()
69
  retriever = HybridRetriever(
70
- final_chunks,
71
  proc.encoder,
72
  rerank_model_name=rerank_model_name,
73
  verbose=False,
 
67
 
68
  retriever_start = time.perf_counter()
69
  retriever = HybridRetriever(
 
70
  proc.encoder,
71
  rerank_model_name=rerank_model_name,
72
  verbose=False,
config.yaml CHANGED
@@ -27,20 +27,18 @@ retrieval:
27
  mode: "hybrid"
28
  # Options: cross-encoder, rrf
29
  rerank_strategy: "cross-encoder"
30
- use_mmr: true
31
- top_k: 10
32
  final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
  max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
- judge_model: "stepfun/step-3.5-flash:free"
39
 
40
  # List of contestants in the tournament
41
  models:
42
  - "Llama-3-8B"
43
  - "Mistral-7B"
44
- - "Qwen-2.5"
45
- - "DeepSeek-V3"
46
  - "TinyAya"
 
27
  mode: "hybrid"
28
  # Options: cross-encoder, rrf
29
  rerank_strategy: "cross-encoder"
30
+ use_mmr: False
31
+ top_k: 50
32
  final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
  max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
+ judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
42
  - "Llama-3-8B"
43
  - "Mistral-7B"
 
 
44
  - "TinyAya"
data/__init__.py DELETED
File without changes
data/vector_db.py DELETED
@@ -1,245 +0,0 @@
1
- import time
2
- import re
3
- import json
4
- from pathlib import Path
5
- from typing import Any, Dict, List
6
- from pinecone import Pinecone, ServerlessSpec
7
-
8
-
9
- # Added caching to reduce consecutive startup time
10
- # --@Qamar
11
-
12
def slugify_technique(name):
    """Lower-case *name* and collapse non-alphanumeric runs into hyphens.

    Produces a string safe for Pinecone index naming,
    e.g. 'Sentence Splitter' -> 'sentence-splitter'.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
15
-
16
def get_index_by_name(api_key: str, index_name: str):
    """
    Connect directly to a Pinecone index by its full string name.

    Useful on the API/production side where the name is already known.
    Raises ValueError when the index does not exist, so callers get a
    clear error instead of a 404 crash from Pinecone.
    """
    client = Pinecone(api_key=api_key)

    # Verify the name is known before connecting.
    known_names = {idx.name for idx in client.list_indexes()}
    if index_name not in known_names:
        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")

    print(f" Connecting to Index: {index_name}")
    return client.Index(index_name)
30
-
31
def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
    """
    Create (if needed) and return a technique-specific Pinecone index.

    The index name is '<base_name>-<slugified technique>',
    e.g. 'arxiv-index-token'.
    """
    client = Pinecone(api_key=api_key)
    full_index_name = f"{base_name}-{slugify_technique(technique)}"

    already_there = {idx.name for idx in client.list_indexes()}
    if full_index_name not in already_there:
        print(f" Creating specialized index: {full_index_name}...")
        client.create_index(
            name=full_index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        # Block until the freshly created index reports ready.
        while not client.describe_index(full_index_name).status['ready']:
            time.sleep(1)

    # Reuse the lookup helper so the returned object is built one way.
    return get_index_by_name(api_key, full_index_name)
56
-
57
def refresh_pinecone_index(index, final_chunks, batch_size=100):
    """
    Ensure the technique-specific index contains the provided chunks.

    Upserts when the index is empty or holds fewer vectors than expected;
    otherwise leaves it untouched. Returns True when an upsert happened,
    False otherwise (including on error — this is best-effort).
    """
    if not final_chunks:
        print("No chunks provided to refresh.")
        return False

    try:
        # Compare what's already stored against what we were given.
        current_count = index.describe_index_stats().get('total_vector_count', 0)
        expected_count = len(final_chunks)
        print(f" Index Stats -> Existing: {current_count} | New Chunks: {expected_count}")

        if current_count == 0:
            print(f"➕ Index is empty. Upserting {expected_count} vectors...")
        elif current_count < expected_count:
            # Simple check to see if we need to top up or refresh.
            print(f" Vector count mismatch ({current_count} < {expected_count}). Updating index...")
        else:
            print(f" Index is already populated with {current_count} vectors. Ready for search.")
            return False

        upsert_to_pinecone(index, prepare_vectors_for_upsert(final_chunks), batch_size)
        return True

    except Exception as e:
        print(f" Error refreshing index: {e}")
        return False
94
-
95
- # Utility functions remain the same as previous version
96
def prepare_vectors_for_upsert(final_chunks):
    """
    Convert processed chunks into Pinecone upsert payloads.

    Each chunk must provide 'id' and 'values'; its 'metadata' dict is
    copied and back-filled with defaults so every vector carries the
    fields the retrieval side expects. Non-dict metadata (e.g. None) is
    treated as empty.

    Fix: the original chained six setdefault calls that each re-fetched
    the same key from the source dict with the same default — a no-op
    re-lookup, since dict(meta) had already copied every existing key.
    setdefault(key, default) alone is exactly equivalent.
    """
    # Defaults applied for any metadata field a chunk is missing.
    metadata_defaults = (
        ('text', ""),
        ('title', ""),
        ('url', ""),
        ('chunk_index', 0),
        ('technique', "unknown"),
        ('chunking_technique', "unknown"),
    )

    vectors = []
    for chunk in final_chunks:
        meta = chunk.get('metadata', {})
        metadata_payload = dict(meta) if isinstance(meta, dict) else {}
        for key, default in metadata_defaults:
            # setdefault keeps any value already present (even falsy ones),
            # matching the original behavior exactly.
            metadata_payload.setdefault(key, default)

        vectors.append({
            'id': chunk['id'],
            'values': chunk['values'],
            'metadata': metadata_payload,
        })
    return vectors
114
-
115
def upsert_to_pinecone(index, chunks, batch_size=100):
    """Upsert *chunks* into *index* in batches of at most *batch_size*."""
    total = len(chunks)
    start = 0
    while start < total:
        index.upsert(vectors=chunks[start:start + batch_size])
        start += batch_size
119
-
120
- # Some methods for loading chunks back from Pinecone with local caching to speed up BM25 initialization
121
-
122
- def _sanitize_index_name(index_name: str) -> str:
123
- return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
124
-
125
-
126
- def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
127
- cache_root = Path(cache_dir)
128
- cache_root.mkdir(parents=True, exist_ok=True)
129
- safe_name = _sanitize_index_name(index_name)
130
- return cache_root / f"bm25_chunks_{safe_name}.json"
131
-
132
-
133
- def _read_chunk_cache(path: Path) -> Dict[str, Any]:
134
- with path.open("r", encoding="utf-8") as f:
135
- return json.load(f)
136
-
137
-
138
- def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
139
- with path.open("w", encoding="utf-8") as f:
140
- json.dump(payload, f)
141
-
142
-
143
def load_chunks_with_local_cache(
    index,
    index_name: str,
    cache_dir: str = ".cache",
    batch_size: int = 100,
    force_refresh: bool = False,
) -> tuple[List[Dict[str, Any]], str]:
    """
    Load the BM25 chunk corpus, preferring a local JSON cache.

    The cache is valid when its recorded vector count matches the live
    Pinecone count and it holds at least one chunk; otherwise the corpus
    is re-pulled from Pinecone and the cache rewritten. Returns the
    chunks plus the source they came from: "cache" or "pinecone".
    """
    cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
    live_count = index.describe_index_stats().get("total_vector_count", 0)

    if cache_file.exists() and not force_refresh:
        try:
            payload = _read_chunk_cache(cache_file)
            cached_count = payload.get("meta", {}).get("vector_count", -1)
            cached_chunks = payload.get("chunks", [])

            if cached_chunks and cached_count == live_count:
                print(
                    f" Loaded BM25 chunk cache: {cache_file} "
                    f"(chunks={len(cached_chunks)}, vectors={cached_count})"
                )
                return cached_chunks, "cache"

            print(
                " BM25 cache stale or empty. "
                f"cache_vectors={cached_count}, pinecone_vectors={live_count}. Refreshing..."
            )
        except Exception as e:
            print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")

    # Cache miss, stale cache, or forced refresh: pull everything from Pinecone.
    chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
    fresh_payload = {
        "meta": {
            "index_name": index_name,
            "vector_count": live_count,
            "updated_at_epoch_s": int(time.time()),
        },
        "chunks": chunks,
    }

    # Best effort: a failed cache write must not break retrieval.
    try:
        _write_chunk_cache(cache_file, fresh_payload)
        print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
    except Exception as e:
        print(f" Failed to write BM25 cache ({cache_file}): {e}")

    return chunks, "pinecone"
193
-
194
-
195
def load_chunks_from_pinecone(index, batch_size: int = 100) -> List[Dict[str, Any]]:
    """
    Scan the entire Pinecone index and collect text metadata for BM25.

    Iterates every namespace, fetches vectors in batches of *batch_size*,
    de-duplicates by vector id, and keeps only vectors whose metadata has
    a non-empty 'text'. Returns [{'id': ..., 'metadata': {...}}, ...].

    Fix: the return annotation used the builtin function ``any`` instead
    of ``typing.Any`` (``list[dict[str, any]]``) — a wrong type hint,
    now consistent with the module's other ``List[Dict[str, Any]]`` usage.
    """
    stats = index.describe_index_stats()
    namespaces = list(stats.get('namespaces', {}).keys())
    # If no namespaces are explicitly named, Pinecone uses an empty string
    # for the default namespace.
    if not namespaces:
        namespaces = [""]

    all_chunks: List[Dict[str, Any]] = []
    seen_ids = set()

    print(f"Loading vectors for BM25 from namespaces: {namespaces}")

    for ns in namespaces:
        # Pinecone's list() generator yields batches of vector ids.
        for id_batch in index.list(namespace=ns, limit=batch_size):
            if not id_batch:
                continue

            # Fetch the actual content (metadata) for this batch of ids.
            fetched = index.fetch(ids=id_batch, namespace=ns)
            vectors = getattr(fetched, "vectors", {})

            for vector_id, vector_data in vectors.items():
                if vector_id in seen_ids:
                    continue
                seen_ids.add(vector_id)

                # Metadata may be absent, None, or a non-dict mapping object.
                metadata = getattr(vector_data, "metadata", {}) or {}
                if not isinstance(metadata, dict):
                    metadata = dict(metadata)

                # Vectors without text are useless for the BM25 corpus.
                if not metadata.get("text"):
                    continue

                all_chunks.append({
                    "id": vector_id,
                    "metadata": metadata,
                })

        print(f" Finished namespace: '{ns if ns else 'default'}'")

    print(f"Total chunks loaded into memory: {len(all_chunks)}")
    return all_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -2,8 +2,6 @@ import os
2
  import json
3
  import time
4
  from datetime import datetime
5
- from multiprocessing import Pool, cpu_count
6
- from functools import partial
7
  from dotenv import load_dotenv
8
  from config_loader import cfg
9
 
@@ -18,63 +16,47 @@ from data.ingest import ingest_data, CHUNKING_TECHNIQUES
18
  # Import model fleet
19
  from models.llama_3_8b import Llama3_8B
20
  from models.mistral_7b import Mistral_7b
21
- from models.qwen_2_5 import Qwen2_5
22
- from models.deepseek_v3 import DeepSeek_V3
23
  from models.tiny_aya import TinyAya
24
 
25
  MODEL_MAP = {
26
  "Llama-3-8B": Llama3_8B,
27
  "Mistral-7B": Mistral_7b,
28
- "Qwen-2.5": Qwen2_5,
29
- "DeepSeek-V3": DeepSeek_V3,
30
  "TinyAya": TinyAya
31
  }
32
 
33
  load_dotenv()
34
 
35
 
36
- def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine):
37
- """Run RAG pipeline for a specific chunking technique."""
 
 
 
 
38
 
39
  print(f"\n{'='*80}")
40
- print(f"TECHNIQUE: {technique_name.upper()}")
41
  print(f"{'='*80}")
42
 
43
- # Filter chunks by technique metadata
44
- query_vector = encoder.encode(query).tolist()
45
-
46
- # Query with metadata filter for this technique - get more candidates for reranking
47
- res = index.query(
48
- vector=query_vector,
49
- top_k=25,
50
- include_metadata=True,
51
- filter={"technique": {"$eq": technique_name}}
 
 
52
  )
53
 
54
- # Extract context chunks with URLs
55
- all_candidates = []
56
- chunk_urls = []
57
- for match in res['matches']:
58
- all_candidates.append(match['metadata']['text'])
59
- chunk_urls.append(match['metadata'].get('url', ''))
60
 
61
- print(f"\nRetrieved {len(all_candidates)} candidate chunks for technique '{technique_name}'")
62
-
63
- if not all_candidates:
64
  print(f"WARNING: No chunks found for technique '{technique_name}'")
65
  return {}
66
 
67
- # Apply cross-encoder reranking to get top 5
68
- # Use global reranker loaded once per worker
69
- global _worker_reranker
70
- pairs = [[query, chunk] for chunk in all_candidates]
71
- scores = _worker_reranker.predict(pairs)
72
- ranked = sorted(zip(all_candidates, chunk_urls, scores), key=lambda x: x[2], reverse=True)
73
- context_chunks = [chunk for chunk, _, _ in ranked[:5]]
74
- context_urls = [url for _, url, _ in ranked[:5]]
75
-
76
- print(f"After reranking: {len(context_chunks)} chunks (top 5)")
77
-
78
  # Print the final RAG context being passed to models (only once)
79
  print(f"\n{'='*80}")
80
  print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
@@ -88,6 +70,8 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
88
 
89
  # Run model tournament for this technique
90
  tournament_results = {}
 
 
91
 
92
  for name, model_inst in models.items():
93
  print(f"\n{'-'*60}")
@@ -97,7 +81,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
97
  # Generation
98
  answer = rag_engine.get_answer(
99
  model_inst, query, context_chunks,
100
- context_urls=context_urls,
101
  temperature=cfg.gen['temperature']
102
  )
103
 
@@ -118,7 +101,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
118
  "Relevancy": rel['score'],
119
  "Claims": faith['details'],
120
  "context_chunks": context_chunks,
121
- "context_urls": context_urls
122
  }
123
 
124
  print(f"\n📊 EVALUATION SCORES:")
@@ -135,7 +117,6 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
135
  "Claims": [],
136
  "error": str(e),
137
  "context_chunks": context_chunks,
138
- "context_urls": context_urls
139
  }
140
 
141
  return tournament_results
@@ -185,13 +166,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
185
 
186
  # Aggregate results across all queries
187
  aggregated_results = {}
 
188
 
189
  for query_idx, query_results in all_query_results.items():
190
  for technique_name, model_results in query_results.items():
191
  if technique_name not in aggregated_results:
192
  aggregated_results[technique_name] = {}
193
 
 
 
 
 
194
  for model_name, results in model_results.items():
 
 
 
195
  if model_name not in aggregated_results[technique_name]:
196
  aggregated_results[technique_name][model_name] = {
197
  'Faithfulness': [],
@@ -213,6 +202,15 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
213
  content += "No results available for this technique.\n\n"
214
  continue
215
 
 
 
 
 
 
 
 
 
 
216
  # Create results table with averaged scores
217
  content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
218
  content += "|-------|------------------|---------------|--------------|\n"
@@ -348,8 +346,8 @@ Based on the ablation study results:
348
  - *Embedding Model:* Jina embeddings (512 dimensions)
349
  - *Vector Database:* Pinecone (serverless, AWS us-east-1)
350
  - *Judge Model:* Openrouter Free models
351
- - *Retrieval:* Top 5 chunks per technique
352
- - *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing)
353
 
354
  ---
355
 
@@ -364,81 +362,100 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
364
  return output_file
365
 
366
 
367
- # Global variables for worker processes
368
- _worker_proc = None
369
- _worker_evaluator = None
370
- _worker_models = None
371
- _worker_rag_engine = None
372
- _worker_reranker = None
373
-
374
- def init_worker(model_name, evaluator_config):
375
- """Initialize models once per worker process."""
376
- global _worker_proc, _worker_evaluator, _worker_models, _worker_rag_engine, _worker_reranker
377
-
378
- from retriever.processor import ChunkProcessor
379
- from retriever.evaluator import RAGEvaluator
380
- from retriever.generator import RAGGenerator
381
- from sentence_transformers import CrossEncoder
382
- from models.llama_3_8b import Llama3_8B
383
- from models.mistral_7b import Mistral_7b
384
- from models.qwen_2_5 import Qwen2_5
385
- from models.deepseek_v3 import DeepSeek_V3
386
- from models.tiny_aya import TinyAya
387
-
388
- MODEL_MAP = {
389
- "Llama-3-8B": Llama3_8B,
390
- "Mistral-7B": Mistral_7b,
391
- "Qwen-2.5": Qwen2_5,
392
- "DeepSeek-V3": DeepSeek_V3,
393
- "TinyAya": TinyAya
394
- }
395
-
396
- # Load embedding model once
397
- _worker_proc = ChunkProcessor(model_name=model_name, verbose=False)
398
-
399
- # Initialize evaluator
400
- _worker_evaluator = RAGEvaluator(
401
- judge_model=evaluator_config['judge_model'],
402
- embedding_model=_worker_proc.encoder,
403
- api_key=evaluator_config['api_key']
404
  )
405
 
406
- # Initialize models
407
- hf_token = os.getenv("HF_TOKEN")
408
- _worker_models = {name: MODEL_MAP[name](token=hf_token) for name in evaluator_config['model_list']}
 
 
409
 
410
- # Initialize RAG engine
411
- _worker_rag_engine = RAGGenerator()
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
- # Load reranker once per worker
414
- _worker_reranker = CrossEncoder('jinaai/jina-reranker-v1-tiny-en')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
 
 
 
 
416
 
417
- def run_rag_for_technique_wrapper(args):
418
- """Wrapper function for parallel execution."""
419
- global _worker_proc, _worker_evaluator, _worker_models, _worker_rag_engine
 
 
 
 
 
 
 
420
 
421
- technique, query, index_name, pinecone_key = args
422
- try:
423
- # Create new connection in worker process
424
- from data.vector_db import get_index_by_name
425
- index = get_index_by_name(pinecone_key, index_name)
426
-
427
- return technique['name'], run_rag_for_technique(
428
- technique_name=technique['name'],
429
- query=query,
430
- index=index,
431
- encoder=_worker_proc.encoder,
432
- models=_worker_models,
433
- evaluator=_worker_evaluator,
434
- rag_engine=_worker_rag_engine
435
- )
436
- except Exception as e:
437
- import traceback
438
- print(f"\n✗ Error processing technique {technique['name']}: {e}")
439
- print(f"Full traceback:")
440
- traceback.print_exc()
441
- return technique['name'], {}
442
 
443
 
444
  def main():
@@ -458,8 +475,9 @@ def main():
458
  # Test queries
459
  test_queries = [
460
  "What is cognitive behavior therapy and how does it work?",
461
- "What are the common cognitive distortions in CBT?",
462
- "How does CBT help with anxiety and depression?"
 
463
  ]
464
 
465
  print("=" * 80)
@@ -478,122 +496,186 @@ def main():
478
  from data.vector_db import get_index_by_name
479
  index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
480
 
481
- print(f"\nChecking for existing index: {index_name}")
482
 
483
  try:
484
  # Try to connect to existing index
485
- print("Connecting to Pinecone...")
486
  existing_index = get_index_by_name(pinecone_key, index_name)
487
- print("Getting index stats...")
488
  stats = existing_index.describe_index_stats()
489
  existing_count = stats.get('total_vector_count', 0)
490
 
491
  if existing_count > 0:
492
- print(f"\n✓ Found existing index with {existing_count} vectors")
493
- print("Skipping ingestion - using existing data")
494
 
495
  # Initialize processor (this loads the embedding model)
496
- print("Loading embedding model for retrieval...")
 
 
 
 
497
  from retriever.processor import ChunkProcessor
 
 
 
 
 
498
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
 
 
 
499
  index = existing_index
500
  all_chunks = [] # Empty since we're using existing data
501
  final_chunks = []
502
- print("✓ Processor initialized")
503
  else:
504
- print("\nIndex exists but is empty. Running full ingestion...")
505
  all_chunks, final_chunks, proc, index = ingest_data()
506
  except Exception as e:
507
- print(f"\nIndex check failed: {e}")
508
- print("Running full ingestion...")
 
 
509
  all_chunks, final_chunks, proc, index = ingest_data()
510
 
511
  print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
512
 
513
- # Step 2: Initialize components
514
  print("\n" + "=" * 80)
515
- print("STEP 2: INITIALIZING COMPONENTS")
516
  print("=" * 80)
 
 
517
 
518
- # Initialize models
519
- print("\nInitializing models...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  rag_engine = RAGGenerator()
 
 
 
 
 
521
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
522
-
523
- # Initialize evaluator
524
- print("Initializing evaluator...")
525
- if not openrouter_key:
526
- raise RuntimeError("OPENROUTER_API_KEY not found in environment variables")
527
 
 
 
528
  evaluator = RAGEvaluator(
529
  judge_model=cfg.gen['judge_model'],
530
  embedding_model=proc.encoder,
531
  api_key=openrouter_key
532
  )
533
-
534
- # Step 3: Run RAG for all techniques in parallel for all queries
535
- print("\n" + "=" * 80)
536
- print("STEP 3: RUNNING RAG FOR ALL 6 TECHNIQUES (IN PARALLEL)")
537
- print("=" * 80)
538
-
539
- # Prepare arguments for parallel execution
540
- num_processes = min(cpu_count(), len(CHUNKING_TECHNIQUES))
541
- print(f"\nUsing {num_processes} parallel processes for {len(CHUNKING_TECHNIQUES)} techniques")
542
-
543
- # Run techniques in parallel for all queries
544
- evaluator_config = {
545
- 'judge_model': cfg.gen['judge_model'],
546
- 'api_key': openrouter_key,
547
- 'model_list': cfg.model_list
548
- }
549
 
550
  all_query_results = {}
551
 
552
  for query_idx, query in enumerate(test_queries):
553
  print(f"\n{'='*80}")
554
- print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
555
- print(f"Query: {query}")
556
  print(f"{'='*80}")
 
 
557
 
558
- with Pool(
559
- processes=num_processes,
560
- initializer=init_worker,
561
- initargs=(cfg.processing['embedding_model'], evaluator_config)
562
- ) as pool:
563
- args_list = [
564
- (technique, query, index_name, pinecone_key)
565
- for technique in CHUNKING_TECHNIQUES
566
- ]
567
- results_list = pool.map(run_rag_for_technique_wrapper, args_list)
568
-
569
- # Convert results to dictionary and store
570
- query_results = {name: results for name, results in results_list}
571
- all_query_results[query_idx] = query_results
572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  # Print quick summary for this query
574
  print(f"\n{'='*80}")
575
  print(f"QUERY {query_idx + 1} SUMMARY")
576
  print(f"{'='*80}")
577
- print(f"\n{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
578
- print("-" * 60)
579
 
580
- for technique_name, model_results in query_results.items():
581
  if model_results:
582
- avg_faith = sum(r.get('Faithfulness', 0) for r in model_results.values()) / len(model_results)
583
- avg_rel = sum(r.get('Relevancy', 0) for r in model_results.values()) / len(model_results)
 
 
 
 
584
 
585
  # Find best model
586
  best_model = max(
587
- model_results.items(),
588
  key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
589
  )
590
  best_name = best_model[0]
591
 
592
- print(f"{technique_name:<15} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_name:<20}")
593
  else:
594
- print(f"{technique_name:<15} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
595
 
596
- print("-" * 60)
597
 
598
  # Step 4: Generate findings document from all queries
599
  print("\n" + "=" * 80)
@@ -608,45 +690,63 @@ def main():
608
  print("=" * 80)
609
 
610
  print(f"\nQueries processed: {len(test_queries)}")
611
- print(f"Techniques evaluated: {len(CHUNKING_TECHNIQUES)}")
612
  print(f"Models tested: {len(cfg.model_list)}")
613
  print(f"\nFindings document: {findings_file}")
614
 
615
  # Print final summary across all queries
616
- print("\n" + "-" * 60)
617
- print(f"{'Technique':<15} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
618
- print("-" * 60)
 
 
 
 
 
619
 
620
- # Calculate averages across all queries
621
- for tech_config in CHUNKING_TECHNIQUES:
622
  tech_name = tech_config['name']
623
- all_faith = []
624
- all_rel = []
625
- best_model_name = None
626
- best_combined = 0
627
-
628
- for query_idx, query_results in all_query_results.items():
629
- if tech_name in query_results and query_results[tech_name]:
630
- model_results = query_results[tech_name]
631
- for model_name, results in model_results.items():
632
- faith = results.get('Faithfulness', 0)
633
- rel = results.get('Relevancy', 0)
634
- combined = faith + rel
635
- all_faith.append(faith)
636
- all_rel.append(rel)
637
 
638
- if combined > best_combined:
639
- best_combined = combined
640
- best_model_name = model_name
641
-
642
- if all_faith:
643
- avg_faith = sum(all_faith) / len(all_faith)
644
- avg_rel = sum(all_rel) / len(all_rel)
645
- print(f"{tech_name:<15} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_model_name or 'N/A':<20}")
646
- else:
647
- print(f"{tech_name:<15} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
 
649
- print("-" * 60)
650
 
651
  print("\n✓ Ablation study complete!")
652
  print(f"✓ Results saved to: {findings_file}")
 
2
  import json
3
  import time
4
  from datetime import datetime
 
 
5
  from dotenv import load_dotenv
6
  from config_loader import cfg
7
 
 
16
  # Import model fleet
17
  from models.llama_3_8b import Llama3_8B
18
  from models.mistral_7b import Mistral_7b
 
 
19
  from models.tiny_aya import TinyAya
20
 
21
  MODEL_MAP = {
22
  "Llama-3-8B": Llama3_8B,
23
  "Mistral-7B": Mistral_7b,
 
 
24
  "TinyAya": TinyAya
25
  }
26
 
27
  load_dotenv()
28
 
29
 
30
+ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
31
+ """Run RAG pipeline for a specific chunking technique and retrieval strategy."""
32
+
33
+ mode = retrieval_strategy['mode']
34
+ use_mmr = retrieval_strategy['use_mmr']
35
+ strategy_label = retrieval_strategy['label']
36
 
37
  print(f"\n{'='*80}")
38
+ print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
39
  print(f"{'='*80}")
40
 
41
+ # Use HybridRetriever to retrieve chunks
42
+ context_chunks, chunk_score = retriever.search(
43
+ query=query,
44
+ index=index,
45
+ mode=mode,
46
+ rerank_strategy="cross-encoder",
47
+ use_mmr=use_mmr,
48
+ top_k=50,
49
+ final_k=5,
50
+ technique_name=technique_name,
51
+ verbose=False
52
  )
53
 
54
+ print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
 
 
 
 
 
55
 
56
+ if not context_chunks:
 
 
57
  print(f"WARNING: No chunks found for technique '{technique_name}'")
58
  return {}
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Print the final RAG context being passed to models (only once)
61
  print(f"\n{'='*80}")
62
  print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
 
70
 
71
  # Run model tournament for this technique
72
  tournament_results = {}
73
+ tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
74
+ tournament_results["_Strategy"] = strategy_label
75
 
76
  for name, model_inst in models.items():
77
  print(f"\n{'-'*60}")
 
81
  # Generation
82
  answer = rag_engine.get_answer(
83
  model_inst, query, context_chunks,
 
84
  temperature=cfg.gen['temperature']
85
  )
86
 
 
101
  "Relevancy": rel['score'],
102
  "Claims": faith['details'],
103
  "context_chunks": context_chunks,
 
104
  }
105
 
106
  print(f"\n📊 EVALUATION SCORES:")
 
117
  "Claims": [],
118
  "error": str(e),
119
  "context_chunks": context_chunks,
 
120
  }
121
 
122
  return tournament_results
 
166
 
167
  # Aggregate results across all queries
168
  aggregated_results = {}
169
+ chunk_scores_by_query_technique = {} # Store ChunkScore per query+technique
170
 
171
  for query_idx, query_results in all_query_results.items():
172
  for technique_name, model_results in query_results.items():
173
  if technique_name not in aggregated_results:
174
  aggregated_results[technique_name] = {}
175
 
176
+ # Extract ChunkScore (stored at technique level, not per model)
177
+ chunk_score = model_results.get('_ChunkScore', 0)
178
+ chunk_scores_by_query_technique[(query_idx, technique_name)] = chunk_score
179
+
180
  for model_name, results in model_results.items():
181
+ if model_name.startswith('_'):
182
+ continue # Skip metadata keys like _ChunkScore
183
+
184
  if model_name not in aggregated_results[technique_name]:
185
  aggregated_results[technique_name][model_name] = {
186
  'Faithfulness': [],
 
202
  content += "No results available for this technique.\n\n"
203
  continue
204
 
205
+ # Show ChunkScore per query for this technique
206
+ content += "#### Chunk Retrieval Scores (ChunkScore)\n\n"
207
+ content += "| Query | Avg ChunkScore |\n"
208
+ content += "|-------|---------------|\n"
209
+ for q_idx in range(len(queries)):
210
+ score = chunk_scores_by_query_technique.get((q_idx, technique_name), 0)
211
+ content += f"| {q_idx + 1} | {score:.4f} |\n"
212
+ content += "\n"
213
+
214
  # Create results table with averaged scores
215
  content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
216
  content += "|-------|------------------|---------------|--------------|\n"
 
346
  - *Embedding Model:* Jina embeddings (512 dimensions)
347
  - *Vector Database:* Pinecone (serverless, AWS us-east-1)
348
  - *Judge Model:* Openrouter Free models
349
+ - *Retrieval:* Top 5 chunks per technique (final_k=5)
350
+ - *Evaluation Metrics:* Faithfulness (context grounding), Relevancy (query addressing), ChunkScore (reranker confidence)
351
 
352
  ---
353
 
 
362
  return output_file
363
 
364
 
365
+ def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
366
+ """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
367
+
368
+ mode = retrieval_strategy['mode']
369
+ use_mmr = retrieval_strategy['use_mmr']
370
+ strategy_label = retrieval_strategy['label']
371
+
372
+ print(f"\n{'='*80}")
373
+ print(f"TECHNIQUE: {technique_name.upper()} | STRATEGY: {strategy_label}")
374
+ print(f"{'='*80}")
375
+
376
+ # Use HybridRetriever to retrieve chunks
377
+ context_chunks, chunk_score = retriever.search(
378
+ query=query,
379
+ index=index,
380
+ mode=mode,
381
+ rerank_strategy="cross-encoder",
382
+ use_mmr=use_mmr,
383
+ top_k=50,
384
+ final_k=5,
385
+ technique_name=technique_name,
386
+ verbose=False,
387
+ test=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  )
389
 
390
+ print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
391
+
392
+ if not context_chunks:
393
+ print(f"WARNING: No chunks found for technique '{technique_name}'")
394
+ return {}
395
 
396
+ # Print the final RAG context being passed to models (only once)
397
+ print(f"\n{'='*80}")
398
+ print(f"📚 FINAL RAG CONTEXT FOR TECHNIQUE '{technique_name.upper()}'")
399
+ print(f"{'='*80}")
400
+ for i, chunk in enumerate(context_chunks, 1):
401
+ print(f"\n[Chunk {i}] ({len(chunk)} chars):")
402
+ print(f"{'─'*60}")
403
+ print(chunk)
404
+ print(f"{'─'*60}")
405
+ print(f"\n{'='*80}")
406
+
407
+ # Run model tournament for this technique
408
+ tournament_results = {}
409
+ tournament_results["_ChunkScore"] = chunk_score
410
+ tournament_results["_Strategy"] = strategy_label
411
 
412
+ for name, model_inst in models.items():
413
+ print(f"\n{'-'*60}")
414
+ print(f"Model: {name}")
415
+ print(f"{'-'*60}")
416
+ try:
417
+ # Generation
418
+ answer = rag_engine.get_answer(
419
+ model_inst, query, context_chunks,
420
+ temperature=cfg.gen['temperature']
421
+ )
422
+
423
+ print(f"\n{'─'*60}")
424
+ print(f"📝 FULL ANSWER from {name}:")
425
+ print(f"{'─'*60}")
426
+ print(answer)
427
+ print(f"{'─'*60}")
428
+
429
+ # Faithfulness Evaluation (strict=False reduces API calls from ~22 to ~3 per eval)
430
+ faith = evaluator.evaluate_faithfulness(answer, context_chunks, strict=False)
431
+ # Relevancy Evaluation
432
+ rel = evaluator.evaluate_relevancy(query, answer)
433
+
434
+ tournament_results[name] = {
435
+ "answer": answer,
436
+ "Faithfulness": faith['score'],
437
+ "Relevancy": rel['score'],
438
+ "Claims": faith['details'],
439
+ "context_chunks": context_chunks,
440
+ }
441
 
442
+ print(f"\n📊 EVALUATION SCORES:")
443
+ print(f" Faithfulness: {faith['score']:.1f}%")
444
+ print(f" Relevancy: {rel['score']:.3f}")
445
+ print(f" Combined: {faith['score'] + rel['score']:.3f}")
446
 
447
+ except Exception as e:
448
+ print(f" Error evaluating {name}: {e}")
449
+ tournament_results[name] = {
450
+ "answer": "",
451
+ "Faithfulness": 0,
452
+ "Relevancy": 0,
453
+ "Claims": [],
454
+ "error": str(e),
455
+ "context_chunks": context_chunks,
456
+ }
457
 
458
+ return tournament_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
 
461
  def main():
 
475
  # Test queries
476
  test_queries = [
477
  "What is cognitive behavior therapy and how does it work?",
478
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
479
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
480
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
481
  ]
482
 
483
  print("=" * 80)
 
496
  from data.vector_db import get_index_by_name
497
  index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
498
 
499
+ print(f"\n[DEBUG] Checking for existing index: {index_name}")
500
 
501
  try:
502
  # Try to connect to existing index
503
+ print("[DEBUG] Connecting to Pinecone...")
504
  existing_index = get_index_by_name(pinecone_key, index_name)
505
+ print("[DEBUG] Getting index stats...")
506
  stats = existing_index.describe_index_stats()
507
  existing_count = stats.get('total_vector_count', 0)
508
 
509
  if existing_count > 0:
510
+ print(f"\n[DEBUG] ✓ Found existing index with {existing_count} vectors")
511
+ print("[DEBUG] Skipping ingestion - using existing data")
512
 
513
  # Initialize processor (this loads the embedding model)
514
+ print("[DEBUG] About to load embedding model...")
515
+ print(f"[DEBUG] Model: {cfg.processing['embedding_model']}")
516
+ import sys
517
+ sys.stdout.flush()
518
+
519
  from retriever.processor import ChunkProcessor
520
+ print("[DEBUG] ChunkProcessor imported successfully")
521
+ sys.stdout.flush()
522
+
523
+ print("[DEBUG] Creating ChunkProcessor instance...")
524
+ sys.stdout.flush()
525
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
526
+ print("[DEBUG] ChunkProcessor created successfully")
527
+ sys.stdout.flush()
528
+
529
  index = existing_index
530
  all_chunks = [] # Empty since we're using existing data
531
  final_chunks = []
532
+ print("[DEBUG] ✓ Processor initialized")
533
  else:
534
+ print("\n[DEBUG] Index exists but is empty. Running full ingestion...")
535
  all_chunks, final_chunks, proc, index = ingest_data()
536
  except Exception as e:
537
+ print(f"\n[DEBUG] Index check failed: {e}")
538
+ import traceback
539
+ traceback.print_exc()
540
+ print("[DEBUG] Running full ingestion...")
541
  all_chunks, final_chunks, proc, index = ingest_data()
542
 
543
  print(f"\nTechniques to evaluate: {[tech['name'] for tech in CHUNKING_TECHNIQUES]}")
544
 
545
+ # Step 2: Components will be initialized in Step 3 (shared across all sequential runs)
546
  print("\n" + "=" * 80)
547
+ print("[DEBUG] STEP 2: PREPARING FOR SEQUENTIAL EXECUTION")
548
  print("=" * 80)
549
+ print(f"[DEBUG] Techniques to evaluate: {[t['name'] for t in CHUNKING_TECHNIQUES]}")
550
+ # print(f"[DEBUG] Filtered techniques: {TECHNIQUES_TO_EVALUATE}")
551
 
552
+ # Define retrieval strategies to test
553
+ RETRIEVAL_STRATEGIES = [
554
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
555
+ ]
556
+
557
+ # Filter to only 4 techniques to reduce memory usage
558
+ TECHNIQUES_TO_EVALUATE = ["markdown", "recursive", "paragraph"]
559
+ CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
560
+
561
+ # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
562
+ print("\n" + "=" * 80)
563
+ print(f"STEP 3: RUNNING RAG FOR {len(CHUNKING_TECHNIQUES_FILTERED)} TECHNIQUES x {len(RETRIEVAL_STRATEGIES)} STRATEGIES (SEQUENTIAL)")
564
+ print("=" * 80)
565
+ print(f"\nTechniques: {TECHNIQUES_TO_EVALUATE}")
566
+ print(f"\nRetrieval Strategies:")
567
+ for i, strat in enumerate(RETRIEVAL_STRATEGIES, 1):
568
+ mmr_status = "with MMR" if strat['use_mmr'] else "no MMR"
569
+ print(f" {i}. {strat['label']}: mode={strat['mode']}, {mmr_status}")
570
+
571
+ # Initialize components once (shared across all sequential runs)
572
+ print("\n[DEBUG] Initializing components...")
573
+ import sys
574
+ sys.stdout.flush()
575
+
576
+ print("[DEBUG] Creating RAGGenerator...")
577
+ sys.stdout.flush()
578
  rag_engine = RAGGenerator()
579
+ print("[DEBUG] RAGGenerator created")
580
+ sys.stdout.flush()
581
+
582
+ print(f"[DEBUG] Loading models: {cfg.model_list}")
583
+ sys.stdout.flush()
584
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
585
+ print("[DEBUG] Models loaded successfully")
586
+ sys.stdout.flush()
 
 
 
587
 
588
+ print("[DEBUG] Creating RAGEvaluator...")
589
+ sys.stdout.flush()
590
  evaluator = RAGEvaluator(
591
  judge_model=cfg.gen['judge_model'],
592
  embedding_model=proc.encoder,
593
  api_key=openrouter_key
594
  )
595
+ print("[DEBUG] RAGEvaluator created")
596
+ sys.stdout.flush()
597
+
598
+ print("[DEBUG] Creating HybridRetriever...")
599
+ sys.stdout.flush()
600
+ retriever = HybridRetriever(
601
+ embed_model=proc.encoder,
602
+ rerank_model_name='rerank-2.5',
603
+ verbose=False
604
+ )
605
+ print("[DEBUG] HybridRetriever created")
606
+ sys.stdout.flush()
607
+
608
+ print("[DEBUG] All components initialized successfully.\n")
 
 
609
 
610
  all_query_results = {}
611
 
612
  for query_idx, query in enumerate(test_queries):
613
  print(f"\n{'='*80}")
614
+ print(f"[DEBUG] PROCESSING QUERY {query_idx + 1}/{len(test_queries)}")
615
+ print(f"[DEBUG] Query: {query}")
616
  print(f"{'='*80}")
617
+ import sys
618
+ sys.stdout.flush()
619
 
620
+ query_results = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
+ for technique in CHUNKING_TECHNIQUES_FILTERED:
623
+ for strategy in RETRIEVAL_STRATEGIES:
624
+ result_key = f"{technique['name']}__{strategy['label']}"
625
+ print(f"\n[DEBUG] Processing: {result_key}")
626
+ sys.stdout.flush()
627
+
628
+ try:
629
+ result = run_rag_for_technique_sequential(
630
+ technique_name=technique['name'],
631
+ query=query,
632
+ index=index,
633
+ encoder=proc.encoder,
634
+ models=models,
635
+ evaluator=evaluator,
636
+ rag_engine=rag_engine,
637
+ retriever=retriever,
638
+ retrieval_strategy=strategy
639
+ )
640
+ print(f"[DEBUG] Result for {result_key}: {len(result)} keys")
641
+ query_results[result_key] = result
642
+ except Exception as e:
643
+ import traceback
644
+ print(f"\n[DEBUG] ✗ Error processing {result_key}: {e}")
645
+ traceback.print_exc()
646
+ sys.stdout.flush()
647
+ query_results[result_key] = {}
648
+
649
+ all_query_results[query_idx] = query_results
650
+
651
  # Print quick summary for this query
652
  print(f"\n{'='*80}")
653
  print(f"QUERY {query_idx + 1} SUMMARY")
654
  print(f"{'='*80}")
655
+ print(f"\n{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
656
+ print("-" * 92)
657
 
658
+ for result_key, model_results in query_results.items():
659
  if model_results:
660
+ chunk_score = model_results.get('_ChunkScore', 0)
661
+ strategy = model_results.get('_Strategy', '')
662
+ # Exclude _ChunkScore and _Strategy from model averaging
663
+ model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
664
+ avg_faith = sum(r.get('Faithfulness', 0) for r in model_only.values()) / len(model_only) if model_only else 0
665
+ avg_rel = sum(r.get('Relevancy', 0) for r in model_only.values()) / len(model_only) if model_only else 0
666
 
667
  # Find best model
668
  best_model = max(
669
+ model_only.items(),
670
  key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
671
  )
672
  best_name = best_model[0]
673
 
674
+ print(f"{result_key:<15} {strategy:<20} {chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_name:<20}")
675
  else:
676
+ print(f"{result_key:<15} {'':<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
677
 
678
+ print("-" * 92)
679
 
680
  # Step 4: Generate findings document from all queries
681
  print("\n" + "=" * 80)
 
690
  print("=" * 80)
691
 
692
  print(f"\nQueries processed: {len(test_queries)}")
693
+ print(f"Techniques evaluated: {len(CHUNKING_TECHNIQUES_FILTERED)} ({TECHNIQUES_TO_EVALUATE})")
694
  print(f"Models tested: {len(cfg.model_list)}")
695
  print(f"\nFindings document: {findings_file}")
696
 
697
  # Print final summary across all queries
698
+ print("\n" + "-" * 92)
699
+ print(f"{'Technique':<15} {'Strategy':<20} {'ChunkScore':>12} {'Avg Faith':>12} {'Avg Rel':>12} {'Best Model':<20}")
700
+ print("-" * 92)
701
+
702
+ # Define retrieval strategies (same as above)
703
+ RETRIEVAL_STRATEGIES = [
704
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
705
+ ]
706
 
707
+ # Calculate averages across all queries for each technique x strategy
708
+ for tech_config in CHUNKING_TECHNIQUES_FILTERED:
709
  tech_name = tech_config['name']
710
+ for strategy in RETRIEVAL_STRATEGIES:
711
+ strategy_label = strategy['label']
712
+ result_key = f"{tech_name}__{strategy_label}"
713
+
714
+ all_faith = []
715
+ all_rel = []
716
+ all_chunk_scores = []
717
+ best_model_name = None
718
+ best_combined = 0
719
+
720
+ for query_idx, query_results in all_query_results.items():
721
+ if result_key in query_results and query_results[result_key]:
722
+ model_results = query_results[result_key]
 
723
 
724
+ # Extract ChunkScore
725
+ chunk_score = model_results.get('_ChunkScore', 0)
726
+ all_chunk_scores.append(chunk_score)
727
+
728
+ # Exclude _ChunkScore and _Strategy from model averaging
729
+ model_only = {k: v for k, v in model_results.items() if not k.startswith('_')}
730
+ for model_name, results in model_only.items():
731
+ faith = results.get('Faithfulness', 0)
732
+ rel = results.get('Relevancy', 0)
733
+ combined = faith + rel
734
+ all_faith.append(faith)
735
+ all_rel.append(rel)
736
+
737
+ if combined > best_combined:
738
+ best_combined = combined
739
+ best_model_name = model_name
740
+
741
+ if all_faith:
742
+ avg_faith = sum(all_faith) / len(all_faith)
743
+ avg_rel = sum(all_rel) / len(all_rel)
744
+ avg_chunk_score = sum(all_chunk_scores) / len(all_chunk_scores) if all_chunk_scores else 0
745
+ print(f"{tech_name:<15} {strategy_label:<20} {avg_chunk_score:>12.4f} {avg_faith:>11.1f}% {avg_rel:>12.3f} {best_model_name or 'N/A':<20}")
746
+ else:
747
+ print(f"{tech_name:<15} {strategy_label:<20} {'N/A':>12} {'N/A':>12} {'N/A':>12} {'N/A':<20}")
748
 
749
+ print("-" * 92)
750
 
751
  print("\n✓ Ablation study complete!")
752
  print(f"✓ Results saved to: {findings_file}")
requirements.txt CHANGED
@@ -95,3 +95,6 @@ zstandard==0.25.0
95
  groq==1.1.2
96
  jiter==0.13.0
97
  openai==2.30.0
 
 
 
 
95
  groq==1.1.2
96
  jiter==0.13.0
97
  openai==2.30.0
98
+ pinecone-text>=0.11.0
99
+ voyageai==0.3.7
100
+
retriever/evaluator.py CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
10
  # ------------------------------------------------------------------
11
 
12
  class GroqJudge:
13
- def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.2",):
14
  """
15
  Wraps OpenRouter's chat completions to match the .generate(prompt) interface
16
  expected by RAGEvaluator.
@@ -27,8 +27,6 @@ class GroqJudge:
27
 
28
  # Fallback models in order of preference (OpenRouter free models)
29
  self.fallback_models = [
30
- "deepseek/deepseek-v3.2",
31
- "qwen/qwen3.6-plus-preview:free",
32
  "stepfun/step-3.5-flash:free",
33
  "nvidia/nemotron-3-super-120b-a12b:free",
34
  "z-ai/glm-4.5-air:free",
@@ -228,7 +226,10 @@ class RAGEvaluator:
228
  return {"score": 0, "queries": []}
229
 
230
  # --- Step B: Similarity (single batched encode call) ---
231
- all_vecs = self.encoder.encode([query] + gen_queries)
 
 
 
232
  original_vec = all_vecs[0:1]
233
  generated_vecs = all_vecs[1:]
234
 
 
10
  # ------------------------------------------------------------------
11
 
12
  class GroqJudge:
13
+ def __init__(self, api_key: str, model: str = "stepfun/step-3.5-flash:free"):
14
  """
15
  Wraps OpenRouter's chat completions to match the .generate(prompt) interface
16
  expected by RAGEvaluator.
 
27
 
28
  # Fallback models in order of preference (OpenRouter free models)
29
  self.fallback_models = [
 
 
30
  "stepfun/step-3.5-flash:free",
31
  "nvidia/nemotron-3-super-120b-a12b:free",
32
  "z-ai/glm-4.5-air:free",
 
226
  return {"score": 0, "queries": []}
227
 
228
  # --- Step B: Similarity (single batched encode call) ---
229
+ try:
230
+ all_vecs = self.encoder.encode([query] + gen_queries)
231
+ except AttributeError:
232
+ all_vecs = np.array([self.encoder.encode(text) for text in [query] + gen_queries])
233
  original_vec = all_vecs[0:1]
234
  generated_vecs = all_vecs[1:]
235
 
retriever/generator.py CHANGED
@@ -8,21 +8,22 @@ class RAGGenerator:
8
  else:
9
  context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
10
 
11
- return f"""You are a specialized Cognitive Behavioral Therapy (CBT) assistant. Your task is to provide accurate, clinical, and structured answers based ONLY on the provided textbook excerpts.
12
 
13
  INSTRUCTIONS:
14
- 1. Use the provided Sources to answer the question.
15
- 2. CITATIONS: You must cite the sources used in your answer (e.g., "CBT is based on the cognitive model [Source 1]").
16
- 3. FORMAT: Use clear headers and bullet points for complex explanations.
17
- 4. GROUNDING: If the sources do not contain the answer, explicitly state: "The provided excerpts from the textbook do not contain information to answer this specific question." Do not use your own internal knowledge.
18
- 5. TONE: Maintain a professional, empathetic, and academic tone.
19
-
20
- RETRIVED TEXTBOOK CONTEXT:
 
21
  {context_text}
22
 
23
- USER QUESTION: {query}
24
 
25
- ACADEMIC ANSWER (WITH CITATIONS):"""
26
 
27
  def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
28
  """Uses a specific model instance to generate the final answer."""
@@ -42,4 +43,4 @@ ACADEMIC ANSWER (WITH CITATIONS):"""
42
  # Fallback for model wrappers that only expose sync generation.
43
  answer = model_instance.generate(prompt, **kwargs)
44
  if answer:
45
- yield answer
 
8
  else:
9
  context_text = "\n\n".join([f"[Source {i+1}]: {c}" for i, c in enumerate(retrieved_contexts)])
10
 
11
+ return f"""You are an empathetic Cognitive Behavioral Therapy (CBT) therapist speaking directly to a client. **Your task is to provide a therapeutic, helpful response based ONLY on the provided clinical documents and excerpts**.
12
 
13
  INSTRUCTIONS:
14
+ 1. THERAPEUTIC DIALOGUE: Respond directly to the user as your client. Start by briefly validating their feelings, then gently apply CBT concepts, psychoeducation, or interventions found STRICTLY in the provided documents.
15
+ 2. PATIENT EXAMPLES & NAMES (CRITICAL): The provided documents contain transcripts and examples of other patients and therapists (e.g., Abe, Judith, Joseph). These are illustrative case studies ONLY. DO NOT assume the user is "Abe" or any other person mentioned in the text. NEVER address or refer to the user by these names. Extract the CBT concepts/techniques demonstrated in these transcripts and apply them to the current user's unique situation.
16
+ 3. GROUNDING (NO OPINIONS): Do not give your own opinions, general life advice, or use outside knowledge. Every therapeutic concept, identified cognitive distortion, or suggested exercise must come directly from the provided text.
17
+ 4. CITATIONS: You must cite the sources used in your response to show where the clinical guidance comes from (e.g., "It sounds like you might be experiencing what is known as 'all-or-nothing thinking' [Source 1]").
18
+ 5. FORMAT: Use clear Markdown formatting. Use paragraphs for conversational tone, and bullet points if you are breaking down specific steps, questions, or exercises found in the text.
19
+ 6. MISSING INFO: If the provided excerpts do not contain relevant CBT concepts to address the client's specific statement, explicitly state: "While I hear how difficult this is for you, the clinical materials I have right now do not contain specific steps to address this." Do not invent therapeutic advice.
20
+
21
+ RETRIEVED CLINICAL CONTEXT:
22
  {context_text}
23
 
24
+ CLIENT STATEMENT: {query}
25
 
26
+ THERAPEUTIC RESPONSE (GROUNDED IN SOURCES):"""
27
 
28
  def get_answer(self, model_instance, query, retrieved_contexts, context_urls=None, **kwargs):
29
  """Uses a specific model instance to generate the final answer."""
 
43
  # Fallback for model wrappers that only expose sync generation.
44
  answer = model_instance.generate(prompt, **kwargs)
45
  if answer:
46
+ yield answer
retriever/processor.py CHANGED
@@ -88,14 +88,72 @@ class MarkdownTextSplitter:
88
 
89
 
90
  class ChunkProcessor:
91
- def __init__(self, model_name='all-MiniLM-L6-v2', verbose: bool = True, load_hf_embeddings: bool = False):
 
 
 
 
 
 
 
 
 
92
  self.model_name = model_name
 
93
  self._use_remote_code = self._requires_remote_code(model_name)
 
94
  st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
95
- self.encoder = SentenceTransformer(model_name, **st_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  self.verbose = verbose
97
  hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
98
  self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
 
99
 
100
  def _requires_remote_code(self, model_name: str) -> bool:
101
  normalized = (model_name or "").strip().lower()
 
88
 
89
 
90
  class ChunkProcessor:
91
+ def __init__(self, model_name='jinaai/jina-embeddings-v2-small-en', verbose: bool = True, load_hf_embeddings: bool = False):
92
+ import sys
93
+ import os
94
+
95
+ # Set environment variables to limit memory usage BEFORE importing torch
96
+ os.environ["OMP_NUM_THREADS"] = "2"
97
+ os.environ["MKL_NUM_THREADS"] = "2"
98
+ os.environ["OPENBLAS_NUM_THREADS"] = "2"
99
+
100
+ print(f"[DEBUG-ChunkProcessor] Starting init with model: {model_name}", flush=True)
101
  self.model_name = model_name
102
+ print(f"[DEBUG-ChunkProcessor] Checking if remote code needed...", flush=True)
103
  self._use_remote_code = self._requires_remote_code(model_name)
104
+ print(f"[DEBUG-ChunkProcessor] Remote code needed: {self._use_remote_code}", flush=True)
105
  st_kwargs = {"trust_remote_code": True} if self._use_remote_code else {}
106
+
107
+ # Set torch threads to limit parallelism
108
+ import torch
109
+ torch.set_num_threads(2)
110
+ torch.set_num_interop_threads(2)
111
+ print(f"[DEBUG-ChunkProcessor] Torch threads set to 2", flush=True)
112
+
113
+ print(f"[DEBUG-ChunkProcessor] Loading SentenceTransformer with kwargs: {st_kwargs}", flush=True)
114
+ sys.stdout.flush()
115
+
116
+ # Do not force CPU when CUDA is available; let SentenceTransformer choose the device (or pass an explicit device map).
117
+ import torch
118
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
119
+ import numpy as np
120
+ try:
121
+ if self._use_remote_code:
122
+ print("[DEBUG-ChunkProcessor] Using HuggingFaceEmbeddings-based encoder for remote model", flush=True)
123
+ hf_kwargs = {"model_kwargs": {"trust_remote_code": True}}
124
+ hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
125
+
126
+ class HFEncoderShim:
127
+ def __init__(self, hf_client):
128
+ self._hf = hf_client
129
+ def encode(self, text: str):
130
+ vecs = self._hf.embed_documents([text])
131
+ return np.array(vecs[0], dtype=float)
132
+
133
+ self.encoder = HFEncoderShim(hf)
134
+ self.hf_embeddings = hf
135
+ else:
136
+ device = "cuda" if torch.cuda.is_available() else "cpu"
137
+ self.encoder = SentenceTransformer(model_name, device=device, **st_kwargs)
138
+ print("[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
139
+ except Exception as e:
140
+ print(f"[DEBUG-ChunkProcessor] encoder init failed: {e}. Falling back to HuggingFaceEmbeddings.", flush=True)
141
+ hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
142
+ hf = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs)
143
+ class HFEncoderShim:
144
+ def __init__(self, hf_client):
145
+ self._hf = hf_client
146
+ def encode(self, text: str):
147
+ vecs = self._hf.embed_documents([text])
148
+ return np.array(vecs[0], dtype=float)
149
+ self.encoder = HFEncoderShim(hf)
150
+ self.hf_embeddings = hf
151
+ print(f"[DEBUG-ChunkProcessor] SentenceTransformer loaded successfully", flush=True)
152
+ sys.stdout.flush()
153
  self.verbose = verbose
154
  hf_kwargs = {"model_kwargs": {"trust_remote_code": True}} if self._use_remote_code else {}
155
  self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name, **hf_kwargs) if load_hf_embeddings else None
156
+ print(f"[DEBUG-ChunkProcessor] ChunkProcessor init complete", flush=True)
157
 
158
  def _requires_remote_code(self, model_name: str) -> bool:
159
  normalized = (model_name or "").strip().lower()
retriever/retriever.py CHANGED
@@ -5,41 +5,58 @@ from rank_bm25 import BM25Okapi
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from typing import Optional, List
7
 
 
 
8
  # changed mmr to return final k, as a param, prev was hardcoded to 3
9
  # --@Qamare
10
 
11
  # Try to import FlashRank for CPU optimization, fallback to sentence-transformers
12
- try:
13
- from flashrank import Ranker, RerankRequest
14
- FLASHRANK_AVAILABLE = True
15
- except ImportError:
16
- from sentence_transformers import CrossEncoder
17
- FLASHRANK_AVAILABLE = False
18
 
19
  class HybridRetriever:
20
- def __init__(self, final_chunks, embed_model, rerank_model_name='jinaai/jina-reranker-v1-tiny-en', verbose: bool = True):
21
- self.final_chunks = final_chunks
 
 
22
  self.embed_model = embed_model
23
  self.verbose = verbose
24
  self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
25
-
26
- # Use FlashRank if available (faster on CPU), otherwise fallback to sentence-transformers
27
- if FLASHRANK_AVAILABLE:
 
 
 
 
 
28
  try:
29
- self.rerank_model = Ranker(model_name=self.rerank_model_name)
30
- self.use_flashrank = True
31
- except Exception:
32
- from sentence_transformers import CrossEncoder as STCrossEncoder
33
- self.rerank_model = STCrossEncoder(self.rerank_model_name)
34
- self.use_flashrank = False
35
- else:
36
- self.rerank_model = CrossEncoder(self.rerank_model_name)
37
- self.use_flashrank = False
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Better tokenization for BM25 (strips punctuation)
40
- self.tokenized_corpus = [self._tokenize(chunk['metadata']['text']) for chunk in final_chunks]
41
- self.bm25 = BM25Okapi(self.tokenized_corpus)
42
- self.technique_to_indices = self._build_chunking_index_map()
43
 
44
  def _normalize_rerank_model_name(self, model_name: str) -> str:
45
  normalized = (model_name or "").strip()
@@ -76,32 +93,70 @@ class HybridRetriever:
76
  # Retrieval
77
  # ------------------------------------------------------------------
78
 
79
- def _semantic_search(self, query, index, top_k, chunking_technique: Optional[str] = None) -> tuple[np.ndarray, List[str]]:
80
  query_vector = self.embed_model.encode(query)
81
  query_kwargs = {
82
  "vector": query_vector.tolist(),
83
  "top_k": top_k,
84
  "include_metadata": True,
85
  }
86
- if chunking_technique:
87
- query_kwargs["filter"] = {"chunking_technique": {"$eq": chunking_technique}}
88
- res = index.query(**query_kwargs)
 
 
 
89
  chunks = [match['metadata']['text'] for match in res['matches']]
90
  return query_vector, chunks
91
 
92
- def _bm25_search(self, query, top_k, chunking_technique: Optional[str] = None) -> List[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  tokenized_query = self._tokenize(query)
94
- scores = self.bm25.get_scores(tokenized_query)
95
-
96
- if chunking_technique:
97
- candidate_indices = self.technique_to_indices.get(chunking_technique, [])
98
- if not candidate_indices:
99
- return []
100
- top_indices = sorted(candidate_indices, key=lambda i: scores[i], reverse=True)[:top_k]
101
- else:
102
- top_indices = np.argsort(scores)[::-1][:top_k]
103
-
104
- return [self.final_chunks[i]['metadata']['text'] for i in top_indices]
105
 
106
  # ------------------------------------------------------------------
107
  # Fusion
@@ -119,20 +174,22 @@ class HybridRetriever:
119
  # Reranking
120
  # ------------------------------------------------------------------
121
 
122
- def _cross_encoder_rerank(self, query, chunks, final_k) -> List[str]:
123
- if self.use_flashrank:
124
- # Use FlashRank for CPU-optimized reranking
125
- passages = [{"id": i, "text": chunk} for i, chunk in enumerate(chunks)]
126
- rerank_request = RerankRequest(query=query, passages=passages)
127
- results = self.rerank_model.rerank(rerank_request)
128
- ranked_chunks = [res['text'] for res in results]
129
- return ranked_chunks[:final_k]
130
- else:
131
- # Fallback to sentence-transformers CrossEncoder
132
- pairs = [[query, chunk] for chunk in chunks]
133
- scores = self.rerank_model.predict(pairs)
134
- ranked = sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
135
- return [chunk for chunk, _ in ranked[:final_k]]
 
 
136
 
137
  # ------------------------------------------------------------------
138
  # MMR (applied after reranking as a diversity filter)
@@ -155,7 +212,7 @@ class HybridRetriever:
155
  # STEP 1: Encode chunks to get embeddings
156
  print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
157
  try:
158
- chunk_embeddings = self.embed_model.encode(chunks)
159
  print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
160
  except Exception as e:
161
  print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
@@ -246,24 +303,30 @@ class HybridRetriever:
246
  # Main search
247
  # ------------------------------------------------------------------
248
 
249
- def search(self, query, index, top_k=25, final_k=5, mode="hybrid",
250
- chunking_technique: Optional[str] = None,
251
  rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
252
- verbose: Optional[bool] = None) -> List[str]:
 
 
253
  """
254
  :param mode: "semantic", "bm25", or "hybrid"
255
  :param rerank_strategy: "cross-encoder", "rrf", or "none"
256
  :param use_mmr: Whether to apply MMR diversity filter after reranking
257
  :param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
 
 
258
  """
259
  should_print = verbose if verbose is not None else self.verbose
260
- requested_technique = self._normalize_chunking_technique(chunking_technique)
261
  total_start = time.perf_counter()
262
  semantic_time = 0.0
263
  bm25_time = 0.0
264
  rerank_time = 0.0
265
  mmr_time = 0.0
266
 
 
 
 
267
  if should_print:
268
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
269
  if requested_technique:
@@ -283,25 +346,32 @@ class HybridRetriever:
283
 
284
  if mode in ["bm25", "hybrid"]:
285
  bm25_start = time.perf_counter()
286
- bm25_chunks = self._bm25_search(query, top_k, requested_technique)
287
  bm25_time = time.perf_counter() - bm25_start
288
  if should_print:
289
  self._print_candidates("BM25 Search", bm25_chunks)
290
  print(f"BM25 time: {bm25_time:.3f}s")
 
 
 
291
 
292
  # 2. Fuse / rerank
293
  rerank_start = time.perf_counter()
 
294
  if rerank_strategy == "rrf":
295
  candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
296
  label = "RRF"
297
  elif rerank_strategy == "cross-encoder":
298
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
299
- candidates = self._cross_encoder_rerank(query, combined, final_k)
300
  label = "Cross-Encoder"
301
  else: # "none"
302
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
303
  label = "No Reranking"
304
  rerank_time = time.perf_counter() - rerank_start
 
 
 
305
 
306
  # 3. MMR diversity filter (applied after reranking)
307
  if use_mmr and candidates:
@@ -313,13 +383,17 @@ class HybridRetriever:
313
  label += " + MMR"
314
  mmr_time = time.perf_counter() - mmr_start
315
 
 
 
 
 
316
  total_time = time.perf_counter() - total_start
317
 
318
  if should_print:
319
  self._print_final_results(candidates, label)
320
  self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
321
 
322
- return candidates
323
 
324
  # ------------------------------------------------------------------
325
  # Printing
 
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from typing import Optional, List
7
 
8
+ #
9
+
10
  # changed mmr to return final k, as a param, prev was hardcoded to 3
11
  # --@Qamare
12
 
13
  # Try to import FlashRank for CPU optimization, fallback to sentence-transformers
14
+ # try:
15
+ # from flashrank import Ranker, RerankRequest
16
+ # FLASHRANK_AVAILABLE = True
17
+ # except ImportError:
18
+ # from sentence_transformers import CrossEncoder
19
+ # FLASHRANK_AVAILABLE = False
20
 
21
  class HybridRetriever:
22
def __init__(self, embed_model, rerank_model_name='jinaai/jina-reranker-v1-tiny-en', verbose: bool = True):
    """Hybrid (dense + sparse) retriever with a pluggable reranker backend.

    Prefers the hosted Voyage AI reranker when ``VOYAGE_API_KEY`` is set in
    the environment; otherwise falls back to a local sentence-transformers
    CrossEncoder.

    :param embed_model: Encoder exposing ``encode(text)`` (used by
        ``_semantic_search``); assumed to return a numpy vector — TODO confirm.
    :param rerank_model_name: Requested reranker model name; it is
        normalized and may be replaced by a backend-appropriate default.
    :param verbose: Default verbosity used by ``search`` when its
        ``verbose`` argument is None.
    """
    import sys
    import os
    print("[DEBUG-HybridRetriever] Starting init", flush=True)
    self.embed_model = embed_model
    self.verbose = verbose
    self.rerank_model_name = self._normalize_rerank_model_name(rerank_model_name)
    print(f"[DEBUG-HybridRetriever] Rerank model name: {self.rerank_model_name}", flush=True)

    # Reranker state: exactly one of vo_client / ce_reranker ends up set.
    self.vo_client = None
    self.ce_reranker = None
    self.reranker_backend = "cross-encoder"

    voyage_api_key = os.getenv("VOYAGE_API_KEY")
    if voyage_api_key:
        try:
            import voyageai
            self.vo_client = voyageai.Client(api_key=voyage_api_key)
            self.reranker_backend = "voyageai"
            # Voyage uses model IDs like rerank-2.5; keep a safe default.
            if not self.rerank_model_name.startswith("rerank-"):
                self.rerank_model_name = "rerank-2.5"
            print("[DEBUG-HybridRetriever] Voyage AI client initialized", flush=True)
        except Exception as exc:
            # Best-effort: any Voyage failure (missing package, bad key)
            # falls through to the local cross-encoder below.
            print(f"[DEBUG-HybridRetriever] Voyage unavailable ({exc}); falling back to cross-encoder", flush=True)

    if self.vo_client is None:
        from sentence_transformers import CrossEncoder
        ce_model_name = self.rerank_model_name
        # The configured name may be a Voyage/Jina id; swap in a known
        # CrossEncoder checkpoint when it is not a cross-encoder model.
        if not ce_model_name.startswith("cross-encoder/"):
            ce_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
        self.ce_reranker = CrossEncoder(ce_model_name)
        self.rerank_model_name = ce_model_name
        self.reranker_backend = "cross-encoder"
        print(f"[DEBUG-HybridRetriever] Cross-encoder reranker initialized: {ce_model_name}", flush=True)

    sys.stdout.flush()
    print("[DEBUG-HybridRetriever] Init complete", flush=True)
 
 
60
 
61
  def _normalize_rerank_model_name(self, model_name: str) -> str:
62
  normalized = (model_name or "").strip()
 
93
  # Retrieval
94
  # ------------------------------------------------------------------
95
 
96
+ def _semantic_search(self, query, index, top_k, technique_name: Optional[str] = None) -> tuple[np.ndarray, List[str]]:
97
  query_vector = self.embed_model.encode(query)
98
  query_kwargs = {
99
  "vector": query_vector.tolist(),
100
  "top_k": top_k,
101
  "include_metadata": True,
102
  }
103
+ if technique_name:
104
+ query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
105
+
106
+ res = index.query(
107
+ **query_kwargs
108
+ )
109
  chunks = [match['metadata']['text'] for match in res['matches']]
110
  return query_vector, chunks
111
 
112
+ def _bm25_search(self, query, index, top_k=50, technique_name: Optional[str] = None) -> List[str]:
113
+ try:
114
+ import os
115
+ from pinecone import Pinecone
116
+ from pinecone_text.sparse import BM25Encoder
117
+ encoder = BM25Encoder().default()
118
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
119
+ sparse_index = pc.Index("cbt-book-sparse")
120
+ sparse_vector = encoder.encode_queries(query)
121
+ query_kwargs = {
122
+ "sparse_vector": sparse_vector,
123
+ "top_k": top_k,
124
+ "include_metadata": True,
125
+ }
126
+ if technique_name:
127
+ query_kwargs["filter"] = {"chunking_technique": {"$eq": technique_name}}
128
+
129
+ res = sparse_index.query(**query_kwargs)
130
+ return [match["metadata"]["text"] for match in res["matches"]]
131
+ except Exception as e:
132
+ print(f"Error in BM25 search against Pinecone: {e}")
133
+ return []
134
+
135
+ """Fetch chunks from Pinecone and perform BM25 ranking locally."""
136
+ # Fetch more candidates than needed for BM25 to rank against
137
+ # Use a reasonable multiplier to get enough candidates without over-fetching
138
+ fetch_limit = min(top_k * 4,25) # e.g., 4*4=16, capped at 50
139
+ res = index.query(
140
+ vector=[0.0] * 512, # Dummy vector (BM25 doesn't use embeddings)
141
+ top_k=fetch_limit,
142
+ include_metadata=True,
143
+ filter={"chunking_technique": {"$eq": technique_name}}
144
+ )
145
+
146
+ # Extract chunks
147
+ chunks = [match['metadata']['text'] for match in res['matches']]
148
+ if not chunks:
149
+ return []
150
+
151
+ # Build BM25 index on these chunks
152
+ tokenized_corpus = [self._tokenize(chunk) for chunk in chunks]
153
+ bm25 = BM25Okapi(tokenized_corpus)
154
+
155
+ # Score query against chunks
156
  tokenized_query = self._tokenize(query)
157
+ scores = bm25.get_scores(tokenized_query)
158
+ top_indices = np.argsort(scores)[::-1][:top_k]
159
+ return [chunks[i] for i in top_indices]
 
 
 
 
 
 
 
 
160
 
161
  # ------------------------------------------------------------------
162
  # Fusion
 
174
  # Reranking
175
  # ------------------------------------------------------------------
176
 
177
+ def _cross_encoder_rerank(self, query, chunks, final_k) -> tuple[List[str], List[float]]:
178
+ if not chunks:
179
+ return [], []
180
+
181
+ if self.vo_client is not None:
182
+ reranking = self.vo_client.rerank(query, chunks, model=self.rerank_model_name, top_k=final_k)
183
+ ranked_chunks = [result.document for result in reranking.results]
184
+ ranked_scores = [result.relevance_score for result in reranking.results]
185
+ return ranked_chunks, ranked_scores
186
+
187
+ pairs = [[query, chunk] for chunk in chunks]
188
+ scores = self.ce_reranker.predict(pairs)
189
+ ranked_indices = np.argsort(scores)[::-1][:final_k]
190
+ ranked_chunks = [chunks[i] for i in ranked_indices]
191
+ ranked_scores = [float(scores[i]) for i in ranked_indices]
192
+ return ranked_chunks, ranked_scores
193
 
194
  # ------------------------------------------------------------------
195
  # MMR (applied after reranking as a diversity filter)
 
212
  # STEP 1: Encode chunks to get embeddings
213
  print(f" [MMR DEBUG] Encoding {len(chunks)} chunks...")
214
  try:
215
+ chunk_embeddings = np.array([self.embed_model.encode(c) for c in chunks])
216
  print(f" [MMR DEBUG] Chunk embeddings shape: {chunk_embeddings.shape}")
217
  except Exception as e:
218
  print(f" [MMR DEBUG] ERROR encoding chunks: {e}")
 
303
  # Main search
304
  # ------------------------------------------------------------------
305
 
306
+ def search(self, query, index, top_k=50, final_k=5, mode="hybrid",
 
307
  rerank_strategy="cross-encoder", use_mmr=False, lambda_param=0.5,
308
+ technique_name: Optional[str] = None,
309
+ chunking_technique: Optional[str] = None,
310
+ verbose: Optional[bool] = None, test: bool = False) -> tuple[List[str], float]:
311
  """
312
  :param mode: "semantic", "bm25", or "hybrid"
313
  :param rerank_strategy: "cross-encoder", "rrf", or "none"
314
  :param use_mmr: Whether to apply MMR diversity filter after reranking
315
  :param lambda_param: MMR trade-off between relevance (1.0) and diversity (0.0)
316
+ :param technique_name: Chunking technique to filter by (default: "markdown")
317
+ :returns: Tuple of (ranked_chunks, avg_chunk_score)
318
  """
319
  should_print = verbose if verbose is not None else self.verbose
320
+ requested_technique = self._normalize_chunking_technique(chunking_technique or technique_name)
321
  total_start = time.perf_counter()
322
  semantic_time = 0.0
323
  bm25_time = 0.0
324
  rerank_time = 0.0
325
  mmr_time = 0.0
326
 
327
+ if use_mmr:
328
+ final_k = 10
329
+
330
  if should_print:
331
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
332
  if requested_technique:
 
346
 
347
  if mode in ["bm25", "hybrid"]:
348
  bm25_start = time.perf_counter()
349
+ bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
350
  bm25_time = time.perf_counter() - bm25_start
351
  if should_print:
352
  self._print_candidates("BM25 Search", bm25_chunks)
353
  print(f"BM25 time: {bm25_time:.3f}s")
354
+ print("All BM25 results:")
355
+ for i, chunk in enumerate(bm25_chunks):
356
+ print(f" [{i}] {chunk[:200]}..." if len(chunk) > 200 else f" [{i}] {chunk}")
357
 
358
  # 2. Fuse / rerank
359
  rerank_start = time.perf_counter()
360
+ chunk_scores = []
361
  if rerank_strategy == "rrf":
362
  candidates = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
363
  label = "RRF"
364
  elif rerank_strategy == "cross-encoder":
365
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
366
+ candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
367
  label = "Cross-Encoder"
368
  else: # "none"
369
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
370
  label = "No Reranking"
371
  rerank_time = time.perf_counter() - rerank_start
372
+
373
+ # Compute average chunk score
374
+ avg_chunk_score = float(np.mean(chunk_scores)) if chunk_scores else 0.0
375
 
376
  # 3. MMR diversity filter (applied after reranking)
377
  if use_mmr and candidates:
 
383
  label += " + MMR"
384
  mmr_time = time.perf_counter() - mmr_start
385
 
386
+ if test and rerank_strategy != "cross-encoder" and candidates:
387
+ _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
388
+ avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0
389
+
390
  total_time = time.perf_counter() - total_start
391
 
392
  if should_print:
393
  self._print_final_results(candidates, label)
394
  self._print_timing_summary(semantic_time, bm25_time, rerank_time, mmr_time, total_time)
395
 
396
+ return candidates, avg_chunk_score
397
 
398
  # ------------------------------------------------------------------
399
  # Printing
test.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ os.environ["MKL_NUM_THREADS"] = "1"
4
+ import sys
5
+ import traceback
6
+ from datetime import datetime
7
+ from dotenv import load_dotenv
8
+
9
+ from config_loader import cfg
10
+ from data.vector_db import get_index_by_name
11
+ from retriever.retriever import HybridRetriever
12
+ from retriever.processor import ChunkProcessor
13
+ from data.ingest import CHUNKING_TECHNIQUES
14
+
15
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """Write a Markdown report of retrieved chunks per query/technique/strategy.

    :param all_results: Mapping of query index -> {result_key: {'chunks':
        [...], 'score': float}} as produced by the evaluation loop in ``main``.
    :param queries: Query strings, indexed by the keys of ``all_results``.
    :param output_file: Destination path for the Markdown report.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Accumulate fragments in a list and join once at the end: repeated
    # `content += ...` is quadratic when many large chunks are embedded.
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]
    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")

    parts.append("\n## Retrieval Results by Query\n\n")

    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")

        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")

            chunks = chunks_data.get('chunks', [])
            score = chunks_data.get('score', 0)

            parts.append(f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n")

            if not chunks:
                parts.append("*No chunks retrieved.*\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"**[Chunk {i}]** ({len(chunk)} chars):\n")
                    parts.append(f"```text\n{chunk}\n```\n\n")

            parts.append("---\n\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\nRetrieval report saved to: {output_file}")
53
+
54
+
55
+ def main():
56
+ # Load environment variables
57
+ load_dotenv()
58
+
59
+ pinecone_key = os.getenv("PINECONE_API_KEY")
60
+ if not pinecone_key:
61
+ raise RuntimeError("PINECONE_API_KEY not found in environment variables")
62
+
63
+ test_queries = [
64
+ "What is cognitive behavior therapy and how does it work?",
65
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
66
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
67
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
68
+ ]
69
+
70
+ # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
71
+ # Use all 7 chunking techniques from ingest.py
72
+ CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
73
+ print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
74
+ for tech in CHUNKING_TECHNIQUES_FILTERED:
75
+ print(f" - {tech['name']}: {tech['description']}")
76
+
77
+ RETRIEVAL_STRATEGIES = [
78
+ {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
79
+ {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr"},
80
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
81
+ {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr"},
82
+ {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr"},
83
+ ]
84
+
85
+ print("Initializing ChunkProcessor to load Embedding Model...")
86
+ proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
87
+
88
+
89
+ print("Initializing HybridRetriever...")
90
+ retriever = HybridRetriever(
91
+ embed_model=proc.encoder,
92
+ rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
93
+ verbose=False
94
+ )
95
+
96
+ all_query_results = {}
97
+
98
+ for query_idx, query in enumerate(test_queries):
99
+ print(f"\n{'='*80}")
100
+ print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
101
+ print(f"{'='*80}")
102
+
103
+ query_results = {}
104
+
105
+ # Connect to the single index where all techniques are stored with metadata differentiation
106
+ index_name = "cbt-book-recursive"
107
+ try:
108
+ index = get_index_by_name(pinecone_key, index_name)
109
+ stats = index.describe_index_stats()
110
+ if stats.get('total_vector_count', 0) == 0:
111
+ print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
112
+ except Exception as e:
113
+ print(f" [X] Failed to connect to index {index_name}: {e}")
114
+ continue
115
+
116
+ for technique in CHUNKING_TECHNIQUES_FILTERED:
117
+ technique_name = technique['name']
118
+
119
+ for strategy in RETRIEVAL_STRATEGIES:
120
+ result_key = f"{technique_name} + {strategy['label']}"
121
+ print(f"\nEvaluating: {result_key}")
122
+
123
+ try:
124
+ context_chunks, chunk_score = retriever.search(
125
+ query=query,
126
+ index=index,
127
+ mode=strategy['mode'],
128
+ rerank_strategy="cross-encoder",
129
+ use_mmr=strategy['use_mmr'],
130
+ top_k=25,
131
+ final_k=4,
132
+ technique_name=technique_name,
133
+ verbose=False,
134
+ test=True
135
+ )
136
+
137
+ query_results[result_key] = {
138
+ 'chunks': context_chunks,
139
+ 'score': chunk_score
140
+ }
141
+ print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
142
+
143
+ except Exception as e:
144
+ print(f" -> Error retrieving for {result_key}: {e}")
145
+
146
+ all_query_results[query_idx] = query_results
147
+
148
+ # Generate isolated retrieval test report
149
+ generate_retrieval_report(all_query_results, test_queries)
150
+
151
+
152
+ if __name__ == '__main__':
153
+ main()