Qar-Raz committed on
Commit
c64aaec
·
verified ·
1 Parent(s): 8f37cc7

Sync backend Docker context from GitHub main

Browse files
config.yaml CHANGED
@@ -29,16 +29,17 @@ retrieval:
29
  rerank_strategy: "cross-encoder"
30
  use_mmr: False
31
  top_k: 50
32
- final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
- max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
  judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
 
42
  - "Llama-3-8B"
 
43
  - "Mistral-7B"
44
- - "TinyAya"
 
29
  rerank_strategy: "cross-encoder"
30
  use_mmr: False
31
  top_k: 50
32
+ final_k: 4
33
 
34
  generation:
35
  temperature: 0.
36
+ max_new_tokens: 1500
37
  # The model used to Judge the others (OpenRouter)
38
  judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
42
+ - "TinyAya"
43
  - "Llama-3-8B"
44
+ - "Qwen-3.5-9B"
45
  - "Mistral-7B"
 
data/ingest.py CHANGED
@@ -15,27 +15,34 @@ from retriever.processor import ChunkProcessor
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
18
- # {
19
- # "name": "fixed",
20
- # "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
- # "chunk_size": 1000,
22
- # "chunk_overlap": 100,
23
- # "kwargs": {"separator": ""}, # No separator for fixed splitting
24
- # },
25
- # {
26
- # "name": "sentence",
27
- # "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
- # "chunk_size": 1000,
29
- # "chunk_overlap": 100,
30
- # "kwargs": {},
31
- # },
32
- # {
33
- # "name": "paragraph",
34
- # "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
- # "chunk_size": 2500,
36
- # "chunk_overlap": 100,
37
- # "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
- # },
 
 
 
 
 
 
 
39
  {
40
  "name": "semantic",
41
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
@@ -43,27 +50,27 @@ CHUNKING_TECHNIQUES = [
43
  "chunk_overlap": 100,
44
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
  },
46
- # {
47
- # "name": "recursive",
48
- # "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
49
- # "chunk_size": 2000,
50
- # "chunk_overlap": 100,
51
- # "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
- # },
53
- # {
54
- # "name": "page",
55
- # "description": "Page-level chunking - uses entire book pages as-is",
56
- # "chunk_size": 10000, # Very large to keep full pages
57
- # "chunk_overlap": 0, # No overlap between pages
58
- # "kwargs": {"separator": "--- Page"}, # Split on page markers
59
- # },
60
- # {
61
- # "name": "markdown",
62
- # "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
63
- # "chunk_size": 4000, # Max 4k chars per chunk
64
- # "chunk_overlap": 0, # No overlap for markdown
65
- # "kwargs": {}, # Custom implementation
66
- # },
67
  ]
68
 
69
 
 
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
18
+ {
19
+ "name": "fixed",
20
+ "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
+ "chunk_size": 1000,
22
+ "chunk_overlap": 100,
23
+ "kwargs": {"separator": ""}, # No separator for fixed splitting
24
+ },
25
+ {
26
+ "name": "sentence",
27
+ "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
+ "chunk_size": 1000,
29
+ "chunk_overlap": 100,
30
+ "kwargs": {},
31
+ },
32
+ {
33
+ "name": "paragraph",
34
+ "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
+ "chunk_size": 2500,
36
+ "chunk_overlap": 100,
37
+ "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
+ },
39
+ {
40
+ "name": "semantic",
41
+ "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
+ "chunk_size": 2000,
43
+ "chunk_overlap": 100,
44
+ "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
+ },
46
  {
47
  "name": "semantic",
48
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
 
50
  "chunk_overlap": 100,
51
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
52
  },
53
+ {
54
+ "name": "recursive",
55
+ "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
56
+ "chunk_size": 2000,
57
+ "chunk_overlap": 100,
58
+ "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
59
+ },
60
+ {
61
+ "name": "page",
62
+ "description": "Page-level chunking - uses entire book pages as-is",
63
+ "chunk_size": 10000, # Very large to keep full pages
64
+ "chunk_overlap": 0, # No overlap between pages
65
+ "kwargs": {"separator": "--- Page"}, # Split on page markers
66
+ },
67
+ {
68
+ "name": "markdown",
69
+ "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
70
+ "chunk_size": 4000, # Max 4k chars per chunk
71
+ "chunk_overlap": 0, # No overlap for markdown
72
+ "kwargs": {}, # Custom implementation
73
+ },
74
  ]
75
 
76
 
main.py CHANGED
@@ -14,14 +14,16 @@ from data.data_loader import load_cbt_book, get_book_stats
14
  from data.ingest import ingest_data, CHUNKING_TECHNIQUES
15
 
16
  # Import model fleet
 
17
  from models.llama_3_8b import Llama3_8B
18
  from models.mistral_7b import Mistral_7b
19
  from models.tiny_aya import TinyAya
20
 
21
  MODEL_MAP = {
 
 
22
  "Llama-3-8B": Llama3_8B,
23
  "Mistral-7B": Mistral_7b,
24
- "TinyAya": TinyAya
25
  }
26
 
27
  load_dotenv()
@@ -39,6 +41,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
39
  print(f"{'='*80}")
40
 
41
  # Use HybridRetriever to retrieve chunks
 
42
  context_chunks, chunk_score = retriever.search(
43
  query=query,
44
  index=index,
@@ -46,11 +49,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
46
  rerank_strategy="cross-encoder",
47
  use_mmr=use_mmr,
48
  top_k=50,
49
- final_k=5,
50
  technique_name=technique_name,
51
  verbose=False
52
  )
53
 
 
54
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
55
 
56
  if not context_chunks:
@@ -72,6 +76,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
72
  tournament_results = {}
73
  tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
74
  tournament_results["_Strategy"] = strategy_label
 
75
 
76
  for name, model_inst in models.items():
77
  print(f"\n{'-'*60}")
@@ -79,10 +84,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
79
  print(f"{'-'*60}")
80
  try:
81
  # Generation
82
- answer = rag_engine.get_answer(
 
83
  model_inst, query, context_chunks,
84
  temperature=cfg.gen['temperature']
85
  )
 
86
 
87
  print(f"\n{'─'*60}")
88
  print(f"📝 FULL ANSWER from {name}:")
@@ -100,6 +107,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
100
  "Faithfulness": faith['score'],
101
  "Relevancy": rel['score'],
102
  "Claims": faith['details'],
 
 
 
 
103
  "context_chunks": context_chunks,
104
  }
105
 
@@ -107,6 +118,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
107
  print(f" Faithfulness: {faith['score']:.1f}%")
108
  print(f" Relevancy: {rel['score']:.3f}")
109
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
 
 
 
 
110
 
111
  except Exception as e:
112
  print(f" Error evaluating {name}: {e}")
@@ -115,6 +130,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
115
  "Faithfulness": 0,
116
  "Relevancy": 0,
117
  "Claims": [],
 
 
 
 
118
  "error": str(e),
119
  "context_chunks": context_chunks,
120
  }
@@ -186,13 +205,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
186
  'Faithfulness': [],
187
  'Relevancy': [],
188
  'answers': [],
 
 
189
  'context_chunks': results.get('context_chunks', []),
190
- 'context_urls': results.get('context_urls', [])
 
 
 
191
  }
192
 
193
  aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
194
  aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
195
  aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
 
 
 
 
 
 
 
 
196
 
197
  # Add results for each technique
198
  for technique_name, model_results in aggregated_results.items():
@@ -212,14 +244,19 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
212
  content += "\n"
213
 
214
  # Create results table with averaged scores
215
- content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
216
- content += "|-------|------------------|---------------|--------------|\n"
217
 
218
  for model_name, results in model_results.items():
219
  avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
220
  avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
221
  avg_combined = avg_faith + avg_rel
222
- content += f"| {model_name} | {avg_faith:.1f}% | {avg_rel:.3f} | {avg_combined:.3f} |\n"
 
 
 
 
 
223
 
224
  # Find best model for this technique
225
  if model_results:
@@ -266,7 +303,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
266
  # Show answers from each query
267
  for q_idx, answer in enumerate(answers):
268
  content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
269
- content += f"\n{answer}\n\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  content += "---\n\n"
272
 
@@ -275,8 +331,8 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
275
 
276
  ### Overall Performance Ranking (Across All Queries)
277
 
278
- | Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined |
279
- |------|-----------|------------------|---------------|--------------|
280
  """
281
 
282
  # Calculate averages for each technique across all queries
@@ -292,10 +348,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
292
  avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
293
  avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
294
  avg_combined = avg_faith + avg_rel
 
 
 
 
 
 
 
 
295
  technique_averages[technique_name] = {
296
  'faith': avg_faith,
297
  'rel': avg_rel,
298
- 'combined': avg_combined
 
 
 
299
  }
300
 
301
  # Sort by combined score
@@ -306,7 +373,7 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
306
  )
307
 
308
  for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
309
- content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} |\n"
310
 
311
  content += """
312
  ### Key Findings
@@ -362,6 +429,7 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
362
  return output_file
363
 
364
 
 
365
  def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
366
  """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
367
 
@@ -374,6 +442,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
374
  print(f"{'='*80}")
375
 
376
  # Use HybridRetriever to retrieve chunks
 
377
  context_chunks, chunk_score = retriever.search(
378
  query=query,
379
  index=index,
@@ -381,12 +450,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
381
  rerank_strategy="cross-encoder",
382
  use_mmr=use_mmr,
383
  top_k=50,
384
- final_k=5,
385
  technique_name=technique_name,
386
  verbose=False,
387
  test=True
388
  )
389
 
 
390
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
391
 
392
  if not context_chunks:
@@ -408,6 +478,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
408
  tournament_results = {}
409
  tournament_results["_ChunkScore"] = chunk_score
410
  tournament_results["_Strategy"] = strategy_label
 
411
 
412
  for name, model_inst in models.items():
413
  print(f"\n{'-'*60}")
@@ -415,10 +486,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
415
  print(f"{'-'*60}")
416
  try:
417
  # Generation
418
- answer = rag_engine.get_answer(
419
- model_inst, query, context_chunks,
420
- temperature=cfg.gen['temperature']
421
  )
 
 
 
422
 
423
  print(f"\n{'─'*60}")
424
  print(f"📝 FULL ANSWER from {name}:")
@@ -436,6 +510,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
436
  "Faithfulness": faith['score'],
437
  "Relevancy": rel['score'],
438
  "Claims": faith['details'],
 
 
 
 
439
  "context_chunks": context_chunks,
440
  }
441
 
@@ -443,6 +521,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
443
  print(f" Faithfulness: {faith['score']:.1f}%")
444
  print(f" Relevancy: {rel['score']:.3f}")
445
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
 
 
 
 
446
 
447
  except Exception as e:
448
  print(f" Error evaluating {name}: {e}")
@@ -451,6 +533,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
451
  "Faithfulness": 0,
452
  "Relevancy": 0,
453
  "Claims": [],
 
 
 
 
454
  "error": str(e),
455
  "context_chunks": context_chunks,
456
  }
@@ -474,11 +560,17 @@ def main():
474
 
475
  # Test queries
476
  test_queries = [
477
- "What is cognitive behavior therapy and how does it work?",
478
- "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
479
- "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
480
- "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
481
- ]
 
 
 
 
 
 
482
 
483
  print("=" * 80)
484
  print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
@@ -555,7 +647,7 @@ def main():
555
  ]
556
 
557
  # Filter to only 4 techniques to reduce memory usage
558
- TECHNIQUES_TO_EVALUATE = ["markdown", "recursive", "paragraph"]
559
  CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
560
 
561
  # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
 
14
  from data.ingest import ingest_data, CHUNKING_TECHNIQUES
15
 
16
  # Import model fleet
17
+ from models.qwen_3_5_9b import Qwen_3_5_9B
18
  from models.llama_3_8b import Llama3_8B
19
  from models.mistral_7b import Mistral_7b
20
  from models.tiny_aya import TinyAya
21
 
22
  MODEL_MAP = {
23
+ "Qwen-3.5-9B": Qwen_3_5_9B,
24
+ "TinyAya": TinyAya,
25
  "Llama-3-8B": Llama3_8B,
26
  "Mistral-7B": Mistral_7b,
 
27
  }
28
 
29
  load_dotenv()
 
41
  print(f"{'='*80}")
42
 
43
  # Use HybridRetriever to retrieve chunks
44
+ retrieval_start_time = time.time()
45
  context_chunks, chunk_score = retriever.search(
46
  query=query,
47
  index=index,
 
49
  rerank_strategy="cross-encoder",
50
  use_mmr=use_mmr,
51
  top_k=50,
52
+ final_k=4,
53
  technique_name=technique_name,
54
  verbose=False
55
  )
56
 
57
+ retrieval_time = time.time() - retrieval_start_time
58
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
59
 
60
  if not context_chunks:
 
76
  tournament_results = {}
77
  tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
78
  tournament_results["_Strategy"] = strategy_label
79
+ tournament_results["_retrieval_time"] = retrieval_time
80
 
81
  for name, model_inst in models.items():
82
  print(f"\n{'-'*60}")
 
84
  print(f"{'-'*60}")
85
  try:
86
  # Generation
87
+ inference_start_time = time.time()
88
+ answer = rag_engine.get_answer(
89
  model_inst, query, context_chunks,
90
  temperature=cfg.gen['temperature']
91
  )
92
+ inference_time = time.time() - inference_start_time
93
 
94
  print(f"\n{'─'*60}")
95
  print(f"📝 FULL ANSWER from {name}:")
 
107
  "Faithfulness": faith['score'],
108
  "Relevancy": rel['score'],
109
  "Claims": faith['details'],
110
+ "GenQueries": rel.get('queries', []),
111
+ "retrieval_time": retrieval_time,
112
+ "inference_time": inference_time,
113
+ "total_time": retrieval_time + inference_time,
114
  "context_chunks": context_chunks,
115
  }
116
 
 
118
  print(f" Faithfulness: {faith['score']:.1f}%")
119
  print(f" Relevancy: {rel['score']:.3f}")
120
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
121
+ print(f"⏱️ LATENCY METRICS:")
122
+ print(f" Retrieval: {retrieval_time:.2f}s")
123
+ print(f" Inference: {inference_time:.2f}s")
124
+ print(f" Total Response: {retrieval_time + inference_time:.2f}s")
125
 
126
  except Exception as e:
127
  print(f" Error evaluating {name}: {e}")
 
130
  "Faithfulness": 0,
131
  "Relevancy": 0,
132
  "Claims": [],
133
+ "GenQueries": [],
134
+ "retrieval_time": retrieval_time,
135
+ "inference_time": 0,
136
+ "total_time": retrieval_time,
137
  "error": str(e),
138
  "context_chunks": context_chunks,
139
  }
 
205
  'Faithfulness': [],
206
  'Relevancy': [],
207
  'answers': [],
208
+ 'claims': [],
209
+ 'gen_queries': [],
210
  'context_chunks': results.get('context_chunks', []),
211
+ 'context_urls': results.get('context_urls', []),
212
+ 'retrieval_time': [],
213
+ 'inference_time': [],
214
+ 'total_time': []
215
  }
216
 
217
  aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
218
  aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
219
  aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
220
+ aggregated_results[technique_name][model_name]['claims'].append(results.get('Claims', []))
221
+ aggregated_results[technique_name][model_name]['gen_queries'].append(results.get('GenQueries', []))
222
+ if 'retrieval_time' in results:
223
+ aggregated_results[technique_name][model_name]['retrieval_time'].append(results['retrieval_time'])
224
+ if 'inference_time' in results:
225
+ aggregated_results[technique_name][model_name]['inference_time'].append(results['inference_time'])
226
+ if 'total_time' in results:
227
+ aggregated_results[technique_name][model_name]['total_time'].append(results['total_time'])
228
 
229
  # Add results for each technique
230
  for technique_name, model_results in aggregated_results.items():
 
244
  content += "\n"
245
 
246
  # Create results table with averaged scores
247
+ content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |\n"
248
+ content += "|-------|------------------|---------------|--------------|---------------|---------------|-----------|\n"
249
 
250
  for model_name, results in model_results.items():
251
  avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
252
  avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
253
  avg_combined = avg_faith + avg_rel
254
+
255
+ avg_ret = sum(results.get('retrieval_time', [0])) / len(results.get('retrieval_time', [1])) if results.get('retrieval_time') else 0
256
+ avg_inf = sum(results.get('inference_time', [0])) / len(results.get('inference_time', [1])) if results.get('inference_time') else 0
257
+ avg_tot = sum(results.get('total_time', [0])) / len(results.get('total_time', [1])) if results.get('total_time') else 0
258
+
259
+ content += f"| {model_name} | {avg_faith:.1f}% | {avg_rel:.3f} | {avg_combined:.3f} | {avg_ret:.2f}s | {avg_inf:.2f}s | {avg_tot:.2f}s |\n"
260
 
261
  # Find best model for this technique
262
  if model_results:
 
303
  # Show answers from each query
304
  for q_idx, answer in enumerate(answers):
305
  content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
306
+ content += f"\n{answer}\n\n"
307
+
308
+ # Add extracted claims
309
+ claims = results.get('claims', [])[q_idx] if q_idx < len(results.get('claims', [])) else []
310
+ if claims:
311
+ content += f"**Extracted Claims (Faithfulness):**\n"
312
+ for claim in claims:
313
+ status = "✅" if "Yes" in claim.get('verdict', '') else "❌"
314
+ content += f"- {status} {claim.get('claim', '')}\n"
315
+ content += "\n"
316
+
317
+ # Add generated queries
318
+ gen_queries = results.get('gen_queries', [])[q_idx] if q_idx < len(results.get('gen_queries', [])) else []
319
+ if gen_queries:
320
+ content += f"**Generated Queries (Relevancy):**\n"
321
+ for q in gen_queries:
322
+ content += f"- {q}\n"
323
+ content += "\n"
324
+
325
+ content += "\n"
326
 
327
  content += "---\n\n"
328
 
 
331
 
332
  ### Overall Performance Ranking (Across All Queries)
333
 
334
+ | Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |
335
+ |------|-----------|------------------|---------------|--------------|---------------|---------------|-----------|
336
  """
337
 
338
  # Calculate averages for each technique across all queries
 
348
  avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
349
  avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
350
  avg_combined = avg_faith + avg_rel
351
+ all_ret = []
352
+ all_inf = []
353
+ all_tot = []
354
+ for r in model_results.values():
355
+ all_ret.extend(r.get('retrieval_time', [0]))
356
+ all_inf.extend(r.get('inference_time', [0]))
357
+ all_tot.extend(r.get('total_time', [0]))
358
+
359
  technique_averages[technique_name] = {
360
  'faith': avg_faith,
361
  'rel': avg_rel,
362
+ 'combined': avg_combined,
363
+ 'ret': sum(all_ret)/len(all_ret) if all_ret else 0,
364
+ 'inf': sum(all_inf)/len(all_inf) if all_inf else 0,
365
+ 'tot': sum(all_tot)/len(all_tot) if all_tot else 0
366
  }
367
 
368
  # Sort by combined score
 
373
  )
374
 
375
  for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
376
+ content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} | {averages['ret']:.2f}s | {averages['inf']:.2f}s | {averages['tot']:.2f}s |\n"
377
 
378
  content += """
379
  ### Key Findings
 
429
  return output_file
430
 
431
 
432
+ import time
433
  def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
434
  """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
435
 
 
442
  print(f"{'='*80}")
443
 
444
  # Use HybridRetriever to retrieve chunks
445
+ retrieval_start_time = time.time()
446
  context_chunks, chunk_score = retriever.search(
447
  query=query,
448
  index=index,
 
450
  rerank_strategy="cross-encoder",
451
  use_mmr=use_mmr,
452
  top_k=50,
453
+ final_k=4,
454
  technique_name=technique_name,
455
  verbose=False,
456
  test=True
457
  )
458
 
459
+ retrieval_time = time.time() - retrieval_start_time
460
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
461
 
462
  if not context_chunks:
 
478
  tournament_results = {}
479
  tournament_results["_ChunkScore"] = chunk_score
480
  tournament_results["_Strategy"] = strategy_label
481
+ tournament_results["_retrieval_time"] = retrieval_time
482
 
483
  for name, model_inst in models.items():
484
  print(f"\n{'-'*60}")
 
486
  print(f"{'-'*60}")
487
  try:
488
  # Generation
489
+ inference_start_time = time.time()
490
+ answer = rag_engine.get_answer(model_inst, query, context_chunks,
491
+ temperature=cfg.gen["temperature"]
492
  )
493
+ inference_time = time.time() - inference_start_time
494
+
495
+
496
 
497
  print(f"\n{'─'*60}")
498
  print(f"📝 FULL ANSWER from {name}:")
 
510
  "Faithfulness": faith['score'],
511
  "Relevancy": rel['score'],
512
  "Claims": faith['details'],
513
+ "GenQueries": rel.get('queries', []),
514
+ "retrieval_time": retrieval_time,
515
+ "inference_time": inference_time,
516
+ "total_time": retrieval_time + inference_time,
517
  "context_chunks": context_chunks,
518
  }
519
 
 
521
  print(f" Faithfulness: {faith['score']:.1f}%")
522
  print(f" Relevancy: {rel['score']:.3f}")
523
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
524
+ print(f"⏱️ LATENCY METRICS:")
525
+ print(f" Retrieval: {retrieval_time:.2f}s")
526
+ print(f" Inference: {inference_time:.2f}s")
527
+ print(f" Total Response: {retrieval_time + inference_time:.2f}s")
528
 
529
  except Exception as e:
530
  print(f" Error evaluating {name}: {e}")
 
533
  "Faithfulness": 0,
534
  "Relevancy": 0,
535
  "Claims": [],
536
+ "GenQueries": [],
537
+ "retrieval_time": retrieval_time,
538
+ "inference_time": 0,
539
+ "total_time": retrieval_time,
540
  "error": str(e),
541
  "context_chunks": context_chunks,
542
  }
 
560
 
561
  # Test queries
562
  test_queries = [
563
+ "What is cognitive behavior therapy and how does it work?",
564
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
565
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
566
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
567
+ "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
568
+ "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
569
+ "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
570
+ "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
571
+ "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
572
+ "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
573
+ ]
574
 
575
  print("=" * 80)
576
  print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
 
647
  ]
648
 
649
  # Filter to only 4 techniques to reduce memory usage
650
+ TECHNIQUES_TO_EVALUATE = ["recursive",'semantic','fixed','markdown','sentence','paragraph'] # You can adjust this list to test different techniques
651
  CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
652
 
653
  # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
models/deepseek_v3.py CHANGED
@@ -21,5 +21,5 @@ class DeepSeek_V3:
21
  except Exception as e:
22
  yield f" DeepSeek API Busy: {e}"
23
 
24
- def generate(self, prompt, max_tokens=500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
21
  except Exception as e:
22
  yield f" DeepSeek API Busy: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/llama_3_8b.py CHANGED
@@ -5,7 +5,7 @@ class Llama3_8B:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
@@ -18,5 +18,5 @@ class Llama3_8B:
18
  if content:
19
  yield content
20
 
21
- def generate(self, prompt, max_tokens=500, temperature=0.1):
22
- return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
 
18
  if content:
19
  yield content
20
 
21
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
22
+ return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/mistral_7b.py CHANGED
@@ -1,29 +1,25 @@
1
  from huggingface_hub import InferenceClient
2
- import os
3
 
4
  class Mistral_7b:
5
  def __init__(self, token):
6
- self.client = InferenceClient(api_key=token)
7
- # Provider-suffixed ids (e.g. :featherless-ai) are not valid HF repo ids.
8
- # Keep a sane default and allow override via env for experimentation.
9
- self.model_id = os.getenv("MISTRAL_MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")
10
 
11
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
12
  try:
13
- stream = self.client.chat.completions.create(
14
  model=self.model_id,
15
  messages=[{"role": "user", "content": prompt}],
16
  max_tokens=max_tokens,
17
  temperature=temperature,
18
- stream=True,
19
- )
20
- for chunk in stream:
21
- if chunk.choices and chunk.choices[0].delta.content:
22
- content = chunk.choices[0].delta.content
23
- yield content
24
-
25
  except Exception as e:
26
- yield f" Mistral Featherless Error: {e}"
27
 
28
- def generate(self, prompt, max_tokens=500, temperature=0.1):
29
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
1
  from huggingface_hub import InferenceClient
 
2
 
3
  class Mistral_7b:
4
  def __init__(self, token):
5
+ self.client = InferenceClient(token=token)
6
+ self.model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 
 
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  try:
10
+ for message in self.client.chat_completion(
11
  model=self.model_id,
12
  messages=[{"role": "user", "content": prompt}],
13
  max_tokens=max_tokens,
14
  temperature=temperature,
15
+ stream=True, extra_body={"reasoning": "none"},
16
+ ):
17
+ if message.choices:
18
+ content = message.choices[0].delta.content
19
+ if content:
20
+ yield content
 
21
  except Exception as e:
22
+ yield f" Mistral_7b Error: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/qwen_2_5.py CHANGED
@@ -5,7 +5,7 @@ class Qwen2_5:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "Qwen/Qwen2.5-72B-Instruct"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
@@ -18,5 +18,5 @@ class Qwen2_5:
18
  if content:
19
  yield content
20
 
21
- def generate(self, prompt, max_tokens=500, temperature=0.1):
22
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "Qwen/Qwen2.5-72B-Instruct"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
 
18
  if content:
19
  yield content
20
 
21
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
22
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/qwen_3_5_9b.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI


class Qwen_3_5_9B:
    """OpenRouter-backed streaming client for the qwen/qwen3.5-9b model."""

    def __init__(self, token=None):
        # Prefer the environment variable; fall back to an explicitly passed token.
        openrouter_token = os.getenv("OPENROUTER_API_KEY")
        if not openrouter_token:
            print("Warning: OPENROUTER_API_KEY environment variable is not set")
            # Fallback to the token passed in if available
            openrouter_token = token

        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=openrouter_token,
        )
        self.model_id = "qwen/qwen3.5-9b"

    def generate_stream(self, prompt, max_tokens=100000, temperature=0.1):
        """Yield response text incrementally; on failure, yield a single error string.

        The try/except mirrors the sibling model wrappers (Mistral_7b, TinyAya),
        which surface API errors as stream text instead of raising out of the
        generator — without it, a transient network error would crash callers.
        """
        try:
            for message in self.client.chat.completions.create(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            ):
                # Streamed chunks may arrive with empty choices or a None delta.
                if message.choices and len(message.choices) > 0:
                    content = message.choices[0].delta.content
                    if content:
                        yield content
        except Exception as e:
            yield f" Qwen_3_5_9B Error: {e}"

    def generate(self, prompt, max_tokens=100000, temperature=0.1):
        """Blocking variant: concatenate all streamed pieces into one string."""
        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/tiny_aya.py CHANGED
@@ -5,7 +5,7 @@ class TinyAya:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "CohereLabs/tiny-aya-global"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  try:
10
  for message in self.client.chat_completion(
11
  model=self.model_id,
@@ -21,5 +21,5 @@ class TinyAya:
21
  except Exception as e:
22
  yield f" TinyAya Error: {e}"
23
 
24
- def generate(self, prompt, max_tokens=500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "CohereLabs/tiny-aya-global"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  try:
10
  for message in self.client.chat_completion(
11
  model=self.model_id,
 
21
  except Exception as e:
22
  yield f" TinyAya Error: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
retriever/retriever.py CHANGED
@@ -337,6 +337,7 @@ class HybridRetriever:
337
  semantic_start = time.perf_counter()
338
  query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
339
  semantic_time = time.perf_counter() - semantic_start
 
340
  if should_print:
341
  self._print_candidates("Semantic Search", semantic_chunks)
342
  print(f"Semantic time: {semantic_time:.3f}s")
@@ -345,6 +346,7 @@ class HybridRetriever:
345
  bm25_start = time.perf_counter()
346
  bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
347
  bm25_time = time.perf_counter() - bm25_start
 
348
  if should_print:
349
  self._print_candidates("BM25 Search", bm25_chunks)
350
  print(f"BM25 time: {bm25_time:.3f}s")
@@ -360,8 +362,28 @@ class HybridRetriever:
360
  label = "RRF"
361
  elif rerank_strategy == "cross-encoder":
362
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
 
363
  candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
 
364
  label = "Cross-Encoder"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  else: # "none"
366
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
367
  label = "No Reranking"
 
337
  semantic_start = time.perf_counter()
338
  query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
339
  semantic_time = time.perf_counter() - semantic_start
340
+ print(f"[DEBUG-FLOW] retrieved {len(semantic_chunks)} chunks from semantic search", flush=True)
341
  if should_print:
342
  self._print_candidates("Semantic Search", semantic_chunks)
343
  print(f"Semantic time: {semantic_time:.3f}s")
 
346
  bm25_start = time.perf_counter()
347
  bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
348
  bm25_time = time.perf_counter() - bm25_start
349
+ print(f"[DEBUG-FLOW] retrieved {len(bm25_chunks)} chunks from BM25 search", flush=True)
350
  if should_print:
351
  self._print_candidates("BM25 Search", bm25_chunks)
352
  print(f"BM25 time: {bm25_time:.3f}s")
 
362
  label = "RRF"
363
  elif rerank_strategy == "cross-encoder":
364
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
365
+ print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into cross-encoder", flush=True)
366
  candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
367
+ print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of cross-encoder", flush=True)
368
  label = "Cross-Encoder"
369
+ elif rerank_strategy == "voyage":
370
+ import voyageai
371
+ voyage_client = voyageai.Client()
372
+ combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
373
+ print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into voyage reranker", flush=True)
374
+ if not combined:
375
+ candidates, chunk_scores = [], []
376
+ else:
377
+ try:
378
+ reranking = voyage_client.rerank(query=query, documents=combined, model=self.rerank_model_name, top_k=final_k)
379
+ candidates = [r.document for r in reranking.results]
380
+ chunk_scores = [r.relevance_score for r in reranking.results]
381
+ print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of voyage reranker", flush=True)
382
+ except Exception as e:
383
+ print(f"Error calling Voyage API: {e}")
384
+ candidates = combined[:final_k]
385
+ chunk_scores = []
386
+ label = "Voyage"
387
  else: # "none"
388
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
389
  label = "No Reranking"