Qar-Raz committed on
Commit
c64aaec
·
verified ·
1 Parent(s): 8f37cc7

Sync backend Docker context from GitHub main

Browse files
config.yaml CHANGED
@@ -29,16 +29,17 @@ retrieval:
29
  rerank_strategy: "cross-encoder"
30
  use_mmr: False
31
  top_k: 50
32
- final_k: 5
33
 
34
  generation:
35
  temperature: 0.
36
- max_new_tokens: 512
37
  # The model used to Judge the others (OpenRouter)
38
  judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
 
42
  - "Llama-3-8B"
 
43
  - "Mistral-7B"
44
- - "TinyAya"
 
29
  rerank_strategy: "cross-encoder"
30
  use_mmr: False
31
  top_k: 50
32
+ final_k: 4
33
 
34
  generation:
35
  temperature: 0.
36
+ max_new_tokens: 1500
37
  # The model used to Judge the others (OpenRouter)
38
  judge_model: "deepseek/deepseek-v3.2"
39
 
40
  # List of contestants in the tournament
41
  models:
42
+ - "TinyAya"
43
  - "Llama-3-8B"
44
+ - "Qwen-3.5-9B"
45
  - "Mistral-7B"
 
data/ingest.py CHANGED
@@ -15,27 +15,34 @@ from retriever.processor import ChunkProcessor
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
18
- # {
19
- # "name": "fixed",
20
- # "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
- # "chunk_size": 1000,
22
- # "chunk_overlap": 100,
23
- # "kwargs": {"separator": ""}, # No separator for fixed splitting
24
- # },
25
- # {
26
- # "name": "sentence",
27
- # "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
- # "chunk_size": 1000,
29
- # "chunk_overlap": 100,
30
- # "kwargs": {},
31
- # },
32
- # {
33
- # "name": "paragraph",
34
- # "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
- # "chunk_size": 2500,
36
- # "chunk_overlap": 100,
37
- # "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
- # },
 
 
 
 
 
 
 
39
  {
40
  "name": "semantic",
41
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
@@ -43,27 +50,27 @@ CHUNKING_TECHNIQUES = [
43
  "chunk_overlap": 100,
44
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
  },
46
- # {
47
- # "name": "recursive",
48
- # "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
49
- # "chunk_size": 2000,
50
- # "chunk_overlap": 100,
51
- # "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
- # },
53
- # {
54
- # "name": "page",
55
- # "description": "Page-level chunking - uses entire book pages as-is",
56
- # "chunk_size": 10000, # Very large to keep full pages
57
- # "chunk_overlap": 0, # No overlap between pages
58
- # "kwargs": {"separator": "--- Page"}, # Split on page markers
59
- # },
60
- # {
61
- # "name": "markdown",
62
- # "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
63
- # "chunk_size": 4000, # Max 4k chars per chunk
64
- # "chunk_overlap": 0, # No overlap for markdown
65
- # "kwargs": {}, # Custom implementation
66
- # },
67
  ]
68
 
69
 
 
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
18
+ {
19
+ "name": "fixed",
20
+ "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
+ "chunk_size": 1000,
22
+ "chunk_overlap": 100,
23
+ "kwargs": {"separator": ""}, # No separator for fixed splitting
24
+ },
25
+ {
26
+ "name": "sentence",
27
+ "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
+ "chunk_size": 1000,
29
+ "chunk_overlap": 100,
30
+ "kwargs": {},
31
+ },
32
+ {
33
+ "name": "paragraph",
34
+ "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
+ "chunk_size": 2500,
36
+ "chunk_overlap": 100,
37
+ "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
+ },
39
+ {
40
+ "name": "semantic",
41
+ "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
+ "chunk_size": 2000,
43
+ "chunk_overlap": 100,
44
+ "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
+ },
46
  {
47
  "name": "semantic",
48
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
 
50
  "chunk_overlap": 100,
51
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
52
  },
53
+ {
54
+ "name": "recursive",
55
+ "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
56
+ "chunk_size": 2000,
57
+ "chunk_overlap": 100,
58
+ "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
59
+ },
60
+ {
61
+ "name": "page",
62
+ "description": "Page-level chunking - uses entire book pages as-is",
63
+ "chunk_size": 10000, # Very large to keep full pages
64
+ "chunk_overlap": 0, # No overlap between pages
65
+ "kwargs": {"separator": "--- Page"}, # Split on page markers
66
+ },
67
+ {
68
+ "name": "markdown",
69
+ "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
70
+ "chunk_size": 4000, # Max 4k chars per chunk
71
+ "chunk_overlap": 0, # No overlap for markdown
72
+ "kwargs": {}, # Custom implementation
73
+ },
74
  ]
75
 
76
 
main.py CHANGED
@@ -14,14 +14,16 @@ from data.data_loader import load_cbt_book, get_book_stats
14
  from data.ingest import ingest_data, CHUNKING_TECHNIQUES
15
 
16
  # Import model fleet
 
17
  from models.llama_3_8b import Llama3_8B
18
  from models.mistral_7b import Mistral_7b
19
  from models.tiny_aya import TinyAya
20
 
21
  MODEL_MAP = {
 
 
22
  "Llama-3-8B": Llama3_8B,
23
  "Mistral-7B": Mistral_7b,
24
- "TinyAya": TinyAya
25
  }
26
 
27
  load_dotenv()
@@ -39,6 +41,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
39
  print(f"{'='*80}")
40
 
41
  # Use HybridRetriever to retrieve chunks
 
42
  context_chunks, chunk_score = retriever.search(
43
  query=query,
44
  index=index,
@@ -46,11 +49,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
46
  rerank_strategy="cross-encoder",
47
  use_mmr=use_mmr,
48
  top_k=50,
49
- final_k=5,
50
  technique_name=technique_name,
51
  verbose=False
52
  )
53
 
 
54
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
55
 
56
  if not context_chunks:
@@ -72,6 +76,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
72
  tournament_results = {}
73
  tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
74
  tournament_results["_Strategy"] = strategy_label
 
75
 
76
  for name, model_inst in models.items():
77
  print(f"\n{'-'*60}")
@@ -79,10 +84,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
79
  print(f"{'-'*60}")
80
  try:
81
  # Generation
82
- answer = rag_engine.get_answer(
 
83
  model_inst, query, context_chunks,
84
  temperature=cfg.gen['temperature']
85
  )
 
86
 
87
  print(f"\n{'─'*60}")
88
  print(f"📝 FULL ANSWER from {name}:")
@@ -100,6 +107,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
100
  "Faithfulness": faith['score'],
101
  "Relevancy": rel['score'],
102
  "Claims": faith['details'],
 
 
 
 
103
  "context_chunks": context_chunks,
104
  }
105
 
@@ -107,6 +118,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
107
  print(f" Faithfulness: {faith['score']:.1f}%")
108
  print(f" Relevancy: {rel['score']:.3f}")
109
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
 
 
 
 
110
 
111
  except Exception as e:
112
  print(f" Error evaluating {name}: {e}")
@@ -115,6 +130,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
115
  "Faithfulness": 0,
116
  "Relevancy": 0,
117
  "Claims": [],
 
 
 
 
118
  "error": str(e),
119
  "context_chunks": context_chunks,
120
  }
@@ -186,13 +205,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
186
  'Faithfulness': [],
187
  'Relevancy': [],
188
  'answers': [],
 
 
189
  'context_chunks': results.get('context_chunks', []),
190
- 'context_urls': results.get('context_urls', [])
 
 
 
191
  }
192
 
193
  aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
194
  aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
195
  aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
 
 
 
 
 
 
 
 
196
 
197
  # Add results for each technique
198
  for technique_name, model_results in aggregated_results.items():
@@ -212,14 +244,19 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
212
  content += "\n"
213
 
214
  # Create results table with averaged scores
215
- content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
216
- content += "|-------|------------------|---------------|--------------|\n"
217
 
218
  for model_name, results in model_results.items():
219
  avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
220
  avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
221
  avg_combined = avg_faith + avg_rel
222
- content += f"| {model_name} | {avg_faith:.1f}% | {avg_rel:.3f} | {avg_combined:.3f} |\n"
 
 
 
 
 
223
 
224
  # Find best model for this technique
225
  if model_results:
@@ -266,7 +303,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
266
  # Show answers from each query
267
  for q_idx, answer in enumerate(answers):
268
  content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
269
- content += f"\n{answer}\n\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  content += "---\n\n"
272
 
@@ -275,8 +331,8 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
275
 
276
  ### Overall Performance Ranking (Across All Queries)
277
 
278
- | Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined |
279
- |------|-----------|------------------|---------------|--------------|
280
  """
281
 
282
  # Calculate averages for each technique across all queries
@@ -292,10 +348,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
292
  avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
293
  avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
294
  avg_combined = avg_faith + avg_rel
 
 
 
 
 
 
 
 
295
  technique_averages[technique_name] = {
296
  'faith': avg_faith,
297
  'rel': avg_rel,
298
- 'combined': avg_combined
 
 
 
299
  }
300
 
301
  # Sort by combined score
@@ -306,7 +373,7 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
306
  )
307
 
308
  for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
309
- content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} |\n"
310
 
311
  content += """
312
  ### Key Findings
@@ -362,6 +429,7 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
362
  return output_file
363
 
364
 
 
365
  def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
366
  """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
367
 
@@ -374,6 +442,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
374
  print(f"{'='*80}")
375
 
376
  # Use HybridRetriever to retrieve chunks
 
377
  context_chunks, chunk_score = retriever.search(
378
  query=query,
379
  index=index,
@@ -381,12 +450,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
381
  rerank_strategy="cross-encoder",
382
  use_mmr=use_mmr,
383
  top_k=50,
384
- final_k=5,
385
  technique_name=technique_name,
386
  verbose=False,
387
  test=True
388
  )
389
 
 
390
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
391
 
392
  if not context_chunks:
@@ -408,6 +478,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
408
  tournament_results = {}
409
  tournament_results["_ChunkScore"] = chunk_score
410
  tournament_results["_Strategy"] = strategy_label
 
411
 
412
  for name, model_inst in models.items():
413
  print(f"\n{'-'*60}")
@@ -415,10 +486,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
415
  print(f"{'-'*60}")
416
  try:
417
  # Generation
418
- answer = rag_engine.get_answer(
419
- model_inst, query, context_chunks,
420
- temperature=cfg.gen['temperature']
421
  )
 
 
 
422
 
423
  print(f"\n{'─'*60}")
424
  print(f"📝 FULL ANSWER from {name}:")
@@ -436,6 +510,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
436
  "Faithfulness": faith['score'],
437
  "Relevancy": rel['score'],
438
  "Claims": faith['details'],
 
 
 
 
439
  "context_chunks": context_chunks,
440
  }
441
 
@@ -443,6 +521,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
443
  print(f" Faithfulness: {faith['score']:.1f}%")
444
  print(f" Relevancy: {rel['score']:.3f}")
445
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
 
 
 
 
446
 
447
  except Exception as e:
448
  print(f" Error evaluating {name}: {e}")
@@ -451,6 +533,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
451
  "Faithfulness": 0,
452
  "Relevancy": 0,
453
  "Claims": [],
 
 
 
 
454
  "error": str(e),
455
  "context_chunks": context_chunks,
456
  }
@@ -474,11 +560,17 @@ def main():
474
 
475
  # Test queries
476
  test_queries = [
477
- "What is cognitive behavior therapy and how does it work?",
478
- "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
479
- "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
480
- "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
481
- ]
 
 
 
 
 
 
482
 
483
  print("=" * 80)
484
  print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
@@ -555,7 +647,7 @@ def main():
555
  ]
556
 
557
  # Filter to only 4 techniques to reduce memory usage
558
- TECHNIQUES_TO_EVALUATE = ["markdown", "recursive", "paragraph"]
559
  CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
560
 
561
  # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
 
14
  from data.ingest import ingest_data, CHUNKING_TECHNIQUES
15
 
16
  # Import model fleet
17
+ from models.qwen_3_5_9b import Qwen_3_5_9B
18
  from models.llama_3_8b import Llama3_8B
19
  from models.mistral_7b import Mistral_7b
20
  from models.tiny_aya import TinyAya
21
 
22
  MODEL_MAP = {
23
+ "Qwen-3.5-9B": Qwen_3_5_9B,
24
+ "TinyAya": TinyAya,
25
  "Llama-3-8B": Llama3_8B,
26
  "Mistral-7B": Mistral_7b,
 
27
  }
28
 
29
  load_dotenv()
 
41
  print(f"{'='*80}")
42
 
43
  # Use HybridRetriever to retrieve chunks
44
+ retrieval_start_time = time.time()
45
  context_chunks, chunk_score = retriever.search(
46
  query=query,
47
  index=index,
 
49
  rerank_strategy="cross-encoder",
50
  use_mmr=use_mmr,
51
  top_k=50,
52
+ final_k=4,
53
  technique_name=technique_name,
54
  verbose=False
55
  )
56
 
57
+ retrieval_time = time.time() - retrieval_start_time
58
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
59
 
60
  if not context_chunks:
 
76
  tournament_results = {}
77
  tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
78
  tournament_results["_Strategy"] = strategy_label
79
+ tournament_results["_retrieval_time"] = retrieval_time
80
 
81
  for name, model_inst in models.items():
82
  print(f"\n{'-'*60}")
 
84
  print(f"{'-'*60}")
85
  try:
86
  # Generation
87
+ inference_start_time = time.time()
88
+ answer = rag_engine.get_answer(
89
  model_inst, query, context_chunks,
90
  temperature=cfg.gen['temperature']
91
  )
92
+ inference_time = time.time() - inference_start_time
93
 
94
  print(f"\n{'─'*60}")
95
  print(f"📝 FULL ANSWER from {name}:")
 
107
  "Faithfulness": faith['score'],
108
  "Relevancy": rel['score'],
109
  "Claims": faith['details'],
110
+ "GenQueries": rel.get('queries', []),
111
+ "retrieval_time": retrieval_time,
112
+ "inference_time": inference_time,
113
+ "total_time": retrieval_time + inference_time,
114
  "context_chunks": context_chunks,
115
  }
116
 
 
118
  print(f" Faithfulness: {faith['score']:.1f}%")
119
  print(f" Relevancy: {rel['score']:.3f}")
120
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
121
+ print(f"⏱️ LATENCY METRICS:")
122
+ print(f" Retrieval: {retrieval_time:.2f}s")
123
+ print(f" Inference: {inference_time:.2f}s")
124
+ print(f" Total Response: {retrieval_time + inference_time:.2f}s")
125
 
126
  except Exception as e:
127
  print(f" Error evaluating {name}: {e}")
 
130
  "Faithfulness": 0,
131
  "Relevancy": 0,
132
  "Claims": [],
133
+ "GenQueries": [],
134
+ "retrieval_time": retrieval_time,
135
+ "inference_time": 0,
136
+ "total_time": retrieval_time,
137
  "error": str(e),
138
  "context_chunks": context_chunks,
139
  }
 
205
  'Faithfulness': [],
206
  'Relevancy': [],
207
  'answers': [],
208
+ 'claims': [],
209
+ 'gen_queries': [],
210
  'context_chunks': results.get('context_chunks', []),
211
+ 'context_urls': results.get('context_urls', []),
212
+ 'retrieval_time': [],
213
+ 'inference_time': [],
214
+ 'total_time': []
215
  }
216
 
217
  aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
218
  aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
219
  aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
220
+ aggregated_results[technique_name][model_name]['claims'].append(results.get('Claims', []))
221
+ aggregated_results[technique_name][model_name]['gen_queries'].append(results.get('GenQueries', []))
222
+ if 'retrieval_time' in results:
223
+ aggregated_results[technique_name][model_name]['retrieval_time'].append(results['retrieval_time'])
224
+ if 'inference_time' in results:
225
+ aggregated_results[technique_name][model_name]['inference_time'].append(results['inference_time'])
226
+ if 'total_time' in results:
227
+ aggregated_results[technique_name][model_name]['total_time'].append(results['total_time'])
228
 
229
  # Add results for each technique
230
  for technique_name, model_results in aggregated_results.items():
 
244
  content += "\n"
245
 
246
  # Create results table with averaged scores
247
+ content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |\n"
248
+ content += "|-------|------------------|---------------|--------------|---------------|---------------|-----------|\n"
249
 
250
  for model_name, results in model_results.items():
251
  avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
252
  avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
253
  avg_combined = avg_faith + avg_rel
254
+
255
+ avg_ret = sum(results.get('retrieval_time', [0])) / len(results.get('retrieval_time', [1])) if results.get('retrieval_time') else 0
256
+ avg_inf = sum(results.get('inference_time', [0])) / len(results.get('inference_time', [1])) if results.get('inference_time') else 0
257
+ avg_tot = sum(results.get('total_time', [0])) / len(results.get('total_time', [1])) if results.get('total_time') else 0
258
+
259
+ content += f"| {model_name} | {avg_faith:.1f}% | {avg_rel:.3f} | {avg_combined:.3f} | {avg_ret:.2f}s | {avg_inf:.2f}s | {avg_tot:.2f}s |\n"
260
 
261
  # Find best model for this technique
262
  if model_results:
 
303
  # Show answers from each query
304
  for q_idx, answer in enumerate(answers):
305
  content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
306
+ content += f"\n{answer}\n\n"
307
+
308
+ # Add extracted claims
309
+ claims = results.get('claims', [])[q_idx] if q_idx < len(results.get('claims', [])) else []
310
+ if claims:
311
+ content += f"**Extracted Claims (Faithfulness):**\n"
312
+ for claim in claims:
313
+ status = "✅" if "Yes" in claim.get('verdict', '') else "❌"
314
+ content += f"- {status} {claim.get('claim', '')}\n"
315
+ content += "\n"
316
+
317
+ # Add generated queries
318
+ gen_queries = results.get('gen_queries', [])[q_idx] if q_idx < len(results.get('gen_queries', [])) else []
319
+ if gen_queries:
320
+ content += f"**Generated Queries (Relevancy):**\n"
321
+ for q in gen_queries:
322
+ content += f"- {q}\n"
323
+ content += "\n"
324
+
325
+ content += "\n"
326
 
327
  content += "---\n\n"
328
 
 
331
 
332
  ### Overall Performance Ranking (Across All Queries)
333
 
334
+ | Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |
335
+ |------|-----------|------------------|---------------|--------------|---------------|---------------|-----------|
336
  """
337
 
338
  # Calculate averages for each technique across all queries
 
348
  avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
349
  avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
350
  avg_combined = avg_faith + avg_rel
351
+ all_ret = []
352
+ all_inf = []
353
+ all_tot = []
354
+ for r in model_results.values():
355
+ all_ret.extend(r.get('retrieval_time', [0]))
356
+ all_inf.extend(r.get('inference_time', [0]))
357
+ all_tot.extend(r.get('total_time', [0]))
358
+
359
  technique_averages[technique_name] = {
360
  'faith': avg_faith,
361
  'rel': avg_rel,
362
+ 'combined': avg_combined,
363
+ 'ret': sum(all_ret)/len(all_ret) if all_ret else 0,
364
+ 'inf': sum(all_inf)/len(all_inf) if all_inf else 0,
365
+ 'tot': sum(all_tot)/len(all_tot) if all_tot else 0
366
  }
367
 
368
  # Sort by combined score
 
373
  )
374
 
375
  for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
376
+ content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} | {averages['ret']:.2f}s | {averages['inf']:.2f}s | {averages['tot']:.2f}s |\n"
377
 
378
  content += """
379
  ### Key Findings
 
429
  return output_file
430
 
431
 
432
+ import time
433
  def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
434
  """Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
435
 
 
442
  print(f"{'='*80}")
443
 
444
  # Use HybridRetriever to retrieve chunks
445
+ retrieval_start_time = time.time()
446
  context_chunks, chunk_score = retriever.search(
447
  query=query,
448
  index=index,
 
450
  rerank_strategy="cross-encoder",
451
  use_mmr=use_mmr,
452
  top_k=50,
453
+ final_k=4,
454
  technique_name=technique_name,
455
  verbose=False,
456
  test=True
457
  )
458
 
459
+ retrieval_time = time.time() - retrieval_start_time
460
  print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
461
 
462
  if not context_chunks:
 
478
  tournament_results = {}
479
  tournament_results["_ChunkScore"] = chunk_score
480
  tournament_results["_Strategy"] = strategy_label
481
+ tournament_results["_retrieval_time"] = retrieval_time
482
 
483
  for name, model_inst in models.items():
484
  print(f"\n{'-'*60}")
 
486
  print(f"{'-'*60}")
487
  try:
488
  # Generation
489
+ inference_start_time = time.time()
490
+ answer = rag_engine.get_answer(model_inst, query, context_chunks,
491
+ temperature=cfg.gen["temperature"]
492
  )
493
+ inference_time = time.time() - inference_start_time
494
+
495
+
496
 
497
  print(f"\n{'─'*60}")
498
  print(f"📝 FULL ANSWER from {name}:")
 
510
  "Faithfulness": faith['score'],
511
  "Relevancy": rel['score'],
512
  "Claims": faith['details'],
513
+ "GenQueries": rel.get('queries', []),
514
+ "retrieval_time": retrieval_time,
515
+ "inference_time": inference_time,
516
+ "total_time": retrieval_time + inference_time,
517
  "context_chunks": context_chunks,
518
  }
519
 
 
521
  print(f" Faithfulness: {faith['score']:.1f}%")
522
  print(f" Relevancy: {rel['score']:.3f}")
523
  print(f" Combined: {faith['score'] + rel['score']:.3f}")
524
+ print(f"⏱️ LATENCY METRICS:")
525
+ print(f" Retrieval: {retrieval_time:.2f}s")
526
+ print(f" Inference: {inference_time:.2f}s")
527
+ print(f" Total Response: {retrieval_time + inference_time:.2f}s")
528
 
529
  except Exception as e:
530
  print(f" Error evaluating {name}: {e}")
 
533
  "Faithfulness": 0,
534
  "Relevancy": 0,
535
  "Claims": [],
536
+ "GenQueries": [],
537
+ "retrieval_time": retrieval_time,
538
+ "inference_time": 0,
539
+ "total_time": retrieval_time,
540
  "error": str(e),
541
  "context_chunks": context_chunks,
542
  }
 
560
 
561
  # Test queries
562
  test_queries = [
563
+ "What is cognitive behavior therapy and how does it work?",
564
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
565
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
566
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
567
+ "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
568
+ "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
569
+ "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
570
+ "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
571
+ "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
572
+ "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
573
+ ]
574
 
575
  print("=" * 80)
576
  print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
 
647
  ]
648
 
649
  # Filter to only 4 techniques to reduce memory usage
650
+ TECHNIQUES_TO_EVALUATE = ["recursive",'semantic','fixed','markdown','sentence','paragraph'] # You can adjust this list to test different techniques
651
  CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
652
 
653
  # Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
models/deepseek_v3.py CHANGED
@@ -21,5 +21,5 @@ class DeepSeek_V3:
21
  except Exception as e:
22
  yield f" DeepSeek API Busy: {e}"
23
 
24
- def generate(self, prompt, max_tokens=500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
21
  except Exception as e:
22
  yield f" DeepSeek API Busy: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/llama_3_8b.py CHANGED
@@ -5,7 +5,7 @@ class Llama3_8B:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
@@ -18,5 +18,5 @@ class Llama3_8B:
18
  if content:
19
  yield content
20
 
21
- def generate(self, prompt, max_tokens=500, temperature=0.1):
22
- return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
 
18
  if content:
19
  yield content
20
 
21
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
22
+ return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/mistral_7b.py CHANGED
@@ -1,29 +1,25 @@
1
  from huggingface_hub import InferenceClient
2
- import os
3
 
4
  class Mistral_7b:
5
  def __init__(self, token):
6
- self.client = InferenceClient(api_key=token)
7
- # Provider-suffixed ids (e.g. :featherless-ai) are not valid HF repo ids.
8
- # Keep a sane default and allow override via env for experimentation.
9
- self.model_id = os.getenv("MISTRAL_MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")
10
 
11
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
12
  try:
13
- stream = self.client.chat.completions.create(
14
  model=self.model_id,
15
  messages=[{"role": "user", "content": prompt}],
16
  max_tokens=max_tokens,
17
  temperature=temperature,
18
- stream=True,
19
- )
20
- for chunk in stream:
21
- if chunk.choices and chunk.choices[0].delta.content:
22
- content = chunk.choices[0].delta.content
23
- yield content
24
-
25
  except Exception as e:
26
- yield f" Mistral Featherless Error: {e}"
27
 
28
- def generate(self, prompt, max_tokens=500, temperature=0.1):
29
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
1
  from huggingface_hub import InferenceClient
 
2
 
3
  class Mistral_7b:
4
  def __init__(self, token):
5
+ self.client = InferenceClient(token=token)
6
+ self.model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 
 
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  try:
10
+ for message in self.client.chat_completion(
11
  model=self.model_id,
12
  messages=[{"role": "user", "content": prompt}],
13
  max_tokens=max_tokens,
14
  temperature=temperature,
15
+ stream=True, extra_body={"reasoning": "none"},
16
+ ):
17
+ if message.choices:
18
+ content = message.choices[0].delta.content
19
+ if content:
20
+ yield content
 
21
  except Exception as e:
22
+ yield f" Mistral_7b Error: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/qwen_2_5.py CHANGED
@@ -5,7 +5,7 @@ class Qwen2_5:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "Qwen/Qwen2.5-72B-Instruct"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
@@ -18,5 +18,5 @@ class Qwen2_5:
18
  if content:
19
  yield content
20
 
21
- def generate(self, prompt, max_tokens=500, temperature=0.1):
22
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "Qwen/Qwen2.5-72B-Instruct"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  for message in self.client.chat_completion(
10
  model=self.model_id,
11
  messages=[{"role": "user", "content": prompt}],
 
18
  if content:
19
  yield content
20
 
21
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
22
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/qwen_3_5_9b.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI


class Qwen_3_5_9B:
    """OpenRouter-backed streaming client for the qwen/qwen3.5-9b model."""

    def __init__(self, token=None):
        # Prefer the environment variable; fall back to an explicitly passed token.
        openrouter_token = os.getenv("OPENROUTER_API_KEY")
        if not openrouter_token:
            print("Warning: OPENROUTER_API_KEY environment variable is not set")
            # Fallback to the token passed in if available
            openrouter_token = token

        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=openrouter_token,
        )
        self.model_id = "qwen/qwen3.5-9b"

    def generate_stream(self, prompt, max_tokens=100000, temperature=0.1):
        """Yield response text incrementally; on failure, yield a single error string.

        The try/except mirrors the sibling model wrappers (Mistral_7b, TinyAya),
        which surface API errors as stream text instead of raising out of the
        generator — without it, a transient network error would crash callers.
        """
        try:
            for message in self.client.chat.completions.create(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            ):
                # Streamed chunks may arrive with empty choices or a None delta.
                if message.choices and len(message.choices) > 0:
                    content = message.choices[0].delta.content
                    if content:
                        yield content
        except Exception as e:
            yield f" Qwen_3_5_9B Error: {e}"

    def generate(self, prompt, max_tokens=100000, temperature=0.1):
        """Blocking variant: concatenate all streamed pieces into one string."""
        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
models/tiny_aya.py CHANGED
@@ -5,7 +5,7 @@ class TinyAya:
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "CohereLabs/tiny-aya-global"
7
 
8
- def generate_stream(self, prompt, max_tokens=1500, temperature=0.1):
9
  try:
10
  for message in self.client.chat_completion(
11
  model=self.model_id,
@@ -21,5 +21,5 @@ class TinyAya:
21
  except Exception as e:
22
  yield f" TinyAya Error: {e}"
23
 
24
- def generate(self, prompt, max_tokens=500, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
 
5
  self.client = InferenceClient(token=token)
6
  self.model_id = "CohereLabs/tiny-aya-global"
7
 
8
+ def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
9
  try:
10
  for message in self.client.chat_completion(
11
  model=self.model_id,
 
21
  except Exception as e:
22
  yield f" TinyAya Error: {e}"
23
 
24
+ def generate(self, prompt, max_tokens=1000, temperature=0.1):
25
  return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
retriever/retriever.py CHANGED
@@ -337,6 +337,7 @@ class HybridRetriever:
337
  semantic_start = time.perf_counter()
338
  query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
339
  semantic_time = time.perf_counter() - semantic_start
 
340
  if should_print:
341
  self._print_candidates("Semantic Search", semantic_chunks)
342
  print(f"Semantic time: {semantic_time:.3f}s")
@@ -345,6 +346,7 @@ class HybridRetriever:
345
  bm25_start = time.perf_counter()
346
  bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
347
  bm25_time = time.perf_counter() - bm25_start
 
348
  if should_print:
349
  self._print_candidates("BM25 Search", bm25_chunks)
350
  print(f"BM25 time: {bm25_time:.3f}s")
@@ -360,8 +362,28 @@ class HybridRetriever:
360
  label = "RRF"
361
  elif rerank_strategy == "cross-encoder":
362
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
 
363
  candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
 
364
  label = "Cross-Encoder"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  else: # "none"
366
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
367
  label = "No Reranking"
 
337
  semantic_start = time.perf_counter()
338
  query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
339
  semantic_time = time.perf_counter() - semantic_start
340
+ print(f"[DEBUG-FLOW] retrieved {len(semantic_chunks)} chunks from semantic search", flush=True)
341
  if should_print:
342
  self._print_candidates("Semantic Search", semantic_chunks)
343
  print(f"Semantic time: {semantic_time:.3f}s")
 
346
  bm25_start = time.perf_counter()
347
  bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
348
  bm25_time = time.perf_counter() - bm25_start
349
+ print(f"[DEBUG-FLOW] retrieved {len(bm25_chunks)} chunks from BM25 search", flush=True)
350
  if should_print:
351
  self._print_candidates("BM25 Search", bm25_chunks)
352
  print(f"BM25 time: {bm25_time:.3f}s")
 
362
  label = "RRF"
363
  elif rerank_strategy == "cross-encoder":
364
  combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
365
+ print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into cross-encoder", flush=True)
366
  candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
367
+ print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of cross-encoder", flush=True)
368
  label = "Cross-Encoder"
369
+ elif rerank_strategy == "voyage":
370
+ import voyageai
371
+ voyage_client = voyageai.Client()
372
+ combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
373
+ print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into voyage reranker", flush=True)
374
+ if not combined:
375
+ candidates, chunk_scores = [], []
376
+ else:
377
+ try:
378
+ reranking = voyage_client.rerank(query=query, documents=combined, model=self.rerank_model_name, top_k=final_k)
379
+ candidates = [r.document for r in reranking.results]
380
+ chunk_scores = [r.relevance_score for r in reranking.results]
381
+ print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of voyage reranker", flush=True)
382
+ except Exception as e:
383
+ print(f"Error calling Voyage API: {e}")
384
+ candidates = combined[:final_k]
385
+ chunk_scores = []
386
+ label = "Voyage"
387
  else: # "none"
388
  candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
389
  label = "No Reranking"