Sync backend Docker context from GitHub main
Browse files- config.yaml +4 -3
- data/ingest.py +49 -42
- main.py +114 -22
- models/deepseek_v3.py +1 -1
- models/llama_3_8b.py +3 -3
- models/mistral_7b.py +12 -16
- models/qwen_2_5.py +2 -2
- models/qwen_3_5_9b.py +32 -0
- models/tiny_aya.py +2 -2
- retriever/retriever.py +22 -0
config.yaml
CHANGED
|
@@ -29,16 +29,17 @@ retrieval:
|
|
| 29 |
rerank_strategy: "cross-encoder"
|
| 30 |
use_mmr: False
|
| 31 |
top_k: 50
|
| 32 |
-
final_k:
|
| 33 |
|
| 34 |
generation:
|
| 35 |
temperature: 0.
|
| 36 |
-
max_new_tokens:
|
| 37 |
# The model used to Judge the others (OpenRouter)
|
| 38 |
judge_model: "deepseek/deepseek-v3.2"
|
| 39 |
|
| 40 |
# List of contestants in the tournament
|
| 41 |
models:
|
|
|
|
| 42 |
- "Llama-3-8B"
|
|
|
|
| 43 |
- "Mistral-7B"
|
| 44 |
-
- "TinyAya"
|
|
|
|
| 29 |
rerank_strategy: "cross-encoder"
|
| 30 |
use_mmr: False
|
| 31 |
top_k: 50
|
| 32 |
+
final_k: 4
|
| 33 |
|
| 34 |
generation:
|
| 35 |
temperature: 0.
|
| 36 |
+
max_new_tokens: 1500
|
| 37 |
# The model used to Judge the others (OpenRouter)
|
| 38 |
judge_model: "deepseek/deepseek-v3.2"
|
| 39 |
|
| 40 |
# List of contestants in the tournament
|
| 41 |
models:
|
| 42 |
+
- "TinyAya"
|
| 43 |
- "Llama-3-8B"
|
| 44 |
+
- "Qwen-3.5-9B"
|
| 45 |
- "Mistral-7B"
|
|
|
data/ingest.py
CHANGED
|
@@ -15,27 +15,34 @@ from retriever.processor import ChunkProcessor
|
|
| 15 |
|
| 16 |
# 6 different chunking techniques for ablation study
|
| 17 |
CHUNKING_TECHNIQUES = [
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
{
|
| 40 |
"name": "semantic",
|
| 41 |
"description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
|
|
@@ -43,27 +50,27 @@ CHUNKING_TECHNIQUES = [
|
|
| 43 |
"chunk_overlap": 100,
|
| 44 |
"kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
|
| 45 |
},
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
]
|
| 68 |
|
| 69 |
|
|
|
|
| 15 |
|
| 16 |
# 6 different chunking techniques for ablation study
|
| 17 |
CHUNKING_TECHNIQUES = [
|
| 18 |
+
{
|
| 19 |
+
"name": "fixed",
|
| 20 |
+
"description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
|
| 21 |
+
"chunk_size": 1000,
|
| 22 |
+
"chunk_overlap": 100,
|
| 23 |
+
"kwargs": {"separator": ""}, # No separator for fixed splitting
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"name": "sentence",
|
| 27 |
+
"description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
|
| 28 |
+
"chunk_size": 1000,
|
| 29 |
+
"chunk_overlap": 100,
|
| 30 |
+
"kwargs": {},
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "paragraph",
|
| 34 |
+
"description": "Paragraph-level chunking - uses natural paragraph breaks",
|
| 35 |
+
"chunk_size": 2500,
|
| 36 |
+
"chunk_overlap": 100,
|
| 37 |
+
"kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"name": "semantic",
|
| 41 |
+
"description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
|
| 42 |
+
"chunk_size": 2000,
|
| 43 |
+
"chunk_overlap": 100,
|
| 44 |
+
"kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
|
| 45 |
+
},
|
| 46 |
{
|
| 47 |
"name": "semantic",
|
| 48 |
"description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
|
|
|
|
| 50 |
"chunk_overlap": 100,
|
| 51 |
"kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
|
| 52 |
},
|
| 53 |
+
{
|
| 54 |
+
"name": "recursive",
|
| 55 |
+
"description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
|
| 56 |
+
"chunk_size": 2000,
|
| 57 |
+
"chunk_overlap": 100,
|
| 58 |
+
"kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"name": "page",
|
| 62 |
+
"description": "Page-level chunking - uses entire book pages as-is",
|
| 63 |
+
"chunk_size": 10000, # Very large to keep full pages
|
| 64 |
+
"chunk_overlap": 0, # No overlap between pages
|
| 65 |
+
"kwargs": {"separator": "--- Page"}, # Split on page markers
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "markdown",
|
| 69 |
+
"description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
|
| 70 |
+
"chunk_size": 4000, # Max 4k chars per chunk
|
| 71 |
+
"chunk_overlap": 0, # No overlap for markdown
|
| 72 |
+
"kwargs": {}, # Custom implementation
|
| 73 |
+
},
|
| 74 |
]
|
| 75 |
|
| 76 |
|
main.py
CHANGED
|
@@ -14,14 +14,16 @@ from data.data_loader import load_cbt_book, get_book_stats
|
|
| 14 |
from data.ingest import ingest_data, CHUNKING_TECHNIQUES
|
| 15 |
|
| 16 |
# Import model fleet
|
|
|
|
| 17 |
from models.llama_3_8b import Llama3_8B
|
| 18 |
from models.mistral_7b import Mistral_7b
|
| 19 |
from models.tiny_aya import TinyAya
|
| 20 |
|
| 21 |
MODEL_MAP = {
|
|
|
|
|
|
|
| 22 |
"Llama-3-8B": Llama3_8B,
|
| 23 |
"Mistral-7B": Mistral_7b,
|
| 24 |
-
"TinyAya": TinyAya
|
| 25 |
}
|
| 26 |
|
| 27 |
load_dotenv()
|
|
@@ -39,6 +41,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 39 |
print(f"{'='*80}")
|
| 40 |
|
| 41 |
# Use HybridRetriever to retrieve chunks
|
|
|
|
| 42 |
context_chunks, chunk_score = retriever.search(
|
| 43 |
query=query,
|
| 44 |
index=index,
|
|
@@ -46,11 +49,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 46 |
rerank_strategy="cross-encoder",
|
| 47 |
use_mmr=use_mmr,
|
| 48 |
top_k=50,
|
| 49 |
-
final_k=
|
| 50 |
technique_name=technique_name,
|
| 51 |
verbose=False
|
| 52 |
)
|
| 53 |
|
|
|
|
| 54 |
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
| 55 |
|
| 56 |
if not context_chunks:
|
|
@@ -72,6 +76,7 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 72 |
tournament_results = {}
|
| 73 |
tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
|
| 74 |
tournament_results["_Strategy"] = strategy_label
|
|
|
|
| 75 |
|
| 76 |
for name, model_inst in models.items():
|
| 77 |
print(f"\n{'-'*60}")
|
|
@@ -79,10 +84,12 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 79 |
print(f"{'-'*60}")
|
| 80 |
try:
|
| 81 |
# Generation
|
| 82 |
-
|
|
|
|
| 83 |
model_inst, query, context_chunks,
|
| 84 |
temperature=cfg.gen['temperature']
|
| 85 |
)
|
|
|
|
| 86 |
|
| 87 |
print(f"\n{'─'*60}")
|
| 88 |
print(f"📝 FULL ANSWER from {name}:")
|
|
@@ -100,6 +107,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 100 |
"Faithfulness": faith['score'],
|
| 101 |
"Relevancy": rel['score'],
|
| 102 |
"Claims": faith['details'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
"context_chunks": context_chunks,
|
| 104 |
}
|
| 105 |
|
|
@@ -107,6 +118,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 107 |
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 108 |
print(f" Relevancy: {rel['score']:.3f}")
|
| 109 |
print(f" Combined: {faith['score'] + rel['score']:.3f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
except Exception as e:
|
| 112 |
print(f" Error evaluating {name}: {e}")
|
|
@@ -115,6 +130,10 @@ def run_rag_for_technique(technique_name, query, index, encoder, models, evaluat
|
|
| 115 |
"Faithfulness": 0,
|
| 116 |
"Relevancy": 0,
|
| 117 |
"Claims": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
"error": str(e),
|
| 119 |
"context_chunks": context_chunks,
|
| 120 |
}
|
|
@@ -186,13 +205,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 186 |
'Faithfulness': [],
|
| 187 |
'Relevancy': [],
|
| 188 |
'answers': [],
|
|
|
|
|
|
|
| 189 |
'context_chunks': results.get('context_chunks', []),
|
| 190 |
-
'context_urls': results.get('context_urls', [])
|
|
|
|
|
|
|
|
|
|
| 191 |
}
|
| 192 |
|
| 193 |
aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
|
| 194 |
aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
|
| 195 |
aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
# Add results for each technique
|
| 198 |
for technique_name, model_results in aggregated_results.items():
|
|
@@ -212,14 +244,19 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 212 |
content += "\n"
|
| 213 |
|
| 214 |
# Create results table with averaged scores
|
| 215 |
-
content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined |\n"
|
| 216 |
-
content += "|-------|------------------|---------------|--------------|\n"
|
| 217 |
|
| 218 |
for model_name, results in model_results.items():
|
| 219 |
avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
|
| 220 |
avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
|
| 221 |
avg_combined = avg_faith + avg_rel
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
# Find best model for this technique
|
| 225 |
if model_results:
|
|
@@ -266,7 +303,26 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 266 |
# Show answers from each query
|
| 267 |
for q_idx, answer in enumerate(answers):
|
| 268 |
content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
|
| 269 |
-
content += f"\n{answer}\n\n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
content += "---\n\n"
|
| 272 |
|
|
@@ -275,8 +331,8 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 275 |
|
| 276 |
### Overall Performance Ranking (Across All Queries)
|
| 277 |
|
| 278 |
-
| Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined |
|
| 279 |
-
|------|-----------|------------------|---------------|--------------|
|
| 280 |
"""
|
| 281 |
|
| 282 |
# Calculate averages for each technique across all queries
|
|
@@ -292,10 +348,21 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 292 |
avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
|
| 293 |
avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
|
| 294 |
avg_combined = avg_faith + avg_rel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
technique_averages[technique_name] = {
|
| 296 |
'faith': avg_faith,
|
| 297 |
'rel': avg_rel,
|
| 298 |
-
'combined': avg_combined
|
|
|
|
|
|
|
|
|
|
| 299 |
}
|
| 300 |
|
| 301 |
# Sort by combined score
|
|
@@ -306,7 +373,7 @@ multiple LLM models with RAG (Retrieval-Augmented Generation) pipeline.
|
|
| 306 |
)
|
| 307 |
|
| 308 |
for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
|
| 309 |
-
content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} |\n"
|
| 310 |
|
| 311 |
content += """
|
| 312 |
### Key Findings
|
|
@@ -362,6 +429,7 @@ This report was automatically generated by the RAG Ablation Study Pipeline.
|
|
| 362 |
return output_file
|
| 363 |
|
| 364 |
|
|
|
|
| 365 |
def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
|
| 366 |
"""Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
|
| 367 |
|
|
@@ -374,6 +442,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 374 |
print(f"{'='*80}")
|
| 375 |
|
| 376 |
# Use HybridRetriever to retrieve chunks
|
|
|
|
| 377 |
context_chunks, chunk_score = retriever.search(
|
| 378 |
query=query,
|
| 379 |
index=index,
|
|
@@ -381,12 +450,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 381 |
rerank_strategy="cross-encoder",
|
| 382 |
use_mmr=use_mmr,
|
| 383 |
top_k=50,
|
| 384 |
-
final_k=
|
| 385 |
technique_name=technique_name,
|
| 386 |
verbose=False,
|
| 387 |
test=True
|
| 388 |
)
|
| 389 |
|
|
|
|
| 390 |
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
| 391 |
|
| 392 |
if not context_chunks:
|
|
@@ -408,6 +478,7 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 408 |
tournament_results = {}
|
| 409 |
tournament_results["_ChunkScore"] = chunk_score
|
| 410 |
tournament_results["_Strategy"] = strategy_label
|
|
|
|
| 411 |
|
| 412 |
for name, model_inst in models.items():
|
| 413 |
print(f"\n{'-'*60}")
|
|
@@ -415,10 +486,13 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 415 |
print(f"{'-'*60}")
|
| 416 |
try:
|
| 417 |
# Generation
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
temperature=cfg.gen[
|
| 421 |
)
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
print(f"\n{'─'*60}")
|
| 424 |
print(f"📝 FULL ANSWER from {name}:")
|
|
@@ -436,6 +510,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 436 |
"Faithfulness": faith['score'],
|
| 437 |
"Relevancy": rel['score'],
|
| 438 |
"Claims": faith['details'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
"context_chunks": context_chunks,
|
| 440 |
}
|
| 441 |
|
|
@@ -443,6 +521,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 443 |
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 444 |
print(f" Relevancy: {rel['score']:.3f}")
|
| 445 |
print(f" Combined: {faith['score'] + rel['score']:.3f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
|
| 447 |
except Exception as e:
|
| 448 |
print(f" Error evaluating {name}: {e}")
|
|
@@ -451,6 +533,10 @@ def run_rag_for_technique_sequential(technique_name, query, index, encoder, mode
|
|
| 451 |
"Faithfulness": 0,
|
| 452 |
"Relevancy": 0,
|
| 453 |
"Claims": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
"error": str(e),
|
| 455 |
"context_chunks": context_chunks,
|
| 456 |
}
|
|
@@ -474,11 +560,17 @@ def main():
|
|
| 474 |
|
| 475 |
# Test queries
|
| 476 |
test_queries = [
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
print("=" * 80)
|
| 484 |
print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
|
|
@@ -555,7 +647,7 @@ def main():
|
|
| 555 |
]
|
| 556 |
|
| 557 |
# Filter to only 4 techniques to reduce memory usage
|
| 558 |
-
TECHNIQUES_TO_EVALUATE = ["
|
| 559 |
CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
|
| 560 |
|
| 561 |
# Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
|
|
|
|
| 14 |
from data.ingest import ingest_data, CHUNKING_TECHNIQUES
|
| 15 |
|
| 16 |
# Import model fleet
|
| 17 |
+
from models.qwen_3_5_9b import Qwen_3_5_9B
|
| 18 |
from models.llama_3_8b import Llama3_8B
|
| 19 |
from models.mistral_7b import Mistral_7b
|
| 20 |
from models.tiny_aya import TinyAya
|
| 21 |
|
| 22 |
MODEL_MAP = {
|
| 23 |
+
"Qwen-3.5-9B": Qwen_3_5_9B,
|
| 24 |
+
"TinyAya": TinyAya,
|
| 25 |
"Llama-3-8B": Llama3_8B,
|
| 26 |
"Mistral-7B": Mistral_7b,
|
|
|
|
| 27 |
}
|
| 28 |
|
| 29 |
load_dotenv()
|
|
|
|
| 41 |
print(f"{'='*80}")
|
| 42 |
|
| 43 |
# Use HybridRetriever to retrieve chunks
|
| 44 |
+
retrieval_start_time = time.time()
|
| 45 |
context_chunks, chunk_score = retriever.search(
|
| 46 |
query=query,
|
| 47 |
index=index,
|
|
|
|
| 49 |
rerank_strategy="cross-encoder",
|
| 50 |
use_mmr=use_mmr,
|
| 51 |
top_k=50,
|
| 52 |
+
final_k=4,
|
| 53 |
technique_name=technique_name,
|
| 54 |
verbose=False
|
| 55 |
)
|
| 56 |
|
| 57 |
+
retrieval_time = time.time() - retrieval_start_time
|
| 58 |
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
| 59 |
|
| 60 |
if not context_chunks:
|
|
|
|
| 76 |
tournament_results = {}
|
| 77 |
tournament_results["_ChunkScore"] = chunk_score # Store at technique level, not per model
|
| 78 |
tournament_results["_Strategy"] = strategy_label
|
| 79 |
+
tournament_results["_retrieval_time"] = retrieval_time
|
| 80 |
|
| 81 |
for name, model_inst in models.items():
|
| 82 |
print(f"\n{'-'*60}")
|
|
|
|
| 84 |
print(f"{'-'*60}")
|
| 85 |
try:
|
| 86 |
# Generation
|
| 87 |
+
inference_start_time = time.time()
|
| 88 |
+
answer = rag_engine.get_answer(model_inst, query, context_chunks,
|
| 89 |
model_inst, query, context_chunks,
|
| 90 |
temperature=cfg.gen['temperature']
|
| 91 |
)
|
| 92 |
+
inference_time = time.time() - inference_start_time
|
| 93 |
|
| 94 |
print(f"\n{'─'*60}")
|
| 95 |
print(f"📝 FULL ANSWER from {name}:")
|
|
|
|
| 107 |
"Faithfulness": faith['score'],
|
| 108 |
"Relevancy": rel['score'],
|
| 109 |
"Claims": faith['details'],
|
| 110 |
+
"GenQueries": rel.get('queries', []),
|
| 111 |
+
"retrieval_time": retrieval_time,
|
| 112 |
+
"inference_time": inference_time,
|
| 113 |
+
"total_time": retrieval_time + inference_time,
|
| 114 |
"context_chunks": context_chunks,
|
| 115 |
}
|
| 116 |
|
|
|
|
| 118 |
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 119 |
print(f" Relevancy: {rel['score']:.3f}")
|
| 120 |
print(f" Combined: {faith['score'] + rel['score']:.3f}")
|
| 121 |
+
print(f"⏱️ LATENCY METRICS:")
|
| 122 |
+
print(f" Retrieval: {retrieval_time:.2f}s")
|
| 123 |
+
print(f" Inference: {inference_time:.2f}s")
|
| 124 |
+
print(f" Total Response: {retrieval_time + inference_time:.2f}s")
|
| 125 |
|
| 126 |
except Exception as e:
|
| 127 |
print(f" Error evaluating {name}: {e}")
|
|
|
|
| 130 |
"Faithfulness": 0,
|
| 131 |
"Relevancy": 0,
|
| 132 |
"Claims": [],
|
| 133 |
+
"GenQueries": [],
|
| 134 |
+
"retrieval_time": retrieval_time,
|
| 135 |
+
"inference_time": 0,
|
| 136 |
+
"total_time": retrieval_time,
|
| 137 |
"error": str(e),
|
| 138 |
"context_chunks": context_chunks,
|
| 139 |
}
|
|
|
|
| 205 |
'Faithfulness': [],
|
| 206 |
'Relevancy': [],
|
| 207 |
'answers': [],
|
| 208 |
+
'claims': [],
|
| 209 |
+
'gen_queries': [],
|
| 210 |
'context_chunks': results.get('context_chunks', []),
|
| 211 |
+
'context_urls': results.get('context_urls', []),
|
| 212 |
+
'retrieval_time': [],
|
| 213 |
+
'inference_time': [],
|
| 214 |
+
'total_time': []
|
| 215 |
}
|
| 216 |
|
| 217 |
aggregated_results[technique_name][model_name]['Faithfulness'].append(results.get('Faithfulness', 0))
|
| 218 |
aggregated_results[technique_name][model_name]['Relevancy'].append(results.get('Relevancy', 0))
|
| 219 |
aggregated_results[technique_name][model_name]['answers'].append(results.get('answer', ''))
|
| 220 |
+
aggregated_results[technique_name][model_name]['claims'].append(results.get('Claims', []))
|
| 221 |
+
aggregated_results[technique_name][model_name]['gen_queries'].append(results.get('GenQueries', []))
|
| 222 |
+
if 'retrieval_time' in results:
|
| 223 |
+
aggregated_results[technique_name][model_name]['retrieval_time'].append(results['retrieval_time'])
|
| 224 |
+
if 'inference_time' in results:
|
| 225 |
+
aggregated_results[technique_name][model_name]['inference_time'].append(results['inference_time'])
|
| 226 |
+
if 'total_time' in results:
|
| 227 |
+
aggregated_results[technique_name][model_name]['total_time'].append(results['total_time'])
|
| 228 |
|
| 229 |
# Add results for each technique
|
| 230 |
for technique_name, model_results in aggregated_results.items():
|
|
|
|
| 244 |
content += "\n"
|
| 245 |
|
| 246 |
# Create results table with averaged scores
|
| 247 |
+
content += "| Model | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |\n"
|
| 248 |
+
content += "|-------|------------------|---------------|--------------|---------------|---------------|-----------|\n"
|
| 249 |
|
| 250 |
for model_name, results in model_results.items():
|
| 251 |
avg_faith = sum(results['Faithfulness']) / len(results['Faithfulness']) if results['Faithfulness'] else 0
|
| 252 |
avg_rel = sum(results['Relevancy']) / len(results['Relevancy']) if results['Relevancy'] else 0
|
| 253 |
avg_combined = avg_faith + avg_rel
|
| 254 |
+
|
| 255 |
+
avg_ret = sum(results.get('retrieval_time', [0])) / len(results.get('retrieval_time', [1])) if results.get('retrieval_time') else 0
|
| 256 |
+
avg_inf = sum(results.get('inference_time', [0])) / len(results.get('inference_time', [1])) if results.get('inference_time') else 0
|
| 257 |
+
avg_tot = sum(results.get('total_time', [0])) / len(results.get('total_time', [1])) if results.get('total_time') else 0
|
| 258 |
+
|
| 259 |
+
content += f"| {model_name} | {avg_faith:.1f}% | {avg_rel:.3f} | {avg_combined:.3f} | {avg_ret:.2f}s | {avg_inf:.2f}s | {avg_tot:.2f}s |\n"
|
| 260 |
|
| 261 |
# Find best model for this technique
|
| 262 |
if model_results:
|
|
|
|
| 303 |
# Show answers from each query
|
| 304 |
for q_idx, answer in enumerate(answers):
|
| 305 |
content += f"📝 *Answer for Query {q_idx + 1}:*\n\n"
|
| 306 |
+
content += f"\n{answer}\n\n"
|
| 307 |
+
|
| 308 |
+
# Add extracted claims
|
| 309 |
+
claims = results.get('claims', [])[q_idx] if q_idx < len(results.get('claims', [])) else []
|
| 310 |
+
if claims:
|
| 311 |
+
content += f"**Extracted Claims (Faithfulness):**\n"
|
| 312 |
+
for claim in claims:
|
| 313 |
+
status = "✅" if "Yes" in claim.get('verdict', '') else "❌"
|
| 314 |
+
content += f"- {status} {claim.get('claim', '')}\n"
|
| 315 |
+
content += "\n"
|
| 316 |
+
|
| 317 |
+
# Add generated queries
|
| 318 |
+
gen_queries = results.get('gen_queries', [])[q_idx] if q_idx < len(results.get('gen_queries', [])) else []
|
| 319 |
+
if gen_queries:
|
| 320 |
+
content += f"**Generated Queries (Relevancy):**\n"
|
| 321 |
+
for q in gen_queries:
|
| 322 |
+
content += f"- {q}\n"
|
| 323 |
+
content += "\n"
|
| 324 |
+
|
| 325 |
+
content += "\n"
|
| 326 |
|
| 327 |
content += "---\n\n"
|
| 328 |
|
|
|
|
| 331 |
|
| 332 |
### Overall Performance Ranking (Across All Queries)
|
| 333 |
|
| 334 |
+
| Rank | Technique | Avg Faithfulness | Avg Relevancy | Avg Combined | Avg Retrieval | Avg Inference | Avg Total |
|
| 335 |
+
|------|-----------|------------------|---------------|--------------|---------------|---------------|-----------|
|
| 336 |
"""
|
| 337 |
|
| 338 |
# Calculate averages for each technique across all queries
|
|
|
|
| 348 |
avg_faith = sum(all_faith) / len(all_faith) if all_faith else 0
|
| 349 |
avg_rel = sum(all_rel) / len(all_rel) if all_rel else 0
|
| 350 |
avg_combined = avg_faith + avg_rel
|
| 351 |
+
all_ret = []
|
| 352 |
+
all_inf = []
|
| 353 |
+
all_tot = []
|
| 354 |
+
for r in model_results.values():
|
| 355 |
+
all_ret.extend(r.get('retrieval_time', [0]))
|
| 356 |
+
all_inf.extend(r.get('inference_time', [0]))
|
| 357 |
+
all_tot.extend(r.get('total_time', [0]))
|
| 358 |
+
|
| 359 |
technique_averages[technique_name] = {
|
| 360 |
'faith': avg_faith,
|
| 361 |
'rel': avg_rel,
|
| 362 |
+
'combined': avg_combined,
|
| 363 |
+
'ret': sum(all_ret)/len(all_ret) if all_ret else 0,
|
| 364 |
+
'inf': sum(all_inf)/len(all_inf) if all_inf else 0,
|
| 365 |
+
'tot': sum(all_tot)/len(all_tot) if all_tot else 0
|
| 366 |
}
|
| 367 |
|
| 368 |
# Sort by combined score
|
|
|
|
| 373 |
)
|
| 374 |
|
| 375 |
for rank, (technique_name, averages) in enumerate(sorted_techniques, 1):
|
| 376 |
+
content += f"| {rank} | {technique_name} | {averages['faith']:.1f}% | {averages['rel']:.3f} | {averages['combined']:.3f} | {averages['ret']:.2f}s | {averages['inf']:.2f}s | {averages['tot']:.2f}s |\n"
|
| 377 |
|
| 378 |
content += """
|
| 379 |
### Key Findings
|
|
|
|
| 429 |
return output_file
|
| 430 |
|
| 431 |
|
| 432 |
+
import time
|
| 433 |
def run_rag_for_technique_sequential(technique_name, query, index, encoder, models, evaluator, rag_engine, retriever, retrieval_strategy):
|
| 434 |
"""Run RAG pipeline for a specific chunking technique and retrieval strategy (sequential)."""
|
| 435 |
|
|
|
|
| 442 |
print(f"{'='*80}")
|
| 443 |
|
| 444 |
# Use HybridRetriever to retrieve chunks
|
| 445 |
+
retrieval_start_time = time.time()
|
| 446 |
context_chunks, chunk_score = retriever.search(
|
| 447 |
query=query,
|
| 448 |
index=index,
|
|
|
|
| 450 |
rerank_strategy="cross-encoder",
|
| 451 |
use_mmr=use_mmr,
|
| 452 |
top_k=50,
|
| 453 |
+
final_k=4,
|
| 454 |
technique_name=technique_name,
|
| 455 |
verbose=False,
|
| 456 |
test=True
|
| 457 |
)
|
| 458 |
|
| 459 |
+
retrieval_time = time.time() - retrieval_start_time
|
| 460 |
print(f"\nRetrieved {len(context_chunks)} chunks for technique '{technique_name}' with strategy '{strategy_label}' (ChunkScore: {chunk_score:.4f})")
|
| 461 |
|
| 462 |
if not context_chunks:
|
|
|
|
| 478 |
tournament_results = {}
|
| 479 |
tournament_results["_ChunkScore"] = chunk_score
|
| 480 |
tournament_results["_Strategy"] = strategy_label
|
| 481 |
+
tournament_results["_retrieval_time"] = retrieval_time
|
| 482 |
|
| 483 |
for name, model_inst in models.items():
|
| 484 |
print(f"\n{'-'*60}")
|
|
|
|
| 486 |
print(f"{'-'*60}")
|
| 487 |
try:
|
| 488 |
# Generation
|
| 489 |
+
inference_start_time = time.time()
|
| 490 |
+
answer = rag_engine.get_answer(model_inst, query, context_chunks,
|
| 491 |
+
temperature=cfg.gen["temperature"]
|
| 492 |
)
|
| 493 |
+
inference_time = time.time() - inference_start_time
|
| 494 |
+
inference_time = time.time() - inference_start_time
|
| 495 |
+
|
| 496 |
|
| 497 |
print(f"\n{'─'*60}")
|
| 498 |
print(f"📝 FULL ANSWER from {name}:")
|
|
|
|
| 510 |
"Faithfulness": faith['score'],
|
| 511 |
"Relevancy": rel['score'],
|
| 512 |
"Claims": faith['details'],
|
| 513 |
+
"GenQueries": rel.get('queries', []),
|
| 514 |
+
"retrieval_time": retrieval_time,
|
| 515 |
+
"inference_time": inference_time,
|
| 516 |
+
"total_time": retrieval_time + inference_time,
|
| 517 |
"context_chunks": context_chunks,
|
| 518 |
}
|
| 519 |
|
|
|
|
| 521 |
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 522 |
print(f" Relevancy: {rel['score']:.3f}")
|
| 523 |
print(f" Combined: {faith['score'] + rel['score']:.3f}")
|
| 524 |
+
print(f"⏱️ LATENCY METRICS:")
|
| 525 |
+
print(f" Retrieval: {retrieval_time:.2f}s")
|
| 526 |
+
print(f" Inference: {inference_time:.2f}s")
|
| 527 |
+
print(f" Total Response: {retrieval_time + inference_time:.2f}s")
|
| 528 |
|
| 529 |
except Exception as e:
|
| 530 |
print(f" Error evaluating {name}: {e}")
|
|
|
|
| 533 |
"Faithfulness": 0,
|
| 534 |
"Relevancy": 0,
|
| 535 |
"Claims": [],
|
| 536 |
+
"GenQueries": [],
|
| 537 |
+
"retrieval_time": retrieval_time,
|
| 538 |
+
"inference_time": 0,
|
| 539 |
+
"total_time": retrieval_time,
|
| 540 |
"error": str(e),
|
| 541 |
"context_chunks": context_chunks,
|
| 542 |
}
|
|
|
|
| 560 |
|
| 561 |
# Test queries
|
| 562 |
test_queries = [
|
| 563 |
+
"What is cognitive behavior therapy and how does it work?",
|
| 564 |
+
"I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
|
| 565 |
+
"No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
|
| 566 |
+
"I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
|
| 567 |
+
"My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
|
| 568 |
+
"Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
|
| 569 |
+
"I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
|
| 570 |
+
"Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
|
| 571 |
+
"I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
|
| 572 |
+
"What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
|
| 573 |
+
]
|
| 574 |
|
| 575 |
print("=" * 80)
|
| 576 |
print("RAG ABLATION STUDY - 6 CHUNKING TECHNIQUES")
|
|
|
|
| 647 |
]
|
| 648 |
|
| 649 |
# Filter to only 4 techniques to reduce memory usage
|
| 650 |
+
TECHNIQUES_TO_EVALUATE = ["recursive",'semantic','fixed','markdown','sentence','paragraph'] # You can adjust this list to test different techniques
|
| 651 |
CHUNKING_TECHNIQUES_FILTERED = [t for t in CHUNKING_TECHNIQUES if t['name'] in TECHNIQUES_TO_EVALUATE]
|
| 652 |
|
| 653 |
# Step 3: Run RAG for all techniques x strategies SEQUENTIALLY (to avoid OOM)
|
models/deepseek_v3.py
CHANGED
|
@@ -21,5 +21,5 @@ class DeepSeek_V3:
|
|
| 21 |
except Exception as e:
|
| 22 |
yield f" DeepSeek API Busy: {e}"
|
| 23 |
|
| 24 |
-
def generate(self, prompt, max_tokens=
|
| 25 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
|
|
|
| 21 |
except Exception as e:
|
| 22 |
yield f" DeepSeek API Busy: {e}"
|
| 23 |
|
| 24 |
+
def generate(self, prompt, max_tokens=1500, temperature=0.1):
|
| 25 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
models/llama_3_8b.py
CHANGED
|
@@ -5,7 +5,7 @@ class Llama3_8B:
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 7 |
|
| 8 |
-
def generate_stream(self, prompt, max_tokens=
|
| 9 |
for message in self.client.chat_completion(
|
| 10 |
model=self.model_id,
|
| 11 |
messages=[{"role": "user", "content": prompt}],
|
|
@@ -18,5 +18,5 @@ class Llama3_8B:
|
|
| 18 |
if content:
|
| 19 |
yield content
|
| 20 |
|
| 21 |
-
def generate(self, prompt, max_tokens=
|
| 22 |
-
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
|
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 7 |
|
| 8 |
+
def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
|
| 9 |
for message in self.client.chat_completion(
|
| 10 |
model=self.model_id,
|
| 11 |
messages=[{"role": "user", "content": prompt}],
|
|
|
|
| 18 |
if content:
|
| 19 |
yield content
|
| 20 |
|
| 21 |
+
def generate(self, prompt, max_tokens=1000, temperature=0.1):
|
| 22 |
+
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
models/mistral_7b.py
CHANGED
|
@@ -1,29 +1,25 @@
|
|
| 1 |
from huggingface_hub import InferenceClient
|
| 2 |
-
import os
|
| 3 |
|
| 4 |
class Mistral_7b:
|
| 5 |
def __init__(self, token):
|
| 6 |
-
self.client = InferenceClient(
|
| 7 |
-
|
| 8 |
-
# Keep a sane default and allow override via env for experimentation.
|
| 9 |
-
self.model_id = os.getenv("MISTRAL_MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")
|
| 10 |
|
| 11 |
-
def generate_stream(self, prompt, max_tokens=
|
| 12 |
try:
|
| 13 |
-
|
| 14 |
model=self.model_id,
|
| 15 |
messages=[{"role": "user", "content": prompt}],
|
| 16 |
max_tokens=max_tokens,
|
| 17 |
temperature=temperature,
|
| 18 |
-
stream=True,
|
| 19 |
-
)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
except Exception as e:
|
| 26 |
-
yield f"
|
| 27 |
|
| 28 |
-
def generate(self, prompt, max_tokens=
|
| 29 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
|
|
|
| 1 |
from huggingface_hub import InferenceClient
|
|
|
|
| 2 |
|
| 3 |
class Mistral_7b:
    """Streaming chat wrapper around Mistral-7B-Instruct on the HF Inference API."""

    def __init__(self, token):
        # Single shared client; model id is pinned to the v0.2 instruct checkpoint.
        self.client = InferenceClient(token=token)
        self.model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
        """Yield response text deltas as they arrive.

        Best-effort: any failure (network, auth, API error) is surfaced
        in-band as a single error chunk rather than raised to the caller.
        """
        try:
            stream = self.client.chat_completion(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True, extra_body={"reasoning": "none"},
            )
            for chunk in stream:
                if not chunk.choices:
                    continue
                piece = chunk.choices[0].delta.content
                if piece:
                    yield piece
        except Exception as exc:
            # Keep the tournament loop alive; emit the failure as text.
            yield f" Mistral_7b Error: {exc}"

    def generate(self, prompt, max_tokens=1000, temperature=0.1):
        """Blocking variant: concatenate the full streamed response."""
        pieces = self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature)
        return "".join(pieces)
|
models/qwen_2_5.py
CHANGED
|
@@ -5,7 +5,7 @@ class Qwen2_5:
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "Qwen/Qwen2.5-72B-Instruct"
|
| 7 |
|
| 8 |
-
def generate_stream(self, prompt, max_tokens=
|
| 9 |
for message in self.client.chat_completion(
|
| 10 |
model=self.model_id,
|
| 11 |
messages=[{"role": "user", "content": prompt}],
|
|
@@ -18,5 +18,5 @@ class Qwen2_5:
|
|
| 18 |
if content:
|
| 19 |
yield content
|
| 20 |
|
| 21 |
-
def generate(self, prompt, max_tokens=
|
| 22 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
|
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "Qwen/Qwen2.5-72B-Instruct"
|
| 7 |
|
| 8 |
+
def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
|
| 9 |
for message in self.client.chat_completion(
|
| 10 |
model=self.model_id,
|
| 11 |
messages=[{"role": "user", "content": prompt}],
|
|
|
|
| 18 |
if content:
|
| 19 |
yield content
|
| 20 |
|
| 21 |
+
def generate(self, prompt, max_tokens=1000, temperature=0.1):
|
| 22 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
models/qwen_3_5_9b.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from openai import OpenAI
|
| 3 |
+
|
| 4 |
+
class Qwen_3_5_9B:
    """Qwen 3.5 9B contestant, served through OpenRouter's OpenAI-compatible API."""

    def __init__(self, token=None):
        # Prefer the dedicated OpenRouter key; fall back to the token argument
        # so this constructor matches the other model wrappers' signature.
        openrouter_token = os.getenv("OPENROUTER_API_KEY")
        if not openrouter_token:
            print("Warning: OPENROUTER_API_KEY environment variable is not set")
            # Fallback to the token passed in if available
            openrouter_token = token

        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=openrouter_token,
        )
        self.model_id = "qwen/qwen3.5-9b"

    def generate_stream(self, prompt, max_tokens=100000, temperature=0.1):
        """Yield response text deltas as they arrive.

        Wrapped in try/except so an API failure is surfaced in-band as a
        single error chunk — consistent with the Mistral_7b and TinyAya
        wrappers — instead of raising out of the generator and crashing
        the tournament loop.
        """
        try:
            for message in self.client.chat.completions.create(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            ):
                # Truthiness covers the empty-choices case; no len() needed.
                if message.choices:
                    content = message.choices[0].delta.content
                    if content:
                        yield content
        except Exception as e:
            yield f" Qwen_3_5_9B Error: {e}"

    def generate(self, prompt, max_tokens=100000, temperature=0.1):
        """Non-streaming convenience: join all streamed chunks into one string."""
        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
models/tiny_aya.py
CHANGED
|
@@ -5,7 +5,7 @@ class TinyAya:
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "CohereLabs/tiny-aya-global"
|
| 7 |
|
| 8 |
-
def generate_stream(self, prompt, max_tokens=
|
| 9 |
try:
|
| 10 |
for message in self.client.chat_completion(
|
| 11 |
model=self.model_id,
|
|
@@ -21,5 +21,5 @@ class TinyAya:
|
|
| 21 |
except Exception as e:
|
| 22 |
yield f" TinyAya Error: {e}"
|
| 23 |
|
| 24 |
-
def generate(self, prompt, max_tokens=
|
| 25 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
|
|
|
| 5 |
self.client = InferenceClient(token=token)
|
| 6 |
self.model_id = "CohereLabs/tiny-aya-global"
|
| 7 |
|
| 8 |
+
def generate_stream(self, prompt, max_tokens=1000, temperature=0.1):
|
| 9 |
try:
|
| 10 |
for message in self.client.chat_completion(
|
| 11 |
model=self.model_id,
|
|
|
|
| 21 |
except Exception as e:
|
| 22 |
yield f" TinyAya Error: {e}"
|
| 23 |
|
| 24 |
+
def generate(self, prompt, max_tokens=1000, temperature=0.1):
|
| 25 |
return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))
|
retriever/retriever.py
CHANGED
|
@@ -337,6 +337,7 @@ class HybridRetriever:
|
|
| 337 |
semantic_start = time.perf_counter()
|
| 338 |
query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
|
| 339 |
semantic_time = time.perf_counter() - semantic_start
|
|
|
|
| 340 |
if should_print:
|
| 341 |
self._print_candidates("Semantic Search", semantic_chunks)
|
| 342 |
print(f"Semantic time: {semantic_time:.3f}s")
|
|
@@ -345,6 +346,7 @@ class HybridRetriever:
|
|
| 345 |
bm25_start = time.perf_counter()
|
| 346 |
bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
|
| 347 |
bm25_time = time.perf_counter() - bm25_start
|
|
|
|
| 348 |
if should_print:
|
| 349 |
self._print_candidates("BM25 Search", bm25_chunks)
|
| 350 |
print(f"BM25 time: {bm25_time:.3f}s")
|
|
@@ -360,8 +362,28 @@ class HybridRetriever:
|
|
| 360 |
label = "RRF"
|
| 361 |
elif rerank_strategy == "cross-encoder":
|
| 362 |
combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
|
|
|
|
| 363 |
candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
|
|
|
|
| 364 |
label = "Cross-Encoder"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
else: # "none"
|
| 366 |
candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
|
| 367 |
label = "No Reranking"
|
|
|
|
| 337 |
semantic_start = time.perf_counter()
|
| 338 |
query_vector, semantic_chunks = self._semantic_search(query, index, top_k, requested_technique)
|
| 339 |
semantic_time = time.perf_counter() - semantic_start
|
| 340 |
+
print(f"[DEBUG-FLOW] retrieved {len(semantic_chunks)} chunks from semantic search", flush=True)
|
| 341 |
if should_print:
|
| 342 |
self._print_candidates("Semantic Search", semantic_chunks)
|
| 343 |
print(f"Semantic time: {semantic_time:.3f}s")
|
|
|
|
| 346 |
bm25_start = time.perf_counter()
|
| 347 |
bm25_chunks = self._bm25_search(query, index, top_k, requested_technique)
|
| 348 |
bm25_time = time.perf_counter() - bm25_start
|
| 349 |
+
print(f"[DEBUG-FLOW] retrieved {len(bm25_chunks)} chunks from BM25 search", flush=True)
|
| 350 |
if should_print:
|
| 351 |
self._print_candidates("BM25 Search", bm25_chunks)
|
| 352 |
print(f"BM25 time: {bm25_time:.3f}s")
|
|
|
|
| 362 |
label = "RRF"
|
| 363 |
elif rerank_strategy == "cross-encoder":
|
| 364 |
combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
|
| 365 |
+
print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into cross-encoder", flush=True)
|
| 366 |
candidates, chunk_scores = self._cross_encoder_rerank(query, combined, final_k)
|
| 367 |
+
print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of cross-encoder", flush=True)
|
| 368 |
label = "Cross-Encoder"
|
| 369 |
+
elif rerank_strategy == "voyage":
|
| 370 |
+
import voyageai
|
| 371 |
+
voyage_client = voyageai.Client()
|
| 372 |
+
combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))
|
| 373 |
+
print(f"[DEBUG-FLOW] {len(combined)} unique chunks went into voyage reranker", flush=True)
|
| 374 |
+
if not combined:
|
| 375 |
+
candidates, chunk_scores = [], []
|
| 376 |
+
else:
|
| 377 |
+
try:
|
| 378 |
+
reranking = voyage_client.rerank(query=query, documents=combined, model=self.rerank_model_name, top_k=final_k)
|
| 379 |
+
candidates = [r.document for r in reranking.results]
|
| 380 |
+
chunk_scores = [r.relevance_score for r in reranking.results]
|
| 381 |
+
print(f"[DEBUG-FLOW] {len(candidates)} chunks got out of voyage reranker", flush=True)
|
| 382 |
+
except Exception as e:
|
| 383 |
+
print(f"Error calling Voyage API: {e}")
|
| 384 |
+
candidates = combined[:final_k]
|
| 385 |
+
chunk_scores = []
|
| 386 |
+
label = "Voyage"
|
| 387 |
else: # "none"
|
| 388 |
candidates = list(dict.fromkeys(semantic_chunks + bm25_chunks))[:final_k]
|
| 389 |
label = "No Reranking"
|