Qar-Raz commited on
Commit
8f37cc7
·
verified ·
1 Parent(s): 860aa5d

Sync backend Docker context from GitHub main

Browse files
Files changed (4) hide show
  1. backend/schemas.py +1 -1
  2. data/ingest.py +42 -42
  3. retriever/retriever.py +3 -3
  4. test.py +29 -18
backend/schemas.py CHANGED
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
9
  class PredictRequest(BaseModel):
10
  query: str = Field(..., min_length=1, description="User query text")
11
  model: str = Field(default="Llama-3-8B", description="Model name key")
12
- top_k: int = Field(default=10, ge=1, le=20)
13
  final_k: int = Field(default=3, ge=1, le=8)
14
  chunking_technique: str = Field(default="all", description="all | fixed | sentence | paragraph | semantic | recursive | page | markdown")
15
  mode: str = Field(default="hybrid", description="semantic | bm25 | hybrid")
 
9
  class PredictRequest(BaseModel):
10
  query: str = Field(..., min_length=1, description="User query text")
11
  model: str = Field(default="Llama-3-8B", description="Model name key")
12
+ top_k: int = Field(default=50, ge=1, le=100)
13
  final_k: int = Field(default=3, ge=1, le=8)
14
  chunking_technique: str = Field(default="all", description="all | fixed | sentence | paragraph | semantic | recursive | page | markdown")
15
  mode: str = Field(default="hybrid", description="semantic | bm25 | hybrid")
data/ingest.py CHANGED
@@ -15,55 +15,55 @@ from retriever.processor import ChunkProcessor
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
18
- {
19
- "name": "fixed",
20
- "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
- "chunk_size": 1000,
22
- "chunk_overlap": 100,
23
- "kwargs": {"separator": ""}, # No separator for fixed splitting
24
- },
25
- {
26
- "name": "sentence",
27
- "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
- "chunk_size": 1000,
29
- "chunk_overlap": 100,
30
- "kwargs": {},
31
- },
32
- {
33
- "name": "paragraph",
34
- "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
- "chunk_size": 2500,
36
- "chunk_overlap": 100,
37
- "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
- },
39
  # {
40
- # "name": "semantic",
41
- # "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
- # "chunk_size": 2000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # "chunk_overlap": 100,
44
- # "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
  # },
46
  {
47
- "name": "recursive",
48
- "description": "Recursive chunking - hierarchical splitting (paragraphs sentences → words → chars)",
49
  "chunk_size": 2000,
50
  "chunk_overlap": 100,
51
- "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
- },
53
- {
54
- "name": "page",
55
- "description": "Page-level chunking - uses entire book pages as-is",
56
- "chunk_size": 10000, # Very large to keep full pages
57
- "chunk_overlap": 0, # No overlap between pages
58
- "kwargs": {"separator": "--- Page"}, # Split on page markers
59
- },
60
- {
61
- "name": "markdown",
62
- "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
63
- "chunk_size": 4000, # Max 4k chars per chunk
64
- "chunk_overlap": 0, # No overlap for markdown
65
- "kwargs": {}, # Custom implementation
66
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  ]
68
 
69
 
 
15
 
16
  # 6 different chunking techniques for ablation study
17
  CHUNKING_TECHNIQUES = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # {
19
+ # "name": "fixed",
20
+ # "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
21
+ # "chunk_size": 1000,
22
+ # "chunk_overlap": 100,
23
+ # "kwargs": {"separator": ""}, # No separator for fixed splitting
24
+ # },
25
+ # {
26
+ # "name": "sentence",
27
+ # "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
+ # "chunk_size": 1000,
29
+ # "chunk_overlap": 100,
30
+ # "kwargs": {},
31
+ # },
32
+ # {
33
+ # "name": "paragraph",
34
+ # "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
+ # "chunk_size": 2500,
36
  # "chunk_overlap": 100,
37
+ # "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
  # },
39
  {
40
+ "name": "semantic",
41
+ "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
  "chunk_size": 2000,
43
  "chunk_overlap": 100,
44
+ "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
46
+ # {
47
+ # "name": "recursive",
48
+ # "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
49
+ # "chunk_size": 2000,
50
+ # "chunk_overlap": 100,
51
+ # "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
+ # },
53
+ # {
54
+ # "name": "page",
55
+ # "description": "Page-level chunking - uses entire book pages as-is",
56
+ # "chunk_size": 10000, # Very large to keep full pages
57
+ # "chunk_overlap": 0, # No overlap between pages
58
+ # "kwargs": {"separator": "--- Page"}, # Split on page markers
59
+ # },
60
+ # {
61
+ # "name": "markdown",
62
+ # "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
63
+ # "chunk_size": 4000, # Max 4k chars per chunk
64
+ # "chunk_overlap": 0, # No overlap for markdown
65
+ # "kwargs": {}, # Custom implementation
66
+ # },
67
  ]
68
 
69
 
retriever/retriever.py CHANGED
@@ -324,9 +324,6 @@ class HybridRetriever:
324
  rerank_time = 0.0
325
  mmr_time = 0.0
326
 
327
- if use_mmr:
328
- final_k = 10
329
-
330
  if should_print:
331
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
332
  if requested_technique:
@@ -383,6 +380,9 @@ class HybridRetriever:
383
  label += " + MMR"
384
  mmr_time = time.perf_counter() - mmr_start
385
 
 
 
 
386
  if test and rerank_strategy != "cross-encoder" and candidates:
387
  _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
388
  avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0
 
324
  rerank_time = 0.0
325
  mmr_time = 0.0
326
 
 
 
 
327
  if should_print:
328
  self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
329
  if requested_technique:
 
380
  label += " + MMR"
381
  mmr_time = time.perf_counter() - mmr_start
382
 
383
+ # Safety cap: always honor requested final_k regardless of retrieval strategy.
384
+ candidates = candidates[:final_k]
385
+
386
  if test and rerank_strategy != "cross-encoder" and candidates:
387
  _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
388
  avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0
test.py CHANGED
@@ -35,14 +35,14 @@ def generate_retrieval_report(all_results, queries, output_file="retrieval_repor
35
  chunks = chunks_data.get('chunks', [])
36
  score = chunks_data.get('score', 0)
37
 
38
- content += f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n"
39
 
40
  if not chunks:
41
- content += "*No chunks retrieved.*\n\n"
42
  else:
43
  for i, chunk in enumerate(chunks, 1):
44
- content += f"**[Chunk {i}]** ({len(chunk)} chars):\n"
45
- content += f"```text\n{chunk}\n```\n\n"
46
 
47
  content += "---\n\n"
48
 
@@ -61,11 +61,17 @@ def main():
61
  raise RuntimeError("PINECONE_API_KEY not found in environment variables")
62
 
63
  test_queries = [
64
- "What is cognitive behavior therapy and how does it work?",
65
- "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
66
- "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
67
- "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
68
- ]
 
 
 
 
 
 
69
 
70
  # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
71
  # Use all 7 chunking techniques from ingest.py
@@ -75,11 +81,16 @@ def main():
75
  print(f" - {tech['name']}: {tech['description']}")
76
 
77
  RETRIEVAL_STRATEGIES = [
78
- {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
79
- {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr"},
80
- {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
81
- {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr"},
82
- {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr"},
 
 
 
 
 
83
  ]
84
 
85
  print("Initializing ChunkProcessor to load Embedding Model...")
@@ -117,7 +128,7 @@ def main():
117
  technique_name = technique['name']
118
 
119
  for strategy in RETRIEVAL_STRATEGIES:
120
- result_key = f"{technique_name} + {strategy['label']}"
121
  print(f"\nEvaluating: {result_key}")
122
 
123
  try:
@@ -125,9 +136,9 @@ def main():
125
  query=query,
126
  index=index,
127
  mode=strategy['mode'],
128
- rerank_strategy="cross-encoder",
129
  use_mmr=strategy['use_mmr'],
130
- top_k=25,
131
  final_k=4,
132
  technique_name=technique_name,
133
  verbose=False,
@@ -150,4 +161,4 @@ def main():
150
 
151
 
152
  if __name__ == '__main__':
153
- main()
 
35
  chunks = chunks_data.get('chunks', [])
36
  score = chunks_data.get('score', 0)
37
 
38
+ content += f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n"
39
 
40
  if not chunks:
41
+ content += "*No chunks retrieved.*\n\n"
42
  else:
43
  for i, chunk in enumerate(chunks, 1):
44
+ content += f"**[Chunk {i}]** ({len(chunk)} chars):\n"
45
+ content += f"```text\n{chunk}\n```\n\n"
46
 
47
  content += "---\n\n"
48
 
 
61
  raise RuntimeError("PINECONE_API_KEY not found in environment variables")
62
 
63
  test_queries = [
64
+ "What is cognitive behavior therapy and how does it work?",
65
+ "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
66
+ "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
67
+ "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
68
+ "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
69
+ "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
70
+ "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
71
+ "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
72
+ "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
73
+ "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
74
+ ]
75
 
76
  # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
77
  # Use all 7 chunking techniques from ingest.py
 
81
  print(f" - {tech['name']}: {tech['description']}")
82
 
83
  RETRIEVAL_STRATEGIES = [
84
+ {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr","rerank_strategy":"cross-encoder"},
85
+ {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr","rerank_strategy":"cross-encoder"},
86
+ {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr","rerank_strategy":"none"},
87
+ {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr","rerank_strategy":"none"},
88
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr","rerank_strategy":"cross-encoder"},
89
+ {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr","rerank_strategy":"rrf"},
90
+ {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr","rerank_strategy":"cross-encoder"},
91
+ {"mode": "hybrid", "use_mmr": False, "label": "hybrid-with-mmr","rerank_strategy":"rrf"},
92
+ {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr","rerank_strategy":"cross-encoder"},
93
+ {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr","rerank_strategy":"none"},
94
  ]
95
 
96
  print("Initializing ChunkProcessor to load Embedding Model...")
 
128
  technique_name = technique['name']
129
 
130
  for strategy in RETRIEVAL_STRATEGIES:
131
+ result_key = f"{technique_name} + {strategy['label']} + {strategy['rerank_strategy']}"
132
  print(f"\nEvaluating: {result_key}")
133
 
134
  try:
 
136
  query=query,
137
  index=index,
138
  mode=strategy['mode'],
139
+ rerank_strategy=strategy['rerank_strategy'],
140
  use_mmr=strategy['use_mmr'],
141
+ top_k=50,
142
  final_k=4,
143
  technique_name=technique_name,
144
  verbose=False,
 
161
 
162
 
163
  if __name__ == '__main__':
164
+ main()