soft.engineer committed on
Commit
dfabcc2
·
1 Parent(s): 2813e41

Add Dockerfile; record query tags and default k=10 in eval; show nDCG, MRR, and tags in report examples

Browse files
Files changed (3) hide show
  1. Dockerfile +35 -0
  2. core/eval.py +9 -1
  3. core/report_generator.py +51 -6
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ git \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ # Copy requirements first (for better caching)
12
+ COPY requirements.txt .
13
+
14
+ # Install Python dependencies
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Models download automatically on first run (SentenceTransformers, spaCy, etc.)
18
+ # No manual download needed
19
+
20
+ # Copy application
21
+ COPY . .
22
+
23
+ # Create necessary directories
24
+ RUN mkdir -p reports chroma_data
25
+
26
+ # Expose port
27
+ EXPOSE 7860
28
+
29
+ # Set environment variables
30
+ ENV GRADIO_SERVER_PORT=7860
31
+ ENV CHROMA_PERSIST_DIR=/app/chroma_data
32
+
33
+ # Run application
34
+ CMD ["python", "app.py"]
35
+
core/eval.py CHANGED
@@ -225,15 +225,23 @@ class RAGEvaluator:
225
  if user_sat_score is None:
226
  user_sat_score = user_satisfaction.get(f'query_{i}')
227
 
 
 
 
 
 
 
228
  query_result = {
229
  'query_id': query_id,
230
  'query': query,
231
  'ground_truth': query_data.get('ground_truth', []),
232
  'user_satisfaction': user_sat_score,
 
 
233
  'pipelines': {}
234
  }
235
 
236
- k_values = query_data.get('k_values', [1, 3, 5])
237
 
238
  for pipeline in pipelines:
239
  pipeline_results = {}
 
225
  if user_sat_score is None:
226
  user_sat_score = user_satisfaction.get(f'query_{i}')
227
 
228
+ # Extract tags and tag_operator for representative examples
229
+ tags = query_data.get('tags', []) or []
230
+ if isinstance(tags, str):
231
+ tags = [t.strip() for t in tags.split(',') if t.strip()]
232
+ tag_operator = query_data.get('tag_operator', 'OR')
233
+
234
  query_result = {
235
  'query_id': query_id,
236
  'query': query,
237
  'ground_truth': query_data.get('ground_truth', []),
238
  'user_satisfaction': user_sat_score,
239
+ 'tags': tags,
240
+ 'tag_operator': tag_operator,
241
  'pipelines': {}
242
  }
243
 
244
+ k_values = query_data.get('k_values', [1, 3, 5, 10]) # Include k=10 as per requirements
245
 
246
  for pipeline in pipelines:
247
  pipeline_results = {}
core/report_generator.py CHANGED
@@ -166,12 +166,29 @@ class ReportGenerator:
166
  query_id = row['query_id']
167
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
168
  if query_result:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  examples['best_performing'].append({
170
  'query': query_result['query'],
171
  'pipeline': row['pipeline'],
172
  'precision_at_k': row['precision_at_k'],
173
  'ndcg_at_k': row['ndcg_at_k'],
174
- 'mrr': row['mrr']
 
 
175
  })
176
 
177
  # Worst performing queries
@@ -180,12 +197,19 @@ class ReportGenerator:
180
  query_id = row['query_id']
181
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
182
  if query_result:
 
 
 
 
 
183
  examples['worst_performing'].append({
184
  'query': query_result['query'],
185
  'pipeline': row['pipeline'],
186
  'precision_at_k': row['precision_at_k'],
187
  'ndcg_at_k': row['ndcg_at_k'],
188
- 'mrr': row['mrr']
 
 
189
  })
190
 
191
  # Most improved (hybrid vs baseline)
@@ -200,11 +224,18 @@ class ReportGenerator:
200
  for query_id in improvement.index:
201
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
202
  if query_result:
 
 
 
 
 
203
  examples['most_improved'].append({
204
  'query': query_result['query'],
205
  'baseline_precision': baseline_df.loc[query_id, 'precision_at_k'],
206
  'hybrid_precision': hybrid_df.loc[query_id, 'precision_at_k'],
207
- 'improvement': improvement[query_id]
 
 
208
  })
209
 
210
  return examples
@@ -327,20 +358,34 @@ class ReportGenerator:
327
  for example in examples['best_performing']:
328
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
329
  html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
330
- html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}</div>"
 
 
 
 
331
 
332
  if examples.get('worst_performing'):
333
  html += "<h3>Worst Performing Queries</h3>"
334
  for example in examples['worst_performing']:
335
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
336
  html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
337
- html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}</div>"
 
 
 
 
338
 
339
  if examples.get('most_improved'):
340
  html += "<h3>Most Improved Queries (Hybrid vs Baseline)</h3>"
341
  for example in examples['most_improved']:
342
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
343
- html += f"<strong>Improvement:</strong> +{example['improvement']:.3f}</div>"
 
 
 
 
 
 
344
 
345
  if not html:
346
  return "<p>No representative examples available.</p>"
 
166
  query_id = row['query_id']
167
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
168
  if query_result:
169
+ # Extract tags from query result if available
170
+ tags_used = query_result.get('tags', []) or []
171
+ if isinstance(tags_used, str):
172
+ tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
173
+
174
+ # Get tags from retrieved documents (from pipeline results)
175
+ tags_in_results = []
176
+ if 'pipelines' in query_result:
177
+ pipeline_data = query_result['pipelines'].get(row['pipeline'], {})
178
+ # Try to extract tags from any k value's results
179
+ for k_val, pipeline_result in pipeline_data.items():
180
+ if isinstance(pipeline_result, dict) and 'sources' in str(pipeline_result):
181
+ # Tags would be in sources metadata, extract if available
182
+ pass # Tags in sources metadata are complex to extract here
183
+
184
  examples['best_performing'].append({
185
  'query': query_result['query'],
186
  'pipeline': row['pipeline'],
187
  'precision_at_k': row['precision_at_k'],
188
  'ndcg_at_k': row['ndcg_at_k'],
189
+ 'mrr': row['mrr'],
190
+ 'tags_used': tags_used,
191
+ 'tag_operator': query_result.get('tag_operator', 'N/A')
192
  })
193
 
194
  # Worst performing queries
 
197
  query_id = row['query_id']
198
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
199
  if query_result:
200
+ # Extract tags from query result if available
201
+ tags_used = query_result.get('tags', []) or []
202
+ if isinstance(tags_used, str):
203
+ tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
204
+
205
  examples['worst_performing'].append({
206
  'query': query_result['query'],
207
  'pipeline': row['pipeline'],
208
  'precision_at_k': row['precision_at_k'],
209
  'ndcg_at_k': row['ndcg_at_k'],
210
+ 'mrr': row['mrr'],
211
+ 'tags_used': tags_used,
212
+ 'tag_operator': query_result.get('tag_operator', 'N/A')
213
  })
214
 
215
  # Most improved (hybrid vs baseline)
 
224
  for query_id in improvement.index:
225
  query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
226
  if query_result:
227
+ # Extract tags from query result if available
228
+ tags_used = query_result.get('tags', []) or []
229
+ if isinstance(tags_used, str):
230
+ tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
231
+
232
  examples['most_improved'].append({
233
  'query': query_result['query'],
234
  'baseline_precision': baseline_df.loc[query_id, 'precision_at_k'],
235
  'hybrid_precision': hybrid_df.loc[query_id, 'precision_at_k'],
236
+ 'improvement': improvement[query_id],
237
+ 'tags_used': tags_used,
238
+ 'tag_operator': query_result.get('tag_operator', 'N/A')
239
  })
240
 
241
  return examples
 
358
  for example in examples['best_performing']:
359
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
360
  html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
361
+ html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}, <strong>nDCG@5:</strong> {example['ndcg_at_k']:.3f}, <strong>MRR:</strong> {example['mrr']:.3f}<br>"
362
+ tags_used = example.get('tags_used', [])
363
+ if tags_used:
364
+ html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
365
+ html += "</div>"
366
 
367
  if examples.get('worst_performing'):
368
  html += "<h3>Worst Performing Queries</h3>"
369
  for example in examples['worst_performing']:
370
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
371
  html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
372
+ html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}, <strong>nDCG@5:</strong> {example['ndcg_at_k']:.3f}, <strong>MRR:</strong> {example['mrr']:.3f}<br>"
373
+ tags_used = example.get('tags_used', [])
374
+ if tags_used:
375
+ html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
376
+ html += "</div>"
377
 
378
  if examples.get('most_improved'):
379
  html += "<h3>Most Improved Queries (Hybrid vs Baseline)</h3>"
380
  for example in examples['most_improved']:
381
  html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
382
+ html += f"<strong>Baseline Precision@5:</strong> {example['baseline_precision']:.3f}<br>"
383
+ html += f"<strong>Hybrid Precision@5:</strong> {example['hybrid_precision']:.3f}<br>"
384
+ html += f"<strong>Improvement:</strong> +{example['improvement']:.3f}<br>"
385
+ tags_used = example.get('tags_used', [])
386
+ if tags_used:
387
+ html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
388
+ html += "</div>"
389
 
390
  if not html:
391
  return "<p>No representative examples available.</p>"