Spaces:
Sleeping
Sleeping
soft.engineer committed on
Commit ·
dfabcc2
1
Parent(s): 2813e41
add some feature
Browse files
- Dockerfile +35 -0
- core/eval.py +9 -1
- core/report_generator.py +51 -6
Dockerfile
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
git \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
# Copy requirements first (for better caching)
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
|
| 14 |
+
# Install Python dependencies
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Models download automatically on first run (SentenceTransformers, spaCy, etc.)
|
| 18 |
+
# No manual download needed
|
| 19 |
+
|
| 20 |
+
# Copy application
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# Create necessary directories
|
| 24 |
+
RUN mkdir -p reports chroma_data
|
| 25 |
+
|
| 26 |
+
# Expose port
|
| 27 |
+
EXPOSE 7860
|
| 28 |
+
|
| 29 |
+
# Set environment variables
|
| 30 |
+
ENV GRADIO_SERVER_PORT=7860
|
| 31 |
+
ENV CHROMA_PERSIST_DIR=/app/chroma_data
|
| 32 |
+
|
| 33 |
+
# Run application
|
| 34 |
+
CMD ["python", "app.py"]
|
| 35 |
+
|
core/eval.py
CHANGED
|
@@ -225,15 +225,23 @@ class RAGEvaluator:
|
|
| 225 |
if user_sat_score is None:
|
| 226 |
user_sat_score = user_satisfaction.get(f'query_{i}')
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
query_result = {
|
| 229 |
'query_id': query_id,
|
| 230 |
'query': query,
|
| 231 |
'ground_truth': query_data.get('ground_truth', []),
|
| 232 |
'user_satisfaction': user_sat_score,
|
|
|
|
|
|
|
| 233 |
'pipelines': {}
|
| 234 |
}
|
| 235 |
|
| 236 |
-
k_values = query_data.get('k_values', [1, 3, 5])
|
| 237 |
|
| 238 |
for pipeline in pipelines:
|
| 239 |
pipeline_results = {}
|
|
|
|
| 225 |
if user_sat_score is None:
|
| 226 |
user_sat_score = user_satisfaction.get(f'query_{i}')
|
| 227 |
|
| 228 |
+
# Extract tags and tag_operator for representative examples
|
| 229 |
+
tags = query_data.get('tags', []) or []
|
| 230 |
+
if isinstance(tags, str):
|
| 231 |
+
tags = [t.strip() for t in tags.split(',') if t.strip()]
|
| 232 |
+
tag_operator = query_data.get('tag_operator', 'OR')
|
| 233 |
+
|
| 234 |
query_result = {
|
| 235 |
'query_id': query_id,
|
| 236 |
'query': query,
|
| 237 |
'ground_truth': query_data.get('ground_truth', []),
|
| 238 |
'user_satisfaction': user_sat_score,
|
| 239 |
+
'tags': tags,
|
| 240 |
+
'tag_operator': tag_operator,
|
| 241 |
'pipelines': {}
|
| 242 |
}
|
| 243 |
|
| 244 |
+
k_values = query_data.get('k_values', [1, 3, 5, 10]) # Include k=10 as per requirements
|
| 245 |
|
| 246 |
for pipeline in pipelines:
|
| 247 |
pipeline_results = {}
|
core/report_generator.py
CHANGED
|
@@ -166,12 +166,29 @@ class ReportGenerator:
|
|
| 166 |
query_id = row['query_id']
|
| 167 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 168 |
if query_result:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
examples['best_performing'].append({
|
| 170 |
'query': query_result['query'],
|
| 171 |
'pipeline': row['pipeline'],
|
| 172 |
'precision_at_k': row['precision_at_k'],
|
| 173 |
'ndcg_at_k': row['ndcg_at_k'],
|
| 174 |
-
'mrr': row['mrr']
|
|
|
|
|
|
|
| 175 |
})
|
| 176 |
|
| 177 |
# Worst performing queries
|
|
@@ -180,12 +197,19 @@ class ReportGenerator:
|
|
| 180 |
query_id = row['query_id']
|
| 181 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 182 |
if query_result:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
examples['worst_performing'].append({
|
| 184 |
'query': query_result['query'],
|
| 185 |
'pipeline': row['pipeline'],
|
| 186 |
'precision_at_k': row['precision_at_k'],
|
| 187 |
'ndcg_at_k': row['ndcg_at_k'],
|
| 188 |
-
'mrr': row['mrr']
|
|
|
|
|
|
|
| 189 |
})
|
| 190 |
|
| 191 |
# Most improved (hybrid vs baseline)
|
|
@@ -200,11 +224,18 @@ class ReportGenerator:
|
|
| 200 |
for query_id in improvement.index:
|
| 201 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 202 |
if query_result:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
examples['most_improved'].append({
|
| 204 |
'query': query_result['query'],
|
| 205 |
'baseline_precision': baseline_df.loc[query_id, 'precision_at_k'],
|
| 206 |
'hybrid_precision': hybrid_df.loc[query_id, 'precision_at_k'],
|
| 207 |
-
'improvement': improvement[query_id]
|
|
|
|
|
|
|
| 208 |
})
|
| 209 |
|
| 210 |
return examples
|
|
@@ -327,20 +358,34 @@ class ReportGenerator:
|
|
| 327 |
for example in examples['best_performing']:
|
| 328 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 329 |
html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
|
| 330 |
-
html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
if examples.get('worst_performing'):
|
| 333 |
html += "<h3>Worst Performing Queries</h3>"
|
| 334 |
for example in examples['worst_performing']:
|
| 335 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 336 |
html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
|
| 337 |
-
html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
if examples.get('most_improved'):
|
| 340 |
html += "<h3>Most Improved Queries (Hybrid vs Baseline)</h3>"
|
| 341 |
for example in examples['most_improved']:
|
| 342 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 343 |
-
html += f"<strong>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
if not html:
|
| 346 |
return "<p>No representative examples available.</p>"
|
|
|
|
| 166 |
query_id = row['query_id']
|
| 167 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 168 |
if query_result:
|
| 169 |
+
# Extract tags from query result if available
|
| 170 |
+
tags_used = query_result.get('tags', []) or []
|
| 171 |
+
if isinstance(tags_used, str):
|
| 172 |
+
tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
|
| 173 |
+
|
| 174 |
+
# Get tags from retrieved documents (from pipeline results)
|
| 175 |
+
tags_in_results = []
|
| 176 |
+
if 'pipelines' in query_result:
|
| 177 |
+
pipeline_data = query_result['pipelines'].get(row['pipeline'], {})
|
| 178 |
+
# Try to extract tags from any k value's results
|
| 179 |
+
for k_val, pipeline_result in pipeline_data.items():
|
| 180 |
+
if isinstance(pipeline_result, dict) and 'sources' in str(pipeline_result):
|
| 181 |
+
# Tags would be in sources metadata, extract if available
|
| 182 |
+
pass # Tags in sources metadata are complex to extract here
|
| 183 |
+
|
| 184 |
examples['best_performing'].append({
|
| 185 |
'query': query_result['query'],
|
| 186 |
'pipeline': row['pipeline'],
|
| 187 |
'precision_at_k': row['precision_at_k'],
|
| 188 |
'ndcg_at_k': row['ndcg_at_k'],
|
| 189 |
+
'mrr': row['mrr'],
|
| 190 |
+
'tags_used': tags_used,
|
| 191 |
+
'tag_operator': query_result.get('tag_operator', 'N/A')
|
| 192 |
})
|
| 193 |
|
| 194 |
# Worst performing queries
|
|
|
|
| 197 |
query_id = row['query_id']
|
| 198 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 199 |
if query_result:
|
| 200 |
+
# Extract tags from query result if available
|
| 201 |
+
tags_used = query_result.get('tags', []) or []
|
| 202 |
+
if isinstance(tags_used, str):
|
| 203 |
+
tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
|
| 204 |
+
|
| 205 |
examples['worst_performing'].append({
|
| 206 |
'query': query_result['query'],
|
| 207 |
'pipeline': row['pipeline'],
|
| 208 |
'precision_at_k': row['precision_at_k'],
|
| 209 |
'ndcg_at_k': row['ndcg_at_k'],
|
| 210 |
+
'mrr': row['mrr'],
|
| 211 |
+
'tags_used': tags_used,
|
| 212 |
+
'tag_operator': query_result.get('tag_operator', 'N/A')
|
| 213 |
})
|
| 214 |
|
| 215 |
# Most improved (hybrid vs baseline)
|
|
|
|
| 224 |
for query_id in improvement.index:
|
| 225 |
query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
|
| 226 |
if query_result:
|
| 227 |
+
# Extract tags from query result if available
|
| 228 |
+
tags_used = query_result.get('tags', []) or []
|
| 229 |
+
if isinstance(tags_used, str):
|
| 230 |
+
tags_used = [t.strip() for t in tags_used.split(',') if t.strip()]
|
| 231 |
+
|
| 232 |
examples['most_improved'].append({
|
| 233 |
'query': query_result['query'],
|
| 234 |
'baseline_precision': baseline_df.loc[query_id, 'precision_at_k'],
|
| 235 |
'hybrid_precision': hybrid_df.loc[query_id, 'precision_at_k'],
|
| 236 |
+
'improvement': improvement[query_id],
|
| 237 |
+
'tags_used': tags_used,
|
| 238 |
+
'tag_operator': query_result.get('tag_operator', 'N/A')
|
| 239 |
})
|
| 240 |
|
| 241 |
return examples
|
|
|
|
| 358 |
for example in examples['best_performing']:
|
| 359 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 360 |
html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
|
| 361 |
+
html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}, <strong>nDCG@5:</strong> {example['ndcg_at_k']:.3f}, <strong>MRR:</strong> {example['mrr']:.3f}<br>"
|
| 362 |
+
tags_used = example.get('tags_used', [])
|
| 363 |
+
if tags_used:
|
| 364 |
+
html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
|
| 365 |
+
html += "</div>"
|
| 366 |
|
| 367 |
if examples.get('worst_performing'):
|
| 368 |
html += "<h3>Worst Performing Queries</h3>"
|
| 369 |
for example in examples['worst_performing']:
|
| 370 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 371 |
html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
|
| 372 |
+
html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}, <strong>nDCG@5:</strong> {example['ndcg_at_k']:.3f}, <strong>MRR:</strong> {example['mrr']:.3f}<br>"
|
| 373 |
+
tags_used = example.get('tags_used', [])
|
| 374 |
+
if tags_used:
|
| 375 |
+
html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
|
| 376 |
+
html += "</div>"
|
| 377 |
|
| 378 |
if examples.get('most_improved'):
|
| 379 |
html += "<h3>Most Improved Queries (Hybrid vs Baseline)</h3>"
|
| 380 |
for example in examples['most_improved']:
|
| 381 |
html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
|
| 382 |
+
html += f"<strong>Baseline Precision@5:</strong> {example['baseline_precision']:.3f}<br>"
|
| 383 |
+
html += f"<strong>Hybrid Precision@5:</strong> {example['hybrid_precision']:.3f}<br>"
|
| 384 |
+
html += f"<strong>Improvement:</strong> +{example['improvement']:.3f}<br>"
|
| 385 |
+
tags_used = example.get('tags_used', [])
|
| 386 |
+
if tags_used:
|
| 387 |
+
html += f"<strong>Tags Used:</strong> {', '.join(tags_used)} (Operator: {example.get('tag_operator', 'N/A')})<br>"
|
| 388 |
+
html += "</div>"
|
| 389 |
|
| 390 |
if not html:
|
| 391 |
return "<p>No representative examples available.</p>"
|