shahbazdev0 committed on
Commit
ee91b7e
·
verified ·
1 Parent(s): 7ad4566

Update evaluation.py

Browse files
Files changed (1) hide show
  1. evaluation.py +15 -15
evaluation.py CHANGED
@@ -1,10 +1,9 @@
1
- # evaluation.py - Evaluation System
2
  from typing import List, Dict, Tuple
3
  import time
4
  import numpy as np
5
  from dataclasses import dataclass
6
  import json
7
- from sklearn.metrics.pairwise import cosine_similarity
8
 
9
  @dataclass
10
  class Question:
@@ -318,13 +317,13 @@ class Evaluator:
318
  # Check version awareness
319
  version_score = self._compute_version_score(sources, expected_version)
320
 
321
- # Combined score
322
- total_score = (content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2)
323
 
324
  return {
325
- 'content_score': content_score,
326
- 'version_score': version_score,
327
- 'keyword_score': keyword_score,
328
  'total_score': total_score
329
  }
330
 
@@ -351,7 +350,7 @@ class Evaluator:
351
  return 0.0
352
 
353
  def _compute_metrics(self, results: List[Dict]) -> Dict:
354
- """Compute evaluation metrics"""
355
  if not results:
356
  return {
357
  'accuracy': 0.0,
@@ -399,15 +398,16 @@ class Evaluator:
399
  qtype = result['question'].query_type
400
  by_type[qtype].append(result['score']['total_score'])
401
 
 
402
  return {
403
- 'accuracy': np.mean(total_scores) * 100,
404
- 'hit_at_5': np.mean(hits) * 100,
405
- 'mrr': np.mean(reciprocal_ranks),
406
- 'vsa': np.mean(version_scores) * 100, # Version-Sensitive Accuracy
407
  'avg_latency': np.mean(latencies) if latencies else 0,
408
  'by_type': {
409
- 'content_retrieval': np.mean(by_type['content_retrieval']) * 100 if by_type['content_retrieval'] else 0,
410
- 'version_inquiry': np.mean(by_type['version_inquiry']) * 100 if by_type['version_inquiry'] else 0,
411
- 'change_retrieval': np.mean(by_type['change_retrieval']) * 100 if by_type['change_retrieval'] else 0
412
  }
413
  }
 
1
+ # evaluation.py - Evaluation System (WITH SAFETY CAPS)
2
  from typing import List, Dict, Tuple
3
  import time
4
  import numpy as np
5
  from dataclasses import dataclass
6
  import json
 
7
 
8
  @dataclass
9
  class Question:
 
317
  # Check version awareness
318
  version_score = self._compute_version_score(sources, expected_version)
319
 
320
+ # Combined score with SAFETY CAP ✅
321
+ total_score = min((content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2), 1.0)
322
 
323
  return {
324
+ 'content_score': min(content_score, 1.0),
325
+ 'version_score': min(version_score, 1.0),
326
+ 'keyword_score': min(keyword_score, 1.0),
327
  'total_score': total_score
328
  }
329
 
 
350
  return 0.0
351
 
352
  def _compute_metrics(self, results: List[Dict]) -> Dict:
353
+ """Compute evaluation metrics with SAFETY CAPS ✅"""
354
  if not results:
355
  return {
356
  'accuracy': 0.0,
 
398
  qtype = result['question'].query_type
399
  by_type[qtype].append(result['score']['total_score'])
400
 
401
+ # Return metrics with SAFETY CAPS ✅
402
  return {
403
+ 'accuracy': min(np.mean(total_scores) * 100, 100.0),
404
+ 'hit_at_5': min(np.mean(hits) * 100, 100.0),
405
+ 'mrr': min(np.mean(reciprocal_ranks), 1.0),
406
+ 'vsa': min(np.mean(version_scores) * 100, 100.0), # Version-Sensitive Accuracy
407
  'avg_latency': np.mean(latencies) if latencies else 0,
408
  'by_type': {
409
+ 'content_retrieval': min(np.mean(by_type['content_retrieval']) * 100, 100.0) if by_type['content_retrieval'] else 0,
410
+ 'version_inquiry': min(np.mean(by_type['version_inquiry']) * 100, 100.0) if by_type['version_inquiry'] else 0,
411
+ 'change_retrieval': min(np.mean(by_type['change_retrieval']) * 100, 100.0) if by_type['change_retrieval'] else 0
412
  }
413
  }