Spaces:
Sleeping
Sleeping
Update evaluation.py
Browse files — evaluation.py (+15 −15)
evaluation.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
# evaluation.py - Evaluation System
|
| 2 |
from typing import List, Dict, Tuple
|
| 3 |
import time
|
| 4 |
import numpy as np
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import json
|
| 7 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
|
| 9 |
@dataclass
|
| 10 |
class Question:
|
|
@@ -318,13 +317,13 @@ class Evaluator:
|
|
| 318 |
# Check version awareness
|
| 319 |
version_score = self._compute_version_score(sources, expected_version)
|
| 320 |
|
| 321 |
-
# Combined score
|
| 322 |
-
total_score = (content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2)
|
| 323 |
|
| 324 |
return {
|
| 325 |
-
'content_score': content_score,
|
| 326 |
-
'version_score': version_score,
|
| 327 |
-
'keyword_score': keyword_score,
|
| 328 |
'total_score': total_score
|
| 329 |
}
|
| 330 |
|
|
@@ -351,7 +350,7 @@ class Evaluator:
|
|
| 351 |
return 0.0
|
| 352 |
|
| 353 |
def _compute_metrics(self, results: List[Dict]) -> Dict:
|
| 354 |
-
"""Compute evaluation metrics"""
|
| 355 |
if not results:
|
| 356 |
return {
|
| 357 |
'accuracy': 0.0,
|
|
@@ -399,15 +398,16 @@ class Evaluator:
|
|
| 399 |
qtype = result['question'].query_type
|
| 400 |
by_type[qtype].append(result['score']['total_score'])
|
| 401 |
|
|
|
|
| 402 |
return {
|
| 403 |
-
'accuracy': np.mean(total_scores) * 100,
|
| 404 |
-
'hit_at_5': np.mean(hits) * 100,
|
| 405 |
-
'mrr': np.mean(reciprocal_ranks),
|
| 406 |
-
'vsa': np.mean(version_scores) * 100, # Version-Sensitive Accuracy
|
| 407 |
'avg_latency': np.mean(latencies) if latencies else 0,
|
| 408 |
'by_type': {
|
| 409 |
-
'content_retrieval': np.mean(by_type['content_retrieval']) * 100 if by_type['content_retrieval'] else 0,
|
| 410 |
-
'version_inquiry': np.mean(by_type['version_inquiry']) * 100 if by_type['version_inquiry'] else 0,
|
| 411 |
-
'change_retrieval': np.mean(by_type['change_retrieval']) * 100 if by_type['change_retrieval'] else 0
|
| 412 |
}
|
| 413 |
}
|
|
|
|
| 1 |
+
# evaluation.py - Evaluation System (WITH SAFETY CAPS)
|
| 2 |
from typing import List, Dict, Tuple
|
| 3 |
import time
|
| 4 |
import numpy as np
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import json
|
|
|
|
| 7 |
|
| 8 |
@dataclass
|
| 9 |
class Question:
|
|
|
|
| 317 |
# Check version awareness
|
| 318 |
version_score = self._compute_version_score(sources, expected_version)
|
| 319 |
|
| 320 |
+
# Combined score with SAFETY CAP ✅
|
| 321 |
+
total_score = min((content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2), 1.0)
|
| 322 |
|
| 323 |
return {
|
| 324 |
+
'content_score': min(content_score, 1.0),
|
| 325 |
+
'version_score': min(version_score, 1.0),
|
| 326 |
+
'keyword_score': min(keyword_score, 1.0),
|
| 327 |
'total_score': total_score
|
| 328 |
}
|
| 329 |
|
|
|
|
| 350 |
return 0.0
|
| 351 |
|
| 352 |
def _compute_metrics(self, results: List[Dict]) -> Dict:
|
| 353 |
+
"""Compute evaluation metrics with SAFETY CAPS ✅"""
|
| 354 |
if not results:
|
| 355 |
return {
|
| 356 |
'accuracy': 0.0,
|
|
|
|
| 398 |
qtype = result['question'].query_type
|
| 399 |
by_type[qtype].append(result['score']['total_score'])
|
| 400 |
|
| 401 |
+
# Return metrics with SAFETY CAPS ✅
|
| 402 |
return {
|
| 403 |
+
'accuracy': min(np.mean(total_scores) * 100, 100.0),
|
| 404 |
+
'hit_at_5': min(np.mean(hits) * 100, 100.0),
|
| 405 |
+
'mrr': min(np.mean(reciprocal_ranks), 1.0),
|
| 406 |
+
'vsa': min(np.mean(version_scores) * 100, 100.0), # Version-Sensitive Accuracy
|
| 407 |
'avg_latency': np.mean(latencies) if latencies else 0,
|
| 408 |
'by_type': {
|
| 409 |
+
'content_retrieval': min(np.mean(by_type['content_retrieval']) * 100, 100.0) if by_type['content_retrieval'] else 0,
|
| 410 |
+
'version_inquiry': min(np.mean(by_type['version_inquiry']) * 100, 100.0) if by_type['version_inquiry'] else 0,
|
| 411 |
+
'change_retrieval': min(np.mean(by_type['change_retrieval']) * 100, 100.0) if by_type['change_retrieval'] else 0
|
| 412 |
}
|
| 413 |
}
|