Spaces:
Sleeping
Sleeping
Update evaluation.py
Browse files — evaluation.py (+15 −15)
evaluation.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
# evaluation.py - Evaluation System
|
| 2 |
from typing import List, Dict, Tuple
|
| 3 |
import time
|
| 4 |
import numpy as np
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import json
|
| 7 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
|
| 9 |
@dataclass
|
| 10 |
class Question:
|
|
@@ -318,13 +317,13 @@ class Evaluator:
|
|
| 318 |
# Check version awareness
|
| 319 |
version_score = self._compute_version_score(sources, expected_version)
|
| 320 |
|
| 321 |
-
# Combined score
|
| 322 |
-
total_score = (content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2)
|
| 323 |
|
| 324 |
return {
|
| 325 |
-
'content_score': content_score,
|
| 326 |
-
'version_score': version_score,
|
| 327 |
-
'keyword_score': keyword_score,
|
| 328 |
'total_score': total_score
|
| 329 |
}
|
| 330 |
|
|
@@ -351,7 +350,7 @@ class Evaluator:
|
|
| 351 |
return 0.0
|
| 352 |
|
| 353 |
def _compute_metrics(self, results: List[Dict]) -> Dict:
|
| 354 |
-
"""Compute evaluation metrics"""
|
| 355 |
if not results:
|
| 356 |
return {
|
| 357 |
'accuracy': 0.0,
|
|
@@ -399,15 +398,16 @@ class Evaluator:
|
|
| 399 |
qtype = result['question'].query_type
|
| 400 |
by_type[qtype].append(result['score']['total_score'])
|
| 401 |
|
|
|
|
| 402 |
return {
|
| 403 |
-
'accuracy': np.mean(total_scores) * 100,
|
| 404 |
-
'hit_at_5': np.mean(hits) * 100,
|
| 405 |
-
'mrr': np.mean(reciprocal_ranks),
|
| 406 |
-
'vsa': np.mean(version_scores) * 100, # Version-Sensitive Accuracy
|
| 407 |
'avg_latency': np.mean(latencies) if latencies else 0,
|
| 408 |
'by_type': {
|
| 409 |
-
'content_retrieval': np.mean(by_type['content_retrieval']) * 100 if by_type['content_retrieval'] else 0,
|
| 410 |
-
'version_inquiry': np.mean(by_type['version_inquiry']) * 100 if by_type['version_inquiry'] else 0,
|
| 411 |
-
'change_retrieval': np.mean(by_type['change_retrieval']) * 100 if by_type['change_retrieval'] else 0
|
| 412 |
}
|
| 413 |
}
|
|
|
|
| 1 |
+
# evaluation.py - Evaluation System (WITH SAFETY CAPS)
|
| 2 |
from typing import List, Dict, Tuple
|
| 3 |
import time
|
| 4 |
import numpy as np
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import json
|
|
|
|
| 7 |
|
| 8 |
@dataclass
|
| 9 |
class Question:
|
|
|
|
| 317 |
# Check version awareness
|
| 318 |
version_score = self._compute_version_score(sources, expected_version)
|
| 319 |
|
| 320 |
+
# Combined score with SAFETY CAP ✅
|
| 321 |
+
total_score = min((content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2), 1.0)
|
| 322 |
|
| 323 |
return {
|
| 324 |
+
'content_score': min(content_score, 1.0),
|
| 325 |
+
'version_score': min(version_score, 1.0),
|
| 326 |
+
'keyword_score': min(keyword_score, 1.0),
|
| 327 |
'total_score': total_score
|
| 328 |
}
|
| 329 |
|
|
|
|
| 350 |
return 0.0
|
| 351 |
|
| 352 |
def _compute_metrics(self, results: List[Dict]) -> Dict:
|
| 353 |
+
"""Compute evaluation metrics with SAFETY CAPS ✅"""
|
| 354 |
if not results:
|
| 355 |
return {
|
| 356 |
'accuracy': 0.0,
|
|
|
|
| 398 |
qtype = result['question'].query_type
|
| 399 |
by_type[qtype].append(result['score']['total_score'])
|
| 400 |
|
| 401 |
+
# Return metrics with SAFETY CAPS ✅
|
| 402 |
return {
|
| 403 |
+
'accuracy': min(np.mean(total_scores) * 100, 100.0),
|
| 404 |
+
'hit_at_5': min(np.mean(hits) * 100, 100.0),
|
| 405 |
+
'mrr': min(np.mean(reciprocal_ranks), 1.0),
|
| 406 |
+
'vsa': min(np.mean(version_scores) * 100, 100.0), # Version-Sensitive Accuracy
|
| 407 |
'avg_latency': np.mean(latencies) if latencies else 0,
|
| 408 |
'by_type': {
|
| 409 |
+
'content_retrieval': min(np.mean(by_type['content_retrieval']) * 100, 100.0) if by_type['content_retrieval'] else 0,
|
| 410 |
+
'version_inquiry': min(np.mean(by_type['version_inquiry']) * 100, 100.0) if by_type['version_inquiry'] else 0,
|
| 411 |
+
'change_retrieval': min(np.mean(by_type['change_retrieval']) * 100, 100.0) if by_type['change_retrieval'] else 0
|
| 412 |
}
|
| 413 |
}
|