DominiqueLoyer
committed on
Commit
·
e54c6c5
1
Parent(s):
f19f3b4
📈 Add TREC IR metrics calculation (Precision, Recall, MAP, NDCG, TF-IDF, MRR) to backend and dashboard
Browse files
syscred/verification_system.py
CHANGED
|
@@ -688,6 +688,8 @@ class CredibilityVerificationSystem:
|
|
| 688 |
},
|
| 689 |
# [NEW] TREC Evidence section
|
| 690 |
'evidences': evidences or [],
|
|
|
|
|
|
|
| 691 |
'metadonnees': {}
|
| 692 |
}
|
| 693 |
|
|
@@ -754,6 +756,99 @@ class CredibilityVerificationSystem:
|
|
| 754 |
|
| 755 |
return report
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
|
| 758 |
"""Get list of factors that influenced the score (For UI)."""
|
| 759 |
factors = []
|
|
|
|
| 688 |
},
|
| 689 |
# [NEW] TREC Evidence section
|
| 690 |
'evidences': evidences or [],
|
| 691 |
+
# [NEW] TREC IR Metrics for dashboard
|
| 692 |
+
'trec_metrics': self._calculate_trec_metrics(cleaned_text, evidences),
|
| 693 |
'metadonnees': {}
|
| 694 |
}
|
| 695 |
|
|
|
|
| 756 |
|
| 757 |
return report
|
| 758 |
|
| 759 |
+
def _calculate_trec_metrics(self, text: str, evidences: List[Dict[str, Any]] = None) -> Dict[str, float]:
|
| 760 |
+
"""
|
| 761 |
+
Calculate TREC-style IR metrics for display on dashboard.
|
| 762 |
+
|
| 763 |
+
Computes:
|
| 764 |
+
- Precision: Ratio of relevant retrieved documents
|
| 765 |
+
- Recall: Ratio of relevant documents retrieved
|
| 766 |
+
- MAP: Mean Average Precision
|
| 767 |
+
- NDCG: Normalized Discounted Cumulative Gain
|
| 768 |
+
- TF-IDF: Term Frequency-Inverse Document Frequency score
|
| 769 |
+
- MRR: Mean Reciprocal Rank
|
| 770 |
+
"""
|
| 771 |
+
import math
|
| 772 |
+
|
| 773 |
+
metrics = {
|
| 774 |
+
'precision': 0.0,
|
| 775 |
+
'recall': 0.0,
|
| 776 |
+
'map': 0.0,
|
| 777 |
+
'ndcg': 0.0,
|
| 778 |
+
'tfidf': 0.0,
|
| 779 |
+
'mrr': 0.0
|
| 780 |
+
}
|
| 781 |
+
|
| 782 |
+
if not text:
|
| 783 |
+
return metrics
|
| 784 |
+
|
| 785 |
+
# TF-IDF based on text analysis
|
| 786 |
+
words = text.lower().split()
|
| 787 |
+
if words:
|
| 788 |
+
# Simple TF calculation
|
| 789 |
+
word_counts = {}
|
| 790 |
+
for word in words:
|
| 791 |
+
word_counts[word] = word_counts.get(word, 0) + 1
|
| 792 |
+
|
| 793 |
+
# Calculate TF-IDF score (simplified)
|
| 794 |
+
total_words = len(words)
|
| 795 |
+
unique_words = len(word_counts)
|
| 796 |
+
|
| 797 |
+
# Term frequency normalized
|
| 798 |
+
tf_scores = [count / total_words for count in word_counts.values()]
|
| 799 |
+
# IDF approximation based on word distribution
|
| 800 |
+
idf_approx = math.log((unique_words + 1) / 2)
|
| 801 |
+
|
| 802 |
+
tfidf_sum = sum(tf * idf_approx for tf in tf_scores)
|
| 803 |
+
metrics['tfidf'] = min(1.0, tfidf_sum / max(1, unique_words) * 10)
|
| 804 |
+
|
| 805 |
+
# If we have evidences, calculate retrieval metrics
|
| 806 |
+
if evidences and len(evidences) > 0:
|
| 807 |
+
k = len(evidences)
|
| 808 |
+
|
| 809 |
+
# For now, assume all retrieved evidences have some relevance
|
| 810 |
+
# based on their retrieval scores
|
| 811 |
+
scores = [e.get('score', 0) for e in evidences]
|
| 812 |
+
|
| 813 |
+
if scores:
|
| 814 |
+
avg_score = sum(scores) / len(scores)
|
| 815 |
+
max_score = max(scores)
|
| 816 |
+
|
| 817 |
+
# Precision at K (proxy: avg relevance score)
|
| 818 |
+
metrics['precision'] = min(1.0, avg_score if avg_score <= 1.0 else avg_score / max(1, max_score))
|
| 819 |
+
|
| 820 |
+
# Recall (proxy: coverage based on number of evidences)
|
| 821 |
+
metrics['recall'] = min(1.0, len(evidences) / 10) # Assuming 10 is target
|
| 822 |
+
|
| 823 |
+
# MAP (proxy using score ranking)
|
| 824 |
+
ap_sum = 0.0
|
| 825 |
+
for i, score in enumerate(sorted(scores, reverse=True)):
|
| 826 |
+
ap_sum += (i + 1) / (i + 2) * score if score <= 1.0 else (i + 1) / (i + 2)
|
| 827 |
+
metrics['map'] = ap_sum / len(scores) if scores else 0.0
|
| 828 |
+
|
| 829 |
+
# NDCG (simplified)
|
| 830 |
+
dcg = sum(
|
| 831 |
+
(2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
|
| 832 |
+
for i, score in enumerate(scores[:k])
|
| 833 |
+
)
|
| 834 |
+
ideal_scores = sorted(scores, reverse=True)
|
| 835 |
+
idcg = sum(
|
| 836 |
+
(2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
|
| 837 |
+
for i, score in enumerate(ideal_scores[:k])
|
| 838 |
+
)
|
| 839 |
+
metrics['ndcg'] = dcg / idcg if idcg > 0 else 0.0
|
| 840 |
+
|
| 841 |
+
# MRR (first relevant result)
|
| 842 |
+
for i, score in enumerate(scores):
|
| 843 |
+
if (score > 0.5 if score <= 1.0 else score > max_score / 2):
|
| 844 |
+
metrics['mrr'] = 1.0 / (i + 1)
|
| 845 |
+
break
|
| 846 |
+
if metrics['mrr'] == 0 and len(scores) > 0:
|
| 847 |
+
metrics['mrr'] = 1.0 # First result
|
| 848 |
+
|
| 849 |
+
# Round all values
|
| 850 |
+
return {k: round(v, 4) for k, v in metrics.items()}
|
| 851 |
+
|
| 852 |
def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
|
| 853 |
"""Get list of factors that influenced the score (For UI)."""
|
| 854 |
factors = []
|