File size: 3,840 Bytes
39028c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Quality metrics and evaluation for summaries
"""

import logging
from typing import Any, Dict, Optional, Tuple

import torch
from rouge_score import rouge_scorer

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class SummaryEvaluator:
    """Evaluate summary quality using ROUGE scores and confidence metrics.

    Two kinds of signals are produced:

    * ROUGE-1/2/L F-measures when a reference summary is supplied, or
      reference-free heuristics (length, lexical diversity, word length)
      when it is not.
    * A confidence score in [0, 1] derived from the model's
      ``sequences_scores`` (when available) and the summary length.
    """

    # Confidence used when no usable model scores are available.
    _NEUTRAL_CONFIDENCE = 0.5
    # Multiplier applied when the summary length is within the range below.
    _LENGTH_BONUS = 1.1
    _MIN_REASONABLE_WORDS = 5
    _MAX_REASONABLE_WORDS = 200
    # Word count / average word length at which the heuristics saturate at 1.0.
    _TARGET_WORD_COUNT = 150
    _TARGET_WORD_LENGTH = 6

    def __init__(self):
        """Initialize evaluator with a stemming ROUGE-1/2/L scorer."""
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )

    def calculate_rouge_scores(
        self,
        summary: str,
        reference: Optional[str] = None
    ) -> Dict[str, float]:
        """
        Calculate ROUGE scores, or reference-free heuristics.

        Args:
            summary: Generated summary
            reference: Reference summary (optional). ``None`` and the empty
                       string are both treated as "no reference".

        Returns:
            With a reference: ``rouge1``, ``rouge2`` and ``rougeL``
            F-measures. Without a reference: ``length_score``,
            ``diversity_score`` and ``complexity_score`` heuristics, each
            in [0, 1]. Note that the two modes return different keys.
        """
        if not reference:
            words = summary.split()
            if not words:
                # Nothing to measure; avoids dividing by zero below.
                return {
                    'length_score': 0.0,
                    'diversity_score': 0.0,
                    'complexity_score': 0.0
                }

            word_count = len(words)
            avg_word_length = sum(len(w) for w in words) / word_count
            return {
                'length_score': min(word_count / self._TARGET_WORD_COUNT, 1.0),
                'diversity_score': len(set(words)) / word_count,
                'complexity_score': min(
                    avg_word_length / self._TARGET_WORD_LENGTH, 1.0
                )
            }

        # RougeScorer.score takes (target, prediction) in that order.
        scores = self.rouge_scorer.score(reference, summary)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure
        }

    def get_confidence_score(
        self,
        model_output: Optional[torch.Tensor],
        summary: str
    ) -> float:
        """
        Calculate confidence score (0-1).

        Args:
            model_output: Model output object. Only its ``sequences_scores``
                          attribute is read; anything without that attribute
                          (including ``None`` or a plain tensor) yields the
                          neutral default. May be None when the caller has
                          no direct access to model outputs.
            summary: Generated summary

        Returns:
            Confidence score (0-1)
        """
        confidence = self._NEUTRAL_CONFIDENCE

        # getattr covers model_output=None, objects lacking the attribute and
        # objects whose attribute is present but set to None.
        raw_scores = getattr(model_output, 'sequences_scores', None)
        if raw_scores is not None:
            scores = torch.as_tensor(raw_scores)
            # numel() (unlike len()) works for 0-d tensors, and mean()
            # (unlike a bare .item()) works when several sequences were
            # scored; for a single score the result is unchanged.
            if scores.numel() > 0:
                # NOTE(review): if sequences_scores are log-probabilities
                # (<= 0), sigmoid caps this at 0.5 -- confirm that sigmoid,
                # rather than exp, is the intended mapping.
                confidence = torch.sigmoid(scores).mean().item()

        # Small bonus for summaries of a reasonable length.
        word_count = len(summary.split())
        if self._MIN_REASONABLE_WORDS <= word_count <= self._MAX_REASONABLE_WORDS:
            confidence *= self._LENGTH_BONUS

        return min(confidence, 1.0)

    @staticmethod
    def _quality_label(confidence: float) -> str:
        """Map a confidence score to 'high' (> 0.7), 'medium' (> 0.5) or 'low'."""
        if confidence > 0.7:
            return 'high'
        if confidence > 0.5:
            return 'medium'
        return 'low'

    def evaluate_summary(
        self,
        summary: str,
        reference: Optional[str] = None,
        model_output: Optional[torch.Tensor] = None
    ) -> Dict[str, Any]:
        """
        Complete evaluation of summary.

        Args:
            summary: Generated summary
            reference: Reference summary (optional)
            model_output: Model output for confidence (optional)

        Returns:
            Dictionary with the summary text, its score dictionary
            (``rouge_scores``), ``confidence_score``, word count
            (``length``) and a ``quality`` label derived from confidence.
        """
        rouge_scores = self.calculate_rouge_scores(summary, reference)
        confidence = self.get_confidence_score(model_output, summary)

        return {
            'summary': summary,
            'rouge_scores': rouge_scores,
            'confidence_score': confidence,
            'length': len(summary.split()),
            'quality': self._quality_label(confidence)
        }