# contexto-api/src/evaluation.py
# Dev-ks04
# feat: Contexto FastAPI backend - intent-aware summarization engine
# 39028c9
"""
Quality metrics and evaluation for summaries
"""
import logging
from typing import Any, Dict, Optional, Tuple

import torch
from rouge_score import rouge_scorer
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class SummaryEvaluator:
    """Evaluate summary quality using ROUGE scores and confidence metrics.

    Two kinds of scores are produced:

    * reference-based ROUGE-1 / ROUGE-2 / ROUGE-L F-measures when a reference
      summary is supplied, or simple length / diversity / complexity
      heuristics when it is not;
    * a confidence value in ``[0, 1]`` derived from the model's sequence
      score (when available) and the summary length.

    The tuning knobs below are class attributes so that a subclass or an
    instance can override them without touching the methods.
    """

    # Confidence reported when no usable model score is available.
    NEUTRAL_CONFIDENCE = 0.5
    # Word count at which the heuristic ``length_score`` saturates at 1.0.
    TARGET_LENGTH_WORDS = 150
    # Average word length at which ``complexity_score`` saturates at 1.0.
    TARGET_AVG_WORD_LENGTH = 6
    # Inclusive word-count window considered a "reasonable" summary length.
    MIN_REASONABLE_WORDS = 5
    MAX_REASONABLE_WORDS = 200
    # Multiplier applied to the confidence of reasonably sized summaries.
    LENGTH_BONUS = 1.1
    # Strict lower bounds (``>``) of the 'high' and 'medium' quality tiers.
    HIGH_QUALITY_THRESHOLD = 0.7
    MEDIUM_QUALITY_THRESHOLD = 0.5

    def __init__(self):
        """Initialize evaluator with a stemming ROUGE-1/2/L scorer."""
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True,
        )

    @staticmethod
    def _split_words(summary: Optional[str]) -> list:
        """Return the whitespace-separated words of *summary* ([] for None/empty)."""
        return summary.split() if summary else []

    def _heuristic_scores(self, summary: Optional[str]) -> Dict[str, float]:
        """Score a summary without a reference, from its length and vocabulary."""
        words = self._split_words(summary)
        if not words:
            # Nothing to measure; also avoids dividing by a zero word count.
            return {
                'length_score': 0.0,
                'diversity_score': 0.0,
                'complexity_score': 0.0,
            }

        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / word_count
        return {
            'length_score': min(word_count / self.TARGET_LENGTH_WORDS, 1.0),
            'diversity_score': len(set(words)) / word_count,
            'complexity_score': min(
                avg_word_length / self.TARGET_AVG_WORD_LENGTH, 1.0
            ),
        }

    def _model_confidence(self, model_output: Optional[Any]) -> float:
        """Map the model's sequence score to (0, 1), or return the neutral default."""
        # getattr with a default covers model_output=None, objects without the
        # attribute, and objects whose attribute is explicitly None.
        scores = getattr(model_output, 'sequences_scores', None)
        if scores is None or scores.numel() == 0:
            return self.NEUTRAL_CONFIDENCE

        # ``.item()`` only works on single-element tensors, so pick one entry.
        # The first entry is used on the assumption that the summary being
        # scored corresponds to the first returned sequence -- TODO confirm
        # against the caller. ``reshape(-1)`` also handles 0-dim tensors.
        # NOTE(review): if these scores are log-probabilities (<= 0), sigmoid
        # can never exceed 0.5 -- confirm that this mapping is intended.
        return torch.sigmoid(scores.reshape(-1)[0]).item()

    def _quality_label(self, confidence: float) -> str:
        """Bucket a confidence value into 'high', 'medium' or 'low'."""
        if confidence > self.HIGH_QUALITY_THRESHOLD:
            return 'high'
        if confidence > self.MEDIUM_QUALITY_THRESHOLD:
            return 'medium'
        return 'low'

    def calculate_rouge_scores(
        self,
        summary: str,
        reference: Optional[str] = None,
    ) -> Dict[str, float]:
        """
        Calculate ROUGE scores (optional reference).

        Note that the keys of the result depend on whether a reference is
        given: ``rouge1`` / ``rouge2`` / ``rougeL`` with a reference,
        ``length_score`` / ``diversity_score`` / ``complexity_score`` without.

        Args:
            summary: Generated summary
            reference: Reference summary (optional). ``None`` and the empty
                string both select the reference-free heuristics.

        Returns:
            Dictionary with ROUGE F-measures, or with heuristic scores in
            ``[0, 1]`` when no reference is available.
        """
        if not reference:
            return self._heuristic_scores(summary)

        # RougeScorer.score takes the target (reference) first, then the
        # prediction (generated summary).
        scores = self.rouge_scorer.score(reference, summary)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure,
        }

    def get_confidence_score(
        self,
        model_output: Optional[Any],
        summary: str,
    ) -> float:
        """
        Calculate confidence score (0-1).

        Args:
            model_output: Object exposing a ``sequences_scores`` tensor
                (presumably a generation output -- confirm against caller).
                May be ``None``, lack the attribute, or carry ``None`` / an
                empty tensor; all of these yield the neutral default.
            summary: Generated summary

        Returns:
            Confidence score (0-1): the sigmoid of the sequence score (or the
            neutral default), boosted by ``LENGTH_BONUS`` when the summary
            length is within the reasonable window, capped at 1.0.
        """
        confidence = self._model_confidence(model_output)

        word_count = len(self._split_words(summary))
        if self.MIN_REASONABLE_WORDS <= word_count <= self.MAX_REASONABLE_WORDS:
            confidence *= self.LENGTH_BONUS

        # The bonus can push the value above 1.0; clamp it back.
        return min(confidence, 1.0)

    def evaluate_summary(
        self,
        summary: str,
        reference: Optional[str] = None,
        model_output: Optional[Any] = None,
    ) -> Dict[str, Any]:
        """
        Complete evaluation of summary.

        Args:
            summary: Generated summary
            reference: Reference summary (optional)
            model_output: Model output for confidence (optional)

        Returns:
            Dictionary with the keys ``summary`` (the input, unchanged),
            ``rouge_scores`` (see ``calculate_rouge_scores``),
            ``confidence_score``, ``length`` (word count) and ``quality``
            ('high' / 'medium' / 'low', derived from the confidence).
        """
        rouge_scores = self.calculate_rouge_scores(summary, reference)
        confidence = self.get_confidence_score(model_output, summary)
        return {
            'summary': summary,
            'rouge_scores': rouge_scores,
            'confidence_score': confidence,
            'length': len(self._split_words(summary)),
            'quality': self._quality_label(confidence),
        }