Spaces:

satyaki-mitra
/

AI_Text_Authenticator

Sleeping

File size: 35,205 Bytes

edf1149

# DEPENDENCIES
import re
from typing import List
from typing import Dict
from typing import Tuple
from loguru import logger
from typing import Optional
from dataclasses import dataclass
from config.threshold_config import Domain
from metrics.base_metric import MetricResult
from detector.ensemble import EnsembleResult
from detector.ensemble import EnsembleClassifier
from processors.text_processor import TextProcessor
from config.threshold_config import ConfidenceLevel
from config.threshold_config import MetricThresholds
from config.threshold_config import get_confidence_level
from config.threshold_config import get_threshold_for_domain
from config.threshold_config import get_active_metric_weights


@dataclass
class HighlightedSentence:
    """
    One sentence of the analysed text plus everything needed to render its highlight

    Instances are built by TextHighlighter.generate_highlights and consumed by
    TextHighlighter.generate_html
    """
    # Sentence text (stripped of surrounding whitespace by the highlighter)
    text              : str
    # Probability that the sentence is AI-generated
    ai_probability    : float
    # Probability that the sentence is human-written
    human_probability : float
    # Probability that the sentence mixes AI and human content
    mixed_probability : float
    # Confidence score attached to the probabilities above
    confidence        : float
    # Bucketed form of `confidence`, as returned by get_confidence_level
    confidence_level  : ConfidenceLevel
    # CSS class name used for the highlight (e.g. "high-ai", "mixed-content")
    color_class       : str
    # Multi-line explanatory text placed in the span's title attribute
    tooltip           : str
    # Zero-based position of the sentence among the highlighted sentences
    index             : int
    # True when mixed_probability exceeded TextHighlighter.MIXED_THRESHOLD
    is_mixed_content  : bool
    # Optional per-metric AI probabilities keyed by metric name
    metric_breakdown  : Optional[Dict[str, float]] = None


class TextHighlighter:
    """
    Generates sentence-level highlighting with ensemble results integration
    
    FEATURES:
    - Sentence-level highlighting with confidence scores
    - Domain-aware calibration
    - Ensemble-based probability aggregation
    - Mixed content detection
    - Explainable tooltips
    """
    # Color thresholds with MIXED content support
    COLOR_THRESHOLDS = [(0.00, 0.10, "very-high-human", "#dcfce7", "Very likely human-written"),
                        (0.10, 0.25, "high-human", "#bbf7d0", "Likely human-written"),
                        (0.25, 0.40, "medium-human", "#86efac", "Possibly human-written"),
                        (0.40, 0.60, "uncertain", "#fef9c3", "Uncertain"),
                        (0.60, 0.75, "medium-ai", "#fef3c7", "Possibly AI-generated"),
                        (0.75, 0.90, "high-ai", "#fed7aa", "Likely AI-generated"),
                        (0.90, 1.01, "very-high-ai", "#fecaca", "Very likely AI-generated"),
                       ]
    
    # Mixed content pattern
    MIXED_THRESHOLD = 0.25


    def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
        """
        Prepare a highlighter bound to a single text domain

        Arguments:
        ----------
            domain              { Domain }             : Domain used to look up thresholds and metric weights

            ensemble_classifier { EnsembleClassifier } : Ensemble used for sentence-level scoring; a default
                                                         confidence-calibrated ensemble is created when omitted
        """
        self.text_processor    = TextProcessor()
        self.domain            = domain
        self.domain_thresholds = get_threshold_for_domain(domain)

        # Keep the caller's classifier when one is supplied, otherwise build the default ensemble
        if ensemble_classifier:
            self.ensemble = ensemble_classifier

        else:
            self.ensemble = EnsembleClassifier(primary_method  = "confidence_calibrated",
                                               fallback_method = "domain_weighted",
                                              )
    

    def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult] = None,
                            enabled_metrics: Optional[Dict[str, bool]] = None, use_sentence_level: bool = True) -> List[HighlightedSentence]:
        """
        Build one HighlightedSentence for every usable sentence in the text

        Arguments:
        ----------
            text               { str }            : Text to split into sentences and highlight

            metric_results     { dict }           : Document-level results keyed by metric name

            ensemble_result    { EnsembleResult } : Optional document-level ensemble result

            enabled_metrics    { dict }           : metric_name -> is_enabled; when omitted, every metric
                                                    present in metric_results is treated as enabled

            use_sentence_level { bool }           : Score each sentence individually when True, otherwise
                                                    reuse the document-level scores for every sentence

        Returns:
        --------
                     { list }                     : HighlightedSentence objects in sentence order; empty
                                                    when splitting yields no usable sentence
        """
        # No explicit selection means "everything that was measured is enabled"
        if enabled_metrics is None:
            enabled_metrics = dict.fromkeys(metric_results, True)

        metric_weights = get_active_metric_weights(self.domain, enabled_metrics)
        sentence_list  = self._split_sentences(text)

        if not sentence_list:
            return []

        results = []

        for position, sentence_text in enumerate(sentence_list):
            # Pick the scoring strategy: per-sentence ensemble or shared document-level scores
            if use_sentence_level:
                scores = self._calculate_sentence_ensemble_probability(sentence        = sentence_text,
                                                                       metric_results  = metric_results,
                                                                       weights         = metric_weights,
                                                                       ensemble_result = ensemble_result,
                                                                      )
            else:
                scores = self._get_document_ensemble_probability(ensemble_result = ensemble_result,
                                                                 metric_results  = metric_results,
                                                                 weights         = metric_weights,
                                                                )

            ai_prob, human_prob, mixed_prob, confidence, breakdown = scores

            mixed_flag = (mixed_prob > self.MIXED_THRESHOLD)
            level      = get_confidence_level(confidence)

            # The hex colour is returned by the helper but only the CSS class and label are used here
            css_class, _color_hex, base_label = self._get_color_for_probability(probability      = ai_prob,
                                                                                is_mixed_content = mixed_flag,
                                                                                mixed_prob       = mixed_prob,
                                                                               )

            tooltip_text = self._generate_ensemble_tooltip(sentence         = sentence_text,
                                                           ai_prob          = ai_prob,
                                                           human_prob       = human_prob,
                                                           mixed_prob       = mixed_prob,
                                                           confidence       = confidence,
                                                           confidence_level = level,
                                                           tooltip_base     = base_label,
                                                           breakdown        = breakdown,
                                                           is_mixed_content = mixed_flag,
                                                          )

            entry = HighlightedSentence(text              = sentence_text,
                                        ai_probability    = ai_prob,
                                        human_probability = human_prob,
                                        mixed_probability = mixed_prob,
                                        confidence        = confidence,
                                        confidence_level  = level,
                                        color_class       = css_class,
                                        tooltip           = tooltip_text,
                                        index             = position,
                                        is_mixed_content  = mixed_flag,
                                        metric_breakdown  = breakdown,
                                       )
            results.append(entry)

        return results

    
    def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult], weights: Dict[str, float], 
                                                 ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Calculate sentence probabilities using ensemble methods with domain calibration
        """
        sentence_length = len(sentence.split())
        
        # Skip very short sentences from detailed ensemble analysis
        if (sentence_length < 3):
            return 0.4, 0.5, 0.1, 0.6, {"short_sentence": 0.4}
        
        # Calculate sentence-level metric results
        sentence_metric_results = dict()
        breakdown               = dict()
        
        for name, doc_result in metric_results.items():
            if doc_result.error is None:
                # Compute sentence-level probability for this metric
                sentence_prob                 = self._compute_sentence_metric(metric_name = name,
                                                                              sentence    = sentence,
                                                                              result      = doc_result,
                                                                              weight      = weights.get(name, 0.0),
                                                                             )
                
                # Create sentence-level MetricResult
                sentence_metric_results[name] = self._create_sentence_metric_result(metric_name = name,
                                                                                    ai_prob     = sentence_prob,
                                                                                    doc_result  = doc_result,
                                                                                   )
                
                breakdown[name]               = sentence_prob
        
        # Use ensemble to combine sentence-level metrics
        if sentence_metric_results:
            try:
                ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                                 domain         = self.domain,
                                                                )
                
                return (ensemble_sentence_result.ai_probability, ensemble_sentence_result.human_probability, ensemble_sentence_result.mixed_probability, 
                        ensemble_sentence_result.overall_confidence, breakdown)
                        
            except Exception as e:
                logger.warning(f"Sentence ensemble failed: {e}")
        
        # Fallback: weighted average
        return self._calculate_weighted_probability(metric_results, weights, breakdown)
    

    def _compute_sentence_metric(self, metric_name: str, sentence: str, result: MetricResult, weight: float) -> float:
        """
        Compute metric probability for a single sentence using domain-specific thresholds
        """
        sentence_length   = len(sentence.split())
        
        # Get domain-specific threshold for this metric
        metric_thresholds = getattr(self.domain_thresholds, metric_name, None)

        if not metric_thresholds:
            return result.ai_probability
        
        # Base probability from document-level result
        base_prob         = result.ai_probability
        
        # Apply domain-aware sentence-level adjustments
        adjusted_prob     = self._apply_metric_specific_adjustments(metric_name     = metric_name, 
                                                                    sentence        = sentence, 
                                                                    base_prob       = base_prob, 
                                                                    sentence_length = sentence_length, 
                                                                    thresholds      = metric_thresholds,
                                                                   )
        
        return adjusted_prob
    

    def _create_sentence_metric_result(self, metric_name: str, ai_prob: float, doc_result: MetricResult) -> MetricResult:
        """
        Wrap a sentence-level AI probability in a MetricResult

        The human probability is the complement of ai_prob, the mixed probability is
        zero, the confidence is the down-scaled document confidence and the details
        are shared with the document-level result.
        """
        fields = {"metric_name"       : metric_name,
                  "ai_probability"    : ai_prob,
                  "human_probability" : 1.0 - ai_prob,
                  "mixed_probability" : 0.0,
                  "confidence"        : self._calculate_sentence_confidence(doc_result.confidence),
                  "details"           : doc_result.details,
                  "error"             : None,
                 }

        return MetricResult(**fields)
    

    def _calculate_sentence_confidence(self, doc_confidence: float) -> float:
        """
        Calculate confidence for sentence-level analysis
        """
        # Sentence-level analysis typically has lower confidence
        return max(0.1, doc_confidence * 0.8)
    

    def _calculate_weighted_probability(self, metric_results: Dict[str, MetricResult], weights: Dict[str, float], breakdown: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Fallback weighted probability calculation
        """
        weighted_ai_probs    = list()
        weighted_human_probs = list()
        confidences          = list()
        
        for name, result in metric_results.items():
            if (result.error is None):
                weight = weights.get(name, 0.0)
                
                if (weight > 0):
                    weighted_ai_probs.append(result.ai_probability * weight)
                    weighted_human_probs.append(result.human_probability * weight)
                    confidences.append(result.confidence)
        
        if not weighted_ai_probs:
            return 0.5, 0.5, 0.0, 0.0, {}
        
        total_weight   = sum(weights.values())
        ai_prob        = sum(weighted_ai_probs) / total_weight if total_weight > 0 else 0.5
        human_prob     = sum(weighted_human_probs) / total_weight if total_weight > 0 else 0.5
        mixed_prob     = 0.0  # Fallback
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
        
        return ai_prob, human_prob, mixed_prob, avg_confidence, breakdown
    

    def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult], metric_results: Dict[str, MetricResult],
                                           weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Get document-level ensemble probability
        """
        if ensemble_result:
            # Use existing ensemble result
            breakdown = {name: result.ai_probability for name, result in metric_results.items()}
            return (ensemble_result.ai_probability, ensemble_result.human_probability, ensemble_result.mixed_probability,
                    ensemble_result.overall_confidence, breakdown)
        
        else:
            # Calculate from metrics
            return self._calculate_weighted_probability(metric_results, weights, {})
    

    def _apply_domain_specific_adjustments(self, sentence: str, ai_prob: float, sentence_length: int) -> float:
        """
        Scale an AI probability by a domain-dependent factor and clamp it to [0, 1]

        At most one factor is applied per call; within each domain the first matching
        rule wins. Domains without rules leave the probability untouched (apart from
        clamping).
        """
        domain = self.domain
        factor = None

        if (domain == Domain.CREATIVE):
            if (sentence_length > 30):
                factor = 0.9

            elif (self._has_complex_structure(sentence)):
                factor = 0.85

        elif (domain == Domain.ACADEMIC):
            if (sentence_length > 40):
                factor = 1.1

            elif (self._has_citation_patterns(sentence)):
                factor = 0.8

        elif (domain == Domain.SOCIAL_MEDIA):
            if (sentence_length < 10):
                factor = 0.7

            elif (self._has_informal_language(sentence)):
                factor = 0.8

        elif (domain in (Domain.LEGAL, Domain.MEDICAL)):
            if (self._has_technical_terms(sentence)):
                factor = 1.1

            elif (self._has_ambiguous_phrasing(sentence)):
                factor = 0.9

        if factor is not None:
            ai_prob = ai_prob * factor

        # Factors above 1.0 can push the value past 1, so clamp on the way out
        return max(0.0, min(1.0, ai_prob))
    

    def _apply_metric_specific_adjustments(self, metric_name: str, sentence: str, base_prob: float, sentence_length: int, thresholds: MetricThresholds) -> float:
        """
        Apply metric-specific adjustments
        """
        # Use metrics from ensemble
        if (metric_name == "perplexity"):
            if (sentence_length < 8):
                return min(1.0, base_prob * 1.2)

            elif (sentence_length > 25):
                return max(0.0, base_prob * 0.8)
        
        elif (metric_name == "entropy"):
            words = sentence.split()
            
            if (len(words) > 3):
                unique_words = len(set(words))
                diversity    = unique_words / len(words)

                if (diversity < 0.6):
                    return min(1.0, base_prob * 1.2)

                elif (diversity > 0.8):
                    return max(0.0, base_prob * 0.8)
        
        elif (metric_name == "linguistic"):
            complexity_score = self._analyze_sentence_complexity(sentence)

            if (complexity_score < 0.3):
                return min(1.0, base_prob * 1.1)
            
            elif (complexity_score > 0.7):
                return max(0.0, base_prob * 0.9)
        
        elif (metric_name == "structural"):
            if ((sentence_length < 5) or (sentence_length > 40)):
                return max(0.0, base_prob * 0.8)

            elif (8 <= sentence_length <= 20):
                return min(1.0, base_prob * 1.1)
        
        elif (metric_name == "semantic_analysis"):
            if self._has_repetition(sentence):
                return min(1.0, base_prob * 1.2)
        
        elif (metric_name == "detect_gpt"):
            # DetectGPT adjustments for sentence level
            if (sentence_length > 15):
                return min(1.0, base_prob * 1.1)
        
        return base_prob
    

    def _get_color_for_probability(self, probability: float, is_mixed_content: bool = False, mixed_prob: float = 0.0) -> Tuple[str, str, str]:
        """
        Get color class with mixed content support
        """
        if is_mixed_content and mixed_prob > self.MIXED_THRESHOLD:
            return "mixed-content", "#e9d5ff", f"Mixed AI/Human content ({mixed_prob:.1%} mixed)"
        
        for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
            if (min_thresh <= probability < max_thresh):
                return color_class, color_hex, tooltip
        
        return "uncertain", "#fef9c3", "Uncertain"
    

    def _generate_ensemble_tooltip(self, sentence: str, ai_prob: float, human_prob: float, mixed_prob: float, confidence: float, confidence_level: ConfidenceLevel, 
                                  tooltip_base: str, breakdown: Optional[Dict[str, float]] = None, is_mixed_content: bool = False) -> str:
        """
        Generate enhanced tooltip with ENSEMBLE information
        """
        tooltip = f"{tooltip_base}\n"
        
        if is_mixed_content:
            tooltip += "🔀 MIXED CONTENT DETECTED\n"
        
        tooltip += f"AI Probability: {ai_prob:.1%}\n"
        tooltip += f"Human Probability: {human_prob:.1%}\n"
        tooltip += f"Mixed Probability: {mixed_prob:.1%}\n"
        tooltip += f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})\n"
        tooltip += f"Domain: {self.domain.value.title()}\n"
        tooltip += f"Length: {len(sentence.split())} words"
        
        if breakdown:
            tooltip += "\n\nMetric Breakdown:"
            # Show top 4 metrics
            for metric, prob in list(breakdown.items())[:4]:  
                tooltip += f"\n• {metric}: {prob:.1%}"
        
        tooltip += f"\n\nEnsemble Method: {self.ensemble.primary_method}"
        
        return tooltip


    def _has_citation_patterns(self, sentence: str) -> bool:
        """
        Check for academic citation patterns
        """
        citation_indicators = ['et al.', 'ibid.', 'cf.', 'e.g.', 'i.e.', 'vol.', 'pp.', 'ed.', 'trans.']
        return any(indicator in sentence for indicator in citation_indicators)
    

    def _has_informal_language(self, sentence: str) -> bool:
        """
        Check for informal language patterns
        """
        informal_indicators = ['lol', 'omg', 'btw', 'imo', 'tbh', 'afaik', 'smh', '👋', '😂', '❤️']
        return any(indicator in sentence.lower() for indicator in informal_indicators)
    

    def _has_technical_terms(self, sentence: str) -> bool:
        """
        Check for domain-specific technical terms
        """
        technical_indicators = ['hereinafter', 'whereas', 'aforementioned', 'diagnosis', 'prognosis', 'etiology']
        return any(indicator in sentence.lower() for indicator in technical_indicators)
    

    def _has_ambiguous_phrasing(self, sentence: str) -> bool:
        """
        Check for ambiguous phrasing that might indicate human writing
        """
        ambiguous_indicators = ['perhaps', 'maybe', 'possibly', 'likely', 'appears to', 'seems to']
        return any(indicator in sentence.lower() for indicator in ambiguous_indicators)
    

    def _has_complex_structure(self, sentence: str) -> bool:
        """
        Check if sentence has complex linguistic structure
        """
        words = sentence.split()
        if len(words) < 8:
            return False
        complex_indicators = ['which', 'that', 'although', 'because', 'while', 'when', 'if']
        return any(indicator in sentence.lower() for indicator in complex_indicators)
    

    def _analyze_sentence_complexity(self, sentence: str) -> float:
        """
        Analyze sentence complexity (0 = simple, 1 = complex)
        """
        words = sentence.split()
        if len(words) < 5:
            return 0.2
        
        complexity_indicators = ['although', 'because', 'while', 'when', 'if', 'since', 'unless', 'until', 'which', 'that', 'who', 'whom', 'whose', 'and', 'but', 'or', 'yet', 'so', 'however', 'therefore', 'moreover', 'furthermore', 'nevertheless', ',', ';', ':', '—']
        
        score                 = 0.0
        
        if (len(words) > 15):
            score += 0.3

        elif (len(words) > 25):
            score += 0.5
        
        indicator_count   = sum(1 for indicator in complexity_indicators if indicator in sentence.lower())
        score            += min(0.5, indicator_count * 0.1)
        
        clause_indicators = [',', ';', 'and', 'but', 'or', 'because', 'although']
        clause_count      = sum(1 for indicator in clause_indicators if indicator in sentence.lower())
        score             += min(0.2, clause_count * 0.05)
        
        return min(1.0, score)
    

    def _has_repetition(self, sentence: str) -> bool:
        """
        Check if sentence has word repetition (common in AI text)
        """
        words = sentence.lower().split()
        if len(words) < 6:
            return False
        
        word_counts = dict()

        for word in words:
            if (len(word) > 3):
                word_counts[word] = word_counts.get(word, 0) + 1
        
        repeated_words = [word for word, count in word_counts.items() if count > 2]
        
        return len(repeated_words) > 0
    

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split the text chunk into multiple sentences
        """
        sentences          = self.text_processor.split_sentences(text)
        filtered_sentences = list()

        for sentence in sentences:
            clean_sentence = sentence.strip()
            if (len(clean_sentence) >= 10):
                filtered_sentences.append(clean_sentence)
        
        return filtered_sentences


    def generate_html(self, highlighted_sentences: List[HighlightedSentence], include_legend: bool = True, include_metrics: bool = False) -> str:
        """
        Generate HTML with highlighted sentences
        """
        html_parts = list()
        
        # Add CSS
        html_parts.append(self._generate_enhanced_css())
        
        # Add legend if requested
        if include_legend:
            html_parts.append(self._generate_legend_html())
        
        # Add highlighted text container
        html_parts.append('<div class="highlighted-text">')
        
        for sent in highlighted_sentences:
            extra_class = " mixed-highlight" if sent.is_mixed_content else ""
            html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" '
                            f'data-ai-prob="{sent.ai_probability:.4f}" '
                            f'data-human-prob="{sent.human_probability:.4f}" '
                            f'data-mixed-prob="{sent.mixed_probability:.4f}" '
                            f'data-confidence="{sent.confidence:.4f}" '
                            f'data-confidence-level="{sent.confidence_level.value}" '
                            f'data-domain="{self.domain.value}" '
                            f'data-sentence-idx="{sent.index}" '
                            f'data-is-mixed="{str(sent.is_mixed_content).lower()}" '
                            f'title="{sent.tooltip}">'
                            f'{sent.text}'
                            f'</span> ')
        
        html_parts.append('</div>')
        
        # Add metrics summary if requested
        if include_metrics and highlighted_sentences:
            html_parts.append(self._generate_metrics_summary(highlighted_sentences))
        
        return '\n'.join(html_parts)
    

    def _generate_enhanced_css(self) -> str:
        """
        Return the <style> block used by the highlighted HTML output

        The stylesheet covers the text container, the per-sentence highlight classes
        (AI / uncertain / human / mixed), and the legend and summary panels.

        NOTE(review): .medium-human uses background #ecfccb here, while COLOR_THRESHOLDS
        lists #86efac for the same class - confirm which colour is intended.
        """
        return """
        <style>
        .highlighted-text {
            line-height: 1.8;
            font-size: 16px;
            font-family: 'Georgia', serif;
            padding: 20px;
            background: #ffffff;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }
        
        .highlight {
            padding: 2px 4px;
            margin: 0 1px;
            border-radius: 3px;
            transition: all 0.2s ease;
            cursor: help;
            border-bottom: 2px solid transparent;
        }
        
        .highlight:hover {
            transform: scale(1.02);
            box-shadow: 0 2px 8px rgba(0,0,0,0.15);
            z-index: 10;
            position: relative;
        }
        
        /* AI indicators */
        .very-high-ai {
            background-color: #fecaca;
            border-bottom-color: #ef4444;
        }
        
        .high-ai {
            background-color: #fed7aa;
            border-bottom-color: #f97316;
        }
        
        .medium-ai {
            background-color: #fef3c7;
            border-bottom-color: #f59e0b;
        }
        
        /* Uncertain */
        .uncertain {
            background-color: #fef9c3;
            border-bottom-color: #fbbf24;
        }
        
        /* Human indicators */
        .medium-human {
            background-color: #ecfccb;
            border-bottom-color: #a3e635;
        }
        
        .high-human {
            background-color: #bbf7d0;
            border-bottom-color: #4ade80;
        }
        
        .very-high-human {
            background-color: #dcfce7;
            border-bottom-color: #22c55e;
        }
        
        /* Mixed content */
        .mixed-content {
            background-color: #e9d5ff;
            border-bottom-color: #a855f7;
            background-image: repeating-linear-gradient(45deg, transparent, transparent 5px, rgba(168, 85, 247, 0.1) 5px, rgba(168, 85, 247, 0.1) 10px);
        }
        
        .mixed-highlight:hover {
            border: 2px dashed #a855f7;
        }
        
        /* Legend and summary styles */
        .highlight-legend, .highlight-summary {
            margin-bottom: 20px;
            padding: 15px;
            background: #f9fafb;
            border-radius: 8px;
            border: 1px solid #e5e7eb;
        }
        
        .highlight-legend h4, .highlight-summary h4 {
            margin: 0 0 10px 0;
            font-size: 14px;
            font-weight: 600;
            color: #374151;
        }
        
        .legend-items {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 8px;
        }
        
        .legend-item {
            display: flex;
            align-items: center;
            gap: 8px;
            font-size: 13px;
        }
        
        .legend-color {
            width: 40px;
            height: 20px;
            border-radius: 4px;
            display: inline-block;
        }
        
        .legend-label {
            color: #6b7280;
        }
        
        .summary-stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 10px;
        }
        
        .stat-item {
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 8px 12px;
            background: white;
            border-radius: 6px;
            border: 1px solid #e5e7eb;
        }
        
        .stat-label {
            font-size: 13px;
            color: #6b7280;
        }
        
        .stat-value {
            font-size: 13px;
            font-weight: 600;
            color: #374151;
        }
        </style>
        """
    

    def _generate_metrics_summary(self, sentences: List[HighlightedSentence]) -> str:
        """
        Generate summary statistics for highlighted sentences
        """
        if not sentences:
            return ""
        
        ai_probs            = [s.ai_probability for s in sentences]
        avg_ai_prob         = sum(ai_probs) / len(ai_probs)
        
        # Count sentences by category
        ai_sentences        = len([s for s in sentences if s.ai_probability >= 0.6])
        human_sentences     = len([s for s in sentences if s.ai_probability <= 0.4])
        uncertain_sentences = len([s for s in sentences if 0.4 < s.ai_probability < 0.6])
        mixed_sentences     = len([s for s in sentences if s.is_mixed_content])
        
        html                = f"""
        <div class="highlight-summary">
            <h4>Text Analysis Summary</h4>
            <div class="summary-stats">
                <div class="stat-item">
                    <span class="stat-label">Average AI Probability</span>
                    <span class="stat-value">{avg_ai_prob:.1%}</span>
                </div>
                <div class="stat-item">
                    <span class="stat-label">AI-like Sentences</span>
                    <span class="stat-value">{ai_sentences} ({ai_sentences/len(sentences):.1%})</span>
                </div>
                <div class="stat-item">
                    <span class="stat-label">Human-like Sentences</span>
                    <span class="stat-value">{human_sentences} ({human_sentences/len(sentences):.1%})</span>
                </div>
                <div class="stat-item">
                    <span class="stat-label">Uncertain Sentences</span>
                    <span class="stat-value">{uncertain_sentences} ({uncertain_sentences/len(sentences):.1%})</span>
                </div>
                <div class="stat-item">
                    <span class="stat-label">Mixed Content Sentences</span>
                    <span class="stat-value">{mixed_sentences} ({mixed_sentences/len(sentences):.1%})</span>
                </div>
                <div class="stat-item">
                    <span class="stat-label">Domain</span>
                    <span class="stat-value">{self.domain.value.title()}</span>
                </div>
            </div>
        </div>
        """
        return html
    

    def _generate_legend_html(self) -> str:
        """
        Generate HTML for color legend
        """
        html  = '<div class="highlight-legend">'
        html += '<h4>Detection Legend:</h4>'
        html += '<div class="legend-items">'
        
        # Add mixed content legend item
        html += (f'<div class="legend-item">'
                  f'<span class="legend-color mixed-content"></span>'
                  f'<span class="legend-label">Mixed AI/Human Content</span>'
                  f'</div>'
                 )
        
        for min_t, max_t, color_class, color_hex, label in self.COLOR_THRESHOLDS:
            range_text = f"{min_t:.0%}-{max_t:.0%}" if max_t < 1.01 else f"{min_t:.0%}+"
            html      += (f'<div class="legend-item">'
                          f'<span class="legend-color {color_class}"></span>'
                          f'<span class="legend-label">{label} ({range_text})</span>'
                          f'</div>'
                         )
        
        html += '</div></div>'
        return html


# Public API of this module: names exposed by `from <module> import *`
__all__ = ["TextHighlighter", 
           "HighlightedSentence",
          ]