# DEPENDENCIES
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from loguru import logger

from config.threshold_config import (Domain,
                                     ConfidenceLevel,
                                     MetricThresholds,
                                     get_confidence_level,
                                     get_threshold_for_domain,
                                     get_active_metric_weights)
from detector.ensemble import EnsembleClassifier, EnsembleResult
from metrics.base_metric import MetricResult
from processors.text_processor import TextProcessor


@dataclass
class HighlightedSentence:
    """
    A sentence with highlighting information
    """
    text              : str
    ai_probability    : float
    human_probability : float
    mixed_probability : float
    confidence        : float
    confidence_level  : ConfidenceLevel
    color_class       : str
    tooltip           : str
    index             : int
    is_mixed_content  : bool
    metric_breakdown  : Optional[Dict[str, float]] = None


class TextHighlighter:
    """
    Generates sentence-level highlighting with ensemble results integration

    FEATURES:
    - Sentence-level highlighting with confidence scores
    - Domain-aware calibration
    - Ensemble-based probability aggregation
    - Mixed content detection
    - Explainable tooltips
    - Highlighting metrics calculation
    """
    # Color thresholds with mixed content support - FIXED: No gaps
    COLOR_THRESHOLDS = [(0.00, 0.10, "very-high-human", "#dcfce7", "Very likely human-written"),
                        (0.10, 0.25, "high-human", "#bbf7d0", "Likely human-written"),
                        (0.25, 0.40, "medium-human", "#86efac", "Possibly human-written"),
                        (0.40, 0.60, "uncertain", "#fef9c3", "Uncertain"),
                        (0.60, 0.75, "medium-ai", "#fde68a", "Possibly AI-generated"),
                        (0.75, 0.90, "high-ai", "#fed7aa", "Likely AI-generated"),
                        (0.90, 1.00, "very-high-ai", "#fecaca", "Very likely AI-generated"),
                        ]

    # Mixed content threshold
    MIXED_THRESHOLD = 0.25

    # Risk weights
    RISK_WEIGHTS = {'very-high-ai'    : 1.0,
                    'high-ai'         : 0.8,
                    'medium-ai'       : 0.6,
                    'uncertain'       : 0.4,
                    'medium-human'    : 0.2,
                    'high-human'      : 0.1,
                    'very-high-human' : 0.0,
                    'mixed-content'   : 0.7,
                    }

    def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
        """
        Initialize text highlighter with ENSEMBLE INTEGRATION

        Arguments:
        ----------
            domain              { Domain }             : Text domain for adaptive thresholding
            ensemble_classifier { EnsembleClassifier } : Optional ensemble for sentence-level analysis
        """
        self.text_processor    = TextProcessor()
        self.domain            = domain
        self.domain_thresholds = get_threshold_for_domain(domain)
        self.ensemble          = ensemble_classifier or self._create_default_ensemble()

    def _create_default_ensemble(self) -> EnsembleClassifier:
        """
        Create default ensemble classifier with proper error handling
        """
        try:
            return EnsembleClassifier(primary_method  = "confidence_calibrated",
                                      fallback_method = "domain_weighted",
                                      )

        except Exception as e:
            logger.warning(f"Failed to create default ensemble: {e}. Using fallback mode.")
            # Return a minimal ensemble or raise based on requirements
            return EnsembleClassifier(primary_method = "weighted_average")

    def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult],
                            ensemble_result: Optional[EnsembleResult] = None,
                            enabled_metrics: Optional[Dict[str, bool]] = None,
                            use_sentence_level: bool = True) -> List[HighlightedSentence]:
        """
        Generate sentence-level highlights with ensemble integration

        Arguments:
        ----------
            text               { str }            : Original text
            metric_results     { dict }           : Results from all 6 metrics
            ensemble_result    { EnsembleResult } : Optional document-level ensemble result
            enabled_metrics    { dict }           : Dict of metric_name -> is_enabled
            use_sentence_level { bool }           : Whether to compute sentence-level probabilities

        Returns:
        --------
            { list } : List of HighlightedSentence objects
        """
        try:
            # Validate inputs
            if not text or not text.strip():
                return self._handle_empty_text(text, metric_results, ensemble_result)

            # Get domain-appropriate weights for enabled metrics
            if enabled_metrics is None:
                enabled_metrics = {name: True for name in metric_results.keys()}

            weights = get_active_metric_weights(self.domain, enabled_metrics)

            # Split text into sentences with error handling
            sentences = self._split_sentences_with_fallback(text)

            if not sentences:
                return self._handle_no_sentences(text, metric_results, ensemble_result)

            # Calculate probabilities for each sentence using ENSEMBLE METHODS
            highlighted_sentences = list()

            for idx, sentence in enumerate(sentences):
                try:
                    if use_sentence_level:
                        # Use ENSEMBLE for sentence-level analysis
                        ai_prob, human_prob, mixed_prob, confidence, breakdown = \
                            self._calculate_sentence_ensemble_probability(sentence        = sentence,
                                                                          metric_results  = metric_results,
                                                                          weights         = weights,
                                                                          ensemble_result = ensemble_result,
                                                                          )
                    else:
                        # Use document-level ensemble probabilities
                        ai_prob, human_prob, mixed_prob, confidence, breakdown = \
                            self._get_document_ensemble_probability(ensemble_result = ensemble_result,
                                                                    metric_results  = metric_results,
                                                                    weights         = weights,
                                                                    )

                    # Apply domain-specific adjustments with limits
                    ai_prob = self._apply_domain_specific_adjustments(sentence        = sentence,
                                                                      ai_prob         = ai_prob,
                                                                      sentence_length = len(sentence.split()),
                                                                      )

                    # Determine if this is mixed content
                    is_mixed_content = (mixed_prob > self.MIXED_THRESHOLD)

                    # Get confidence level
                    confidence_level = get_confidence_level(confidence)

                    # Get color class (consider mixed content)
                    color_class, color_hex, tooltip_base = self._get_color_for_probability(probability      = ai_prob,
                                                                                           is_mixed_content = is_mixed_content,
                                                                                           mixed_prob       = mixed_prob,
                                                                                           )

                    # Generate enhanced tooltip
                    tooltip = self._generate_ensemble_tooltip(sentence         = sentence,
                                                              ai_prob          = ai_prob,
                                                              human_prob       = human_prob,
                                                              mixed_prob       = mixed_prob,
                                                              confidence       = confidence,
                                                              confidence_level = confidence_level,
                                                              tooltip_base     = tooltip_base,
                                                              breakdown        = breakdown,
                                                              is_mixed_content = is_mixed_content,
                                                              )

                    highlighted_sentences.append(HighlightedSentence(text              = sentence,
                                                                     ai_probability    = ai_prob,
                                                                     human_probability = human_prob,
                                                                     mixed_probability = mixed_prob,
                                                                     confidence        = confidence,
                                                                     confidence_level  = confidence_level,
                                                                     color_class       = color_class,
                                                                     tooltip           = tooltip,
                                                                     index             = idx,
                                                                     is_mixed_content  = is_mixed_content,
                                                                     metric_breakdown  = breakdown,
                                                                     )
                                                 )

                except Exception as e:
                    logger.warning(f"Failed to process sentence {idx}: {e}")
                    # Add fallback sentence
                    highlighted_sentences.append(self._create_fallback_sentence(sentence, idx))

            return highlighted_sentences

        except Exception as e:
            logger.error(f"Highlight generation failed: {e}")
            return self._create_error_fallback(text, metric_results)

    def _handle_empty_text(self, text: str, metric_results: Dict[str, MetricResult],
                           ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentence]:
        """
        Handle empty input text
        """
        if ensemble_result:
            return [self._create_fallback_sentence(text       = "No text content",
                                                   index      = 0,
                                                   ai_prob    = ensemble_result.ai_probability,
                                                   human_prob = ensemble_result.human_probability,
                                                   )
                    ]

        return [self._create_fallback_sentence("No text content", 0)]

    def _handle_no_sentences(self, text: str, metric_results: Dict[str, MetricResult],
                             ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentence]:
        """
        Handle case where no sentences could be extracted
        """
        if (text and (len(text.strip()) > 0)):
            # Treat entire text as one sentence
            return [self._create_fallback_sentence(text.strip(), 0)]

        return [self._create_fallback_sentence("No processable content", 0)]

    def _create_fallback_sentence(self, text: str, index: int, ai_prob: float = 0.5,
                                  human_prob: float = 0.5) -> HighlightedSentence:
        """
        Create a fallback sentence when processing fails
        """
        confidence_level = get_confidence_level(0.3)

        color_class, _, tooltip_base = self._get_color_for_probability(probability      = ai_prob,
                                                                       is_mixed_content = False,
                                                                       mixed_prob       = 0.0,
                                                                       )

        return HighlightedSentence(text              = text,
                                   ai_probability    = ai_prob,
                                   human_probability = human_prob,
                                   mixed_probability = 0.0,
                                   confidence        = 0.3,
                                   confidence_level  = confidence_level,
                                   color_class       = color_class,
                                   tooltip           = f"Fallback: {tooltip_base}\nProcessing failed for this sentence",
                                   index             = index,
                                   is_mixed_content  = False,
                                   metric_breakdown  = {"fallback": ai_prob},
                                   )

    def _create_error_fallback(self, text: str, metric_results: Dict[str, MetricResult]) -> List[HighlightedSentence]:
        """
        Create fallback when entire processing fails
        """
        return [HighlightedSentence(text              = text[:100] + "..." if len(text) > 100 else text,
                                    ai_probability    = 0.5,
                                    human_probability = 0.5,
                                    mixed_probability = 0.0,
                                    confidence        = 0.1,
                                    confidence_level  = get_confidence_level(0.1),
                                    color_class       = "uncertain",
                                    tooltip           = "Error in text processing",
                                    index             = 0,
                                    is_mixed_content  = False,
                                    metric_breakdown  = {"error": 0.5},
                                    )
                ]

    def _split_sentences_with_fallback(self, text: str) -> List[str]:
        """
        Split text into sentences with comprehensive fallback handling
        """
        try:
            sentences          = self.text_processor.split_sentences(text)
            filtered_sentences = [s.strip() for s in sentences if len(s.strip()) >= 3]

            if filtered_sentences:
                return filtered_sentences

            # Fallback: split by common sentence endings
            fallback_sentences = re.split(r'[.!?]+', text)
            fallback_sentences = [s.strip() for s in fallback_sentences if len(s.strip()) >= 3]

            if fallback_sentences:
                return fallback_sentences

            # Ultimate fallback: treat as single sentence if meaningful
            if text.strip():
                return [text.strip()]

            return []

        except Exception as e:
            logger.warning(f"Sentence splitting failed, using fallback: {e}")
            # Return text as single sentence
            return [text] if text.strip() else []

    def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult],
                                                 weights: Dict[str, float],
                                                 ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Calculate sentence probabilities using ensemble methods with domain calibration
        """
        sentence_length = len(sentence.split())

        # Handling short sentences - don't force neutral
        if (sentence_length < 3):
            base_ai_prob    = 0.5
            base_confidence = 0.2  # Low confidence for very short sentences
            breakdown       = {"short_sentence": base_ai_prob}

            # Try to get some signal from available metrics
            for name, result in metric_results.items():
                if ((result.error is None) and (weights.get(name, 0) > 0)):
                    base_ai_prob    = result.ai_probability
                    breakdown[name] = base_ai_prob
                    break

            return base_ai_prob, 1.0 - base_ai_prob, 0.0, base_confidence, breakdown

        # Calculate sentence-level metric results
        sentence_metric_results = dict()
        breakdown               = dict()

        for name, doc_result in metric_results.items():
            if doc_result.error is None:
                try:
                    # Compute sentence-level probability for this metric
                    sentence_prob = self._compute_sentence_metric(metric_name = name,
                                                                  sentence    = sentence,
                                                                  result      = doc_result,
                                                                  weight      = weights.get(name, 0.0),
                                                                  )

                    # Create sentence-level MetricResult
                    sentence_metric_results[name] = self._create_sentence_metric_result(metric_name     = name,
                                                                                        ai_prob         = sentence_prob,
                                                                                        doc_result      = doc_result,
                                                                                        sentence_length = sentence_length,
                                                                                        )

                    breakdown[name] = sentence_prob

                except Exception as e:
                    logger.warning(f"Metric {name} failed for sentence: {e}")
                    # Use document probability as fallback
                    breakdown[name] = doc_result.ai_probability

        # Use ensemble to combine sentence-level metrics
        if sentence_metric_results:
            try:
                ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                                 domain         = self.domain,
                                                                 )

                return (ensemble_sentence_result.ai_probability,
                        ensemble_sentence_result.human_probability,
                        ensemble_sentence_result.mixed_probability,
                        ensemble_sentence_result.overall_confidence,
                        breakdown)

            except Exception as e:
                logger.warning(f"Sentence ensemble failed: {e}")

        # Fallback: weighted average
        return self._calculate_weighted_probability(metric_results, weights, breakdown)
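
    # Worked trace of the short-sentence gate above (hypothetical numbers):
    #
    #   highlighter._calculate_sentence_ensemble_probability("Thanks again.", metric_results, weights)
    #   # -> (0.72, 0.28, 0.0, 0.2, {"short_sentence": 0.5, "perplexity": 0.72})
    #
    # With fewer than 3 words, no per-metric rescoring happens: the first enabled,
    # error-free metric's document-level probability (here perplexity = 0.72)
    # passes through, and confidence is pinned low at 0.2.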

    def _compute_sentence_metric(self, metric_name: str, sentence: str, result: MetricResult, weight: float) -> float:
        """
        Compute metric probability for a single sentence using domain-specific thresholds
        """
        sentence_length = len(sentence.split())

        # Get domain-specific threshold for this metric
        metric_thresholds = getattr(self.domain_thresholds, metric_name, None)

        if not metric_thresholds:
            return result.ai_probability

        # Base probability from document-level result
        base_prob = result.ai_probability

        # Apply domain-aware sentence-level adjustments
        adjusted_prob = self._apply_metric_specific_adjustments(metric_name     = metric_name,
                                                                sentence        = sentence,
                                                                base_prob       = base_prob,
                                                                sentence_length = sentence_length,
                                                                thresholds      = metric_thresholds,
                                                                )

        return adjusted_prob

    def _create_sentence_metric_result(self, metric_name: str, ai_prob: float, doc_result: MetricResult,
                                       sentence_length: int) -> MetricResult:
        """
        Create sentence-level MetricResult from document-level result
        """
        # IMPROVED: Calculate confidence based on sentence characteristics
        sentence_confidence = self._calculate_sentence_confidence(doc_result.confidence, sentence_length)

        return MetricResult(metric_name       = metric_name,
                            ai_probability    = ai_prob,
                            human_probability = 1.0 - ai_prob,
                            mixed_probability = 0.0,
                            confidence        = sentence_confidence,
                            details           = doc_result.details,
                            error             = None,
                            )

    def _calculate_sentence_confidence(self, doc_confidence: float, sentence_length: int) -> float:
        """
        IMPROVED: Calculate confidence for sentence-level analysis with length consideration
        """
        base_reduction = 0.8

        # Scale confidence penalty with sentence length
        length_penalty = max(0.3, min(1.0, sentence_length / 12.0))  # Normalize around 12 words

        return max(0.1, doc_confidence * base_reduction * length_penalty)

    def _calculate_weighted_probability(self, metric_results: Dict[str, MetricResult], weights: Dict[str, float],
                                        breakdown: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Fallback weighted probability calculation
        """
        weighted_ai_probs    = list()
        weighted_human_probs = list()
        confidences          = list()
        total_weight         = 0.0

        for name, result in metric_results.items():
            if (result.error is None):
                weight = weights.get(name, 0.0)

                if (weight > 0):
                    weighted_ai_probs.append(result.ai_probability * weight)
                    weighted_human_probs.append(result.human_probability * weight)
                    confidences.append(result.confidence)
                    total_weight += weight

        if ((not weighted_ai_probs) or (total_weight == 0)):
            return 0.5, 0.5, 0.0, 0.5, breakdown or {}

        ai_prob        = sum(weighted_ai_probs) / total_weight
        human_prob     = sum(weighted_human_probs) / total_weight
        mixed_prob     = 0.0  # Fallback
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5

        return ai_prob, human_prob, mixed_prob, avg_confidence, breakdown

    def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult],
                                           metric_results: Dict[str, MetricResult],
                                           weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Get document-level ensemble probability
        """
        if ensemble_result:
            # Use existing ensemble result
            breakdown = {name: result.ai_probability for name, result in metric_results.items()}

            return (ensemble_result.ai_probability,
                    ensemble_result.human_probability,
                    ensemble_result.mixed_probability,
                    ensemble_result.overall_confidence,
                    breakdown)

        # Calculate from metrics
        return self._calculate_weighted_probability(metric_results, weights, {})
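
    # Worked example of the weighted fallback in _calculate_weighted_probability
    # above (hypothetical weights and scores): with weights
    # {"perplexity": 0.6, "entropy": 0.4}, ai probabilities 0.8 and 0.5, and
    # confidences 0.7 and 0.6:
    #
    #   ai_prob        = (0.8*0.6 + 0.5*0.4) / 1.0 = 0.68
    #   human_prob     = (0.2*0.6 + 0.5*0.4) / 1.0 = 0.32
    #   avg_confidence = (0.7 + 0.6) / 2           = 0.65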

    def _apply_domain_specific_adjustments(self, sentence: str, ai_prob: float, sentence_length: int) -> float:
        """
        Apply domain-specific adjustments to AI probability with limits
        """
        original_prob  = ai_prob
        adjustments    = list()
        sentence_lower = sentence.lower()

        # Technical & AI/ML domains
        if (self.domain in [Domain.AI_ML, Domain.SOFTWARE_DEV, Domain.TECHNICAL_DOC, Domain.ENGINEERING, Domain.SCIENCE]):
            if self._has_technical_terms(sentence_lower):
                adjustments.append(1.1)
            elif self._has_code_like_patterns(sentence):
                adjustments.append(1.15)
            elif (sentence_length > 35):
                adjustments.append(1.05)

        # Creative & informal domains
        elif (self.domain in [Domain.CREATIVE, Domain.SOCIAL_MEDIA, Domain.BLOG_PERSONAL]):
            if self._has_informal_language(sentence_lower):
                adjustments.append(0.7)
            elif self._has_emotional_language(sentence):
                adjustments.append(0.8)
            elif (sentence_length < 10):
                adjustments.append(0.8)

        # Academic & formal domains
        elif (self.domain in [Domain.ACADEMIC, Domain.LEGAL, Domain.MEDICAL]):
            if self._has_citation_patterns(sentence):
                adjustments.append(0.8)
            elif self._has_technical_terms(sentence_lower):
                adjustments.append(1.1)
            elif (sentence_length > 40):
                adjustments.append(1.1)

        # Business & professional domains
        elif (self.domain in [Domain.BUSINESS, Domain.MARKETING, Domain.JOURNALISM]):
            if self._has_business_jargon(sentence_lower):
                adjustments.append(1.05)
            elif self._has_ambiguous_phrasing(sentence_lower):
                adjustments.append(0.9)
            elif (15 <= sentence_length <= 25):
                adjustments.append(0.9)

        # Tutorial & educational domains
        elif (self.domain == Domain.TUTORIAL):
            if self._has_instructional_language(sentence_lower):
                adjustments.append(0.85)
            elif self._has_step_by_step_pattern(sentence):
                adjustments.append(0.8)
            elif self._has_examples(sentence):
                adjustments.append(0.9)

        # General domain - minimal adjustments
        elif (self.domain == Domain.GENERAL):
            if self._has_complex_structure(sentence):
                adjustments.append(0.9)
            elif self._has_repetition(sentence):
                adjustments.append(1.1)

        # Apply at most the 2 strongest adjustments (with the if/elif chains
        # above, each call collects at most one, so the cap is defensive)
        if adjustments:
            # Sort by impact (farthest from 1.0)
            adjustments.sort(key = lambda x: abs(x - 1.0), reverse = True)

            for adjustment in adjustments[:2]:
                ai_prob *= adjustment

        # Ensure probability stays within bounds and doesn't change too drastically:
        # maximum 30% change from original
        max_change   = 0.3
        bounded_prob = max(original_prob - max_change, min(original_prob + max_change, ai_prob))

        return max(0.0, min(1.0, bounded_prob))

    def _apply_metric_specific_adjustments(self, metric_name: str, sentence: str, base_prob: float,
                                           sentence_length: int, thresholds: MetricThresholds) -> float:
        """
        Apply metric-specific adjustments
        """
        if (metric_name == "perplexity"):
            if (sentence_length < 8):
                return min(1.0, base_prob * 1.2)
            elif (sentence_length > 25):
                return max(0.0, base_prob * 0.8)

        elif (metric_name == "entropy"):
            words = sentence.split()

            if (len(words) > 3):
                unique_words = len(set(words))
                diversity    = unique_words / len(words)

                if (diversity < 0.6):
                    return min(1.0, base_prob * 1.2)
                elif (diversity > 0.8):
                    return max(0.0, base_prob * 0.8)

        elif (metric_name == "linguistic"):
            complexity_score = self._analyze_sentence_complexity(sentence)

            if (complexity_score < 0.3):
                return min(1.0, base_prob * 1.1)
            elif (complexity_score > 0.7):
                return max(0.0, base_prob * 0.9)

        elif (metric_name == "structural"):
            if ((sentence_length < 5) or (sentence_length > 40)):
                return max(0.0, base_prob * 0.8)
            elif (8 <= sentence_length <= 20):
                return min(1.0, base_prob * 1.1)

        elif (metric_name == "semantic_analysis"):
            if self._has_repetition(sentence):
                return min(1.0, base_prob * 1.2)

        elif (metric_name == "multi_perturbation_stability"):
            # MultiPerturbationStability adjustments for sentence level
            if (sentence_length > 15):
                return min(1.0, base_prob * 1.1)

        return base_prob
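
    # Worked example of the capping in _apply_domain_specific_adjustments above
    # (hypothetical inputs): in a technical domain, a sentence with technical
    # terms collects the single adjustment [1.1]; starting from ai_prob = 0.62,
    # 0.62 * 1.1 = 0.682, which is inside the ±0.30 band around 0.62 and is
    # returned as-is. Starting instead from 0.90 with a 1.15 boost, 1.035 stays
    # inside the band (top = 1.20) but is then clamped to the [0, 1] range, so
    # the method returns 1.0.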

    def _get_color_for_probability(self, probability: float, is_mixed_content: bool = False,
                                   mixed_prob: float = 0.0) -> Tuple[str, str, str]:
        """
        Get color class with mixed content support and no threshold gaps
        """
        # Handle probability = 1.0 explicitly
        if (probability >= 1.0):
            return "very-high-ai", "#fecaca", "Very likely AI-generated (100%)"

        # Check mixed content first
        if (is_mixed_content and (mixed_prob > self.MIXED_THRESHOLD)):
            return "mixed-content", "#e9d5ff", f"Mixed AI/Human content ({mixed_prob:.1%} mixed)"

        # Iterate through the contiguous threshold bands
        for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
            if (min_thresh <= probability < max_thresh):
                return color_class, color_hex, tooltip

        # Fallback for probability = 1.0 (should be caught above, but just in case)
        return "very-high-ai", "#fecaca", "Very likely AI-generated"

    def _generate_ensemble_tooltip(self, sentence: str, ai_prob: float, human_prob: float, mixed_prob: float,
                                   confidence: float, confidence_level: ConfidenceLevel, tooltip_base: str,
                                   breakdown: Optional[Dict[str, float]] = None,
                                   is_mixed_content: bool = False) -> str:
        """
        Generate enhanced tooltip with ENSEMBLE information
        """
        tooltip = f"{tooltip_base}\n"

        if is_mixed_content:
            tooltip += "šŸ”€ MIXED CONTENT DETECTED\n"

        tooltip += f"AI Probability: {ai_prob:.1%}\n"
        tooltip += f"Human Probability: {human_prob:.1%}\n"
        tooltip += f"Mixed Probability: {mixed_prob:.1%}\n"
        tooltip += f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})\n"
        tooltip += f"Domain: {self.domain.value.replace('_', ' ').title()}\n"
        tooltip += f"Length: {len(sentence.split())} words"

        if breakdown:
            tooltip += "\n\nMetric Breakdown:"

            # Show top 4 metrics
            for metric, prob in list(breakdown.items())[:4]:
                tooltip += f"\n• {metric}: {prob:.1%}"

        tooltip += f"\n\nEnsemble Method: {getattr(self.ensemble, 'primary_method', 'fallback')}"

        return tooltip

    def _has_citation_patterns(self, sentence: str) -> bool:
        """
        Check for academic citation patterns
        """
        citation_indicators = ['et al.', 'ibid.', 'cf.', 'e.g.', 'i.e.', 'vol.', 'pp.', 'ed.', 'trans.',
                               'reference', 'cited', 'according to']

        return any(indicator in sentence.lower() for indicator in citation_indicators)

    def _has_informal_language(self, sentence: str) -> bool:
        """
        Check for informal language patterns
        """
        informal_indicators = ['lol', 'omg', 'btw', 'imo', 'tbh', 'afaik', 'smh',
                               'šŸ‘‹', 'šŸ˜‚', 'ā¤ļø', 'haha', 'wow', 'awesome']

        return any(indicator in sentence.lower() for indicator in informal_indicators)

    def _has_technical_terms(self, sentence: str) -> bool:
        """
        Check for domain-specific technical terms
        """
        technical_indicators = ['hereinafter', 'whereas', 'aforementioned', 'diagnosis', 'prognosis',
                                'etiology', 'algorithm', 'neural network', 'machine learning', 'api',
                                'endpoint', 'database', 'quantum', 'thermodynamics', 'hypothesis',
                                'methodology']

        return any(indicator in sentence.lower() for indicator in technical_indicators)

    def _has_ambiguous_phrasing(self, sentence: str) -> bool:
        """
        Check for ambiguous phrasing that might indicate human writing
        """
        ambiguous_indicators = ['perhaps', 'maybe', 'possibly', 'likely', 'appears to', 'seems to',
                                'might be', 'could be']

        return any(indicator in sentence.lower() for indicator in ambiguous_indicators)

    def _has_complex_structure(self, sentence: str) -> bool:
        """
        Check if sentence has complex linguistic structure
        """
        words = sentence.split()

        if (len(words) < 8):
            return False

        complex_indicators = ['which', 'that', 'although', 'because', 'while', 'when', 'if',
                              'however', 'therefore']

        return any(indicator in sentence.lower() for indicator in complex_indicators)

    def _has_emotional_language(self, sentence: str) -> bool:
        """
        Check for emotional or subjective language
        """
        emotional_indicators = ['feel', 'believe', 'think', 'wonder', 'hope', 'wish',
                                'love', 'hate', 'frustrating', 'exciting']

        return any(indicator in sentence.lower() for indicator in emotional_indicators)

    def _has_business_jargon(self, sentence: str) -> bool:
        """
        Check for business jargon
        """
        jargon_indicators = ['synergy', 'leverage', 'bandwidth', 'circle back', 'touch base',
                             'value add', 'core competency']

        return any(indicator in sentence.lower() for indicator in jargon_indicators)

    def _has_instructional_language(self, sentence: str) -> bool:
        """
        Check for instructional language patterns
        """
        instructional_indicators = ['step by step', 'firstly', 'secondly', 'finally', 'note that',
                                    'remember to', 'make sure']

        return any(indicator in sentence.lower() for indicator in instructional_indicators)

    def _has_step_by_step_pattern(self, sentence: str) -> bool:
        """
        Check for step-by-step instructions
        """
        step_patterns = ['step 1', 'step 2', 'step 3', 'step one', 'step two', 'first step', 'next step']

        return any(pattern in sentence.lower() for pattern in step_patterns)

    def _has_examples(self, sentence: str) -> bool:
        """
        Check for example indicators
        """
        example_indicators = ['for example', 'for instance', 'such as', 'e.g.', 'as an example']

        return any(indicator in sentence.lower() for indicator in example_indicators)

    def _has_code_like_patterns(self, sentence: str) -> bool:
        """
        Check for code-like patterns in technical domains (case-sensitive match)
        """
        code_patterns = ['function', 'variable', 'class', 'method', 'import',
                         'def ', 'void ', 'public ', 'private ']

        return any(pattern in sentence for pattern in code_patterns)

    def _analyze_sentence_complexity(self, sentence: str) -> float:
        """
        Analyze sentence complexity (0 = simple, 1 = complex)
        """
        words = sentence.split()

        if (len(words) < 5):
            return 0.2

        complexity_indicators = ['although', 'because', 'while', 'when', 'if', 'since', 'unless',
                                 'until', 'which', 'that', 'who', 'whom', 'whose', 'and', 'but',
                                 'or', 'yet', 'so', 'however', 'therefore', 'moreover', 'furthermore',
                                 'nevertheless', ',', ';', ':', '—']

        score = 0.0

        # Length contribution - check the larger bound first so it can actually fire
        if (len(words) > 25):
            score += 0.5
        elif (len(words) > 15):
            score += 0.3

        indicator_count = sum(1 for indicator in complexity_indicators if indicator in sentence.lower())
        score          += min(0.5, indicator_count * 0.1)

        clause_indicators = [',', ';', 'and', 'but', 'or', 'because', 'although']
        clause_count      = sum(1 for indicator in clause_indicators if indicator in sentence.lower())
        score            += min(0.2, clause_count * 0.05)

        return min(1.0, score)

    def _has_repetition(self, sentence: str) -> bool:
        """
        Check if sentence has word repetition (common in AI text)
        """
        words = sentence.lower().split()

        if (len(words) < 6):
            return False

        word_counts = dict()

        for word in words:
            if (len(word) > 3):
                word_counts[word] = word_counts.get(word, 0) + 1

        repeated_words = [word for word, count in word_counts.items() if count > 2]

        return len(repeated_words) > 0

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split the text chunk into multiple sentences
        """
        sentences          = self.text_processor.split_sentences(text)
        filtered_sentences = list()

        for sentence in sentences:
            clean_sentence = sentence.strip()

            if (len(clean_sentence) >= 3):
                filtered_sentences.append(clean_sentence)

        return filtered_sentences
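
    # Worked example for _analyze_sentence_complexity above (a hypothetical
    # 18-word sentence containing "although", "which", and one comma, with no
    # other indicator substrings):
    #
    #   length bonus    : 18 > 15          -> +0.3
    #   indicator bonus : 3 matches * 0.1  -> +0.3
    #   clause bonus    : 2 matches * 0.05 -> +0.1
    #   total           : 0.7 (capped at 1.0)
    #
    # Note that matching is by substring, so short indicators such as 'or' or
    # 'if' can also fire inside longer words (e.g. "for", "shift").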

    def generate_html(self, highlighted_sentences: List[HighlightedSentence], include_legend: bool = False,
                      include_metrics: bool = True) -> str:
        """
        Generate HTML with highlighted sentences

        Arguments:
        ----------
            highlighted_sentences { List[HighlightedSentence] } : Sentences with highlighting data
            include_legend        { bool }                      : Whether to include legend (set to False to avoid duplicates)
            include_metrics       { bool }                      : Whether to include metrics summary

        Returns:
        --------
            { str } : HTML content
        """
        html_parts = list()

        # Add CSS
        html_parts.append(self._generate_enhanced_css())

        # Only include legend if explicitly requested (usually False to avoid duplicates)
        if include_legend:
            html_parts.append(self._generate_legend_html())

        # Add highlighted text container
        html_parts.append('<div class="highlighted-text">')

        for sent in highlighted_sentences:
            extra_class = " mixed-highlight" if sent.is_mixed_content else ""

            html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" title="{sent.tooltip}">'
                              f'{sent.text}'
                              f'</span> '
                              )

        html_parts.append('</div>')
        # Add metrics summary if requested (separate from legend)
        if include_metrics and highlighted_sentences:
            html_parts.append(self._generate_metrics_summary(highlighted_sentences))

        return '\n'.join(html_parts)

    def _generate_enhanced_css(self) -> str:
        """
        Generate CSS for highlighting for better readability
        """
        return """
        <style>
            .highlighted-text { line-height: 1.8; }
            .highlight        { padding: 2px 4px; border-radius: 4px; cursor: help; }
            .very-high-human  { background-color: #dcfce7; }
            .high-human       { background-color: #bbf7d0; }
            .medium-human     { background-color: #86efac; }
            .uncertain        { background-color: #fef9c3; }
            .medium-ai        { background-color: #fde68a; }
            .high-ai          { background-color: #fed7aa; }
            .very-high-ai     { background-color: #fecaca; }
            .mixed-content    { background-color: #e9d5ff; }
            .mixed-highlight  { outline: 1px dashed #a78bfa; }
        </style>
        """

    def _generate_metrics_summary(self, sentences: List[HighlightedSentence]) -> str:
        """
        Generate summary statistics for highlighted sentences
        """
        if not sentences:
            return ""

        # Calculate summary metrics
        total_sentences = len(sentences)

        # Count sentences by category
        very_high_ai    = len([s for s in sentences if s.color_class == "very-high-ai"])
        high_ai         = len([s for s in sentences if s.color_class == "high-ai"])
        medium_ai       = len([s for s in sentences if s.color_class == "medium-ai"])
        uncertain       = len([s for s in sentences if s.color_class == "uncertain"])
        medium_human    = len([s for s in sentences if s.color_class == "medium-human"])
        high_human      = len([s for s in sentences if s.color_class == "high-human"])
        very_high_human = len([s for s in sentences if s.color_class == "very-high-human"])
        mixed           = len([s for s in sentences if s.color_class == "mixed-content"])

        # Calculate overall risk score (weighted average)
        weighted_risk = 0.0

        for sent in sentences:
            weight         = self.RISK_WEIGHTS.get(sent.color_class, 0.4)
            weighted_risk += sent.ai_probability * weight

        overall_risk_score = weighted_risk / total_sentences if total_sentences else 0.0

        # Calculate average probabilities
        avg_ai_prob    = sum(s.ai_probability for s in sentences) / total_sentences
        avg_human_prob = sum(s.human_probability for s in sentences) / total_sentences

        # Sentence counts
        ai_sentences    = very_high_ai + high_ai + medium_ai
        human_sentences = very_high_human + high_human + medium_human

        html = f"""
        <div class="metrics-summary">
            <h4>šŸ“Š Text Analysis Summary</h4>
            <table>
                <tr><td>Overall Risk Score</td><td>{overall_risk_score:.1%}</td></tr>
                <tr><td>Average AI Probability</td><td>{avg_ai_prob:.1%}</td></tr>
                <tr><td>AI Sentences</td><td>{ai_sentences} ({ai_sentences/total_sentences:.1%})</td></tr>
                <tr><td>Human Sentences</td><td>{human_sentences} ({human_sentences/total_sentences:.1%})</td></tr>
                <tr><td>Uncertain Sentences</td><td>{uncertain} ({uncertain/total_sentences:.1%})</td></tr>
                <tr><td>Mixed Sentences</td><td>{mixed} ({mixed/total_sentences:.1%})</td></tr>
                <tr><td>Total Sentences</td><td>{total_sentences}</td></tr>
                <tr><td>Domain</td><td>{self.domain.value.replace('_', ' ').title()}</td></tr>
            </table>
        </div>
        """

        return html

    def _generate_legend_html(self) -> str:
        """
        Generate legend HTML - Only used if explicitly requested
        """
        return """
        <div class="detection-legend">
            <h4>AI Detection Legend</h4>
            <ul>
                <li><span class="legend-swatch" style="background-color: #dcfce7;"></span> Very Likely Human (0-10%)</li>
                <li><span class="legend-swatch" style="background-color: #bbf7d0;"></span> Likely Human (10-25%)</li>
                <li><span class="legend-swatch" style="background-color: #86efac;"></span> Possibly Human (25-40%)</li>
                <li><span class="legend-swatch" style="background-color: #fef9c3;"></span> Uncertain (40-60%)</li>
                <li><span class="legend-swatch" style="background-color: #fde68a;"></span> Possibly AI (60-75%)</li>
                <li><span class="legend-swatch" style="background-color: #fed7aa;"></span> Likely AI (75-90%)</li>
                <li><span class="legend-swatch" style="background-color: #fecaca;"></span> Very Likely AI (90-100%)</li>
                <li><span class="legend-swatch" style="background-color: #e9d5ff;"></span> Mixed Content</li>
            </ul>
        </div>
        """

    def calculate_metrics(self, highlighted_sentences: List[HighlightedSentence]) -> Dict[str, float]:
        """
        Calculate metrics for external use

        Arguments:
        ----------
            highlighted_sentences { List[HighlightedSentence] } : Sentences with highlighting data

        Returns:
        --------
            { Dict[str, float] } : Dictionary with metrics
        """
        if not highlighted_sentences:
            return {}

        total_sentences = len(highlighted_sentences)

        # Calculate weighted risk score
        weighted_risk = 0.0

        for sent in highlighted_sentences:
            weight         = self.RISK_WEIGHTS.get(sent.color_class, 0.4)
            weighted_risk += sent.ai_probability * weight

        overall_risk_score = weighted_risk / total_sentences

        # Count sentences by category
        ai_sentences        = len([s for s in highlighted_sentences if s.ai_probability >= 0.6])
        human_sentences     = len([s for s in highlighted_sentences if s.ai_probability <= 0.4])
        uncertain_sentences = len([s for s in highlighted_sentences if 0.4 < s.ai_probability < 0.6])
        mixed_sentences     = len([s for s in highlighted_sentences if s.is_mixed_content])

        # Average probabilities
        avg_ai_prob    = sum(s.ai_probability for s in highlighted_sentences) / total_sentences
        avg_human_prob = sum(s.human_probability for s in highlighted_sentences) / total_sentences
        avg_confidence = sum(s.confidence for s in highlighted_sentences) / total_sentences

        return {'overall_risk_score'        : overall_risk_score,
                'avg_ai_probability'        : avg_ai_prob,
                'avg_human_probability'     : avg_human_prob,
                'avg_confidence'            : avg_confidence,
                'ai_sentence_count'         : ai_sentences,
                'human_sentence_count'      : human_sentences,
                'uncertain_sentence_count'  : uncertain_sentences,
                'mixed_sentence_count'      : mixed_sentences,
                'total_sentences'           : total_sentences,
                'ai_sentence_percentage'    : ai_sentences / total_sentences,
                'human_sentence_percentage' : human_sentences / total_sentences,
                }


# Export
__all__ = ["TextHighlighter",
           "HighlightedSentence",
           ]