# Text_Authenticator/services/highlighter.py
# (source page metadata: author satyaki-mitra, "Architecture updated", commit 44d0409)
# DEPENDENCIES
import re
from typing import List
from typing import Dict
from typing import Tuple
from loguru import logger
from typing import Optional
from config.enums import Domain
from config.schemas import MetricResult
from config.schemas import EnsembleResult
from processors.text_processor import TextProcessor
from config.threshold_config import ConfidenceLevel
from config.schemas import HighlightedSentenceResult
from config.threshold_config import MetricThresholds
from config.threshold_config import get_confidence_level
from services.ensemble_classifier import EnsembleClassifier
from config.threshold_config import get_threshold_for_domain
from config.threshold_config import get_active_metric_weights
class TextHighlighter:
    """
    Generates sentence-level highlighting with ensemble results integration.

    FEATURES:
        - Sentence-level highlighting with confidence scores
        - Domain-aware calibration
        - Ensemble-assisted probability aggregation
        - Hybrid content detection
        - Explainable tooltips
    """
    # Color thresholds - 4 categories. Each entry is:
    # (min_prob_inclusive, max_prob_exclusive, css_class, hex_color, base_tooltip).
    # The last band's upper bound is 1.01 so that a probability of exactly 1.0 still matches.
    COLOR_THRESHOLDS = [(0.00, 0.40, "authentic", "#d1fae5", "Likely authentically written"),   # Authentic: Synthetic probability < 0.4
                        (0.40, 0.60, "uncertain", "#fef3c7", "Uncertain authorship"),           # Uncertain: 0.4 ≤ Synthetic probability < 0.6
                        (0.60, 0.80, "hybrid", "#e9d5ff", "Mixed synthetic/authentic content"), # Hybrid: 0.6 ≤ Synthetic probability < 0.8 OR explicit hybrid detection
                        (0.80, 1.01, "synthetic", "#fee2e2", "Likely synthetically generated"), # Synthetic: Synthetic probability ≥ 0.8
                        ]
    # Hybrid detection threshold: minimum ensemble hybrid probability to classify as hybrid
    HYBRID_PROB_THRESHOLD = 0.25
def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
    """
    Set up the highlighter for a given domain, wiring in an ensemble classifier.

    Arguments:
    ----------
    domain              { Domain }             : Text domain for adaptive thresholding
    ensemble_classifier { EnsembleClassifier } : Optional ensemble for sentence-level analysis;
                                                 a default one is built when omitted
    """
    self.domain = domain
    self.domain_thresholds = get_threshold_for_domain(domain)
    self.text_processor = TextProcessor()
    # Fall back to a lazily-built default ensemble when none is supplied
    self.ensemble = ensemble_classifier or self._create_default_ensemble()
def _create_default_ensemble(self) -> EnsembleClassifier:
    """
    Build the default ensemble classifier, degrading to a simpler method pair on failure.
    """
    try:
        return EnsembleClassifier(primary_method = "confidence_calibrated", fallback_method = "domain_weighted")
    except Exception as exc:
        # Degraded mode: simpler aggregation strategies that should always construct
        logger.warning(f"Failed to create default ensemble: {exc}. Using fallback mode.")
        return EnsembleClassifier(primary_method = "domain_weighted", fallback_method = "simple_average")
def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult] = None,
                        enabled_metrics: Optional[Dict[str, bool]] = None, use_sentence_level: bool = True) -> List[HighlightedSentenceResult]:
    """
    Generate sentence-level highlights with ensemble integration.

    Arguments:
    ----------
    text               { str }            : Original text
    metric_results     { dict }           : Results from all metrics
    ensemble_result    { EnsembleResult } : Optional document-level ensemble result
    enabled_metrics    { dict }           : Dict of metric_name -> is_enabled
    use_sentence_level { bool }           : Whether to compute sentence-level probabilities

    Returns:
    --------
    { list } : List of HighlightedSentenceResult objects (one per sentence; never raises -
               failures degrade to fallback sentences or a single error sentence)
    """
    try:
        # Validate inputs: blank text is delegated to a dedicated handler
        if not text or not text.strip():
            return self._handle_empty_text(text = text,
                                           metric_results = metric_results,
                                           ensemble_result = ensemble_result,
                                           )
        # Get domain-appropriate weights for enabled metrics
        # (default: every metric that produced a result counts as enabled)
        if enabled_metrics is None:
            enabled_metrics = {name: True for name in metric_results.keys()}
        weights = get_active_metric_weights(self.domain, enabled_metrics)
        # Split text into sentences with error handling
        sentences = self._split_sentences_with_fallback(text = text)
        if not sentences:
            return self._handle_no_sentences(text, metric_results, ensemble_result)
        # Calculate probabilities for each sentence using ENSEMBLE METHODS
        highlighted_sentences = list()
        for idx, sentence in enumerate(sentences):
            try:
                if use_sentence_level:
                    # Use ensemble for sentence-level analysis
                    synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._calculate_sentence_ensemble_probability(sentence = sentence,
                                                                                                                                      metric_results = metric_results,
                                                                                                                                      weights = weights,
                                                                                                                                      ensemble_result = ensemble_result,
                                                                                                                                      )
                else:
                    # Use document-level ensemble probabilities (same values for every sentence)
                    synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._get_document_ensemble_probability(ensemble_result = ensemble_result,
                                                                                                                                metric_results = metric_results,
                                                                                                                                weights = weights,
                                                                                                                                )
                # Apply domain-specific adjustments with limits (bounded to +/-0.3 of the raw value)
                synthetic_prob = self._apply_domain_specific_adjustments(sentence = sentence,
                                                                         synthetic_prob = synthetic_prob,
                                                                         sentence_length = len(sentence.split()),
                                                                         )
                # Determine if this is hybrid content
                is_hybrid_content = self._is_hybrid_content(synthetic_prob = synthetic_prob,
                                                            hybrid_prob = hybrid_prob,
                                                            confidence = confidence,
                                                            )
                # Get confidence level
                confidence_level = get_confidence_level(confidence)
                # Get color class (hybrid flag overrides the plain probability bands)
                color_class, color_hex, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
                                                                                      is_hybrid_content = is_hybrid_content,
                                                                                      hybrid_prob = hybrid_prob,
                                                                                      )
                # Generate enhanced tooltip
                tooltip = self._generate_ensemble_tooltip(sentence = sentence,
                                                          synthetic_prob = synthetic_prob,
                                                          authentic_prob = authentic_prob,
                                                          hybrid_prob = hybrid_prob,
                                                          confidence = confidence,
                                                          confidence_level = confidence_level,
                                                          tooltip_base = tooltip_base,
                                                          breakdown = breakdown,
                                                          is_hybrid_content = is_hybrid_content,
                                                          )
                highlighted_sentences.append(HighlightedSentenceResult(text = sentence,
                                                                       synthetic_probability = synthetic_prob,
                                                                       authentic_probability = authentic_prob,
                                                                       hybrid_probability = hybrid_prob,
                                                                       confidence = confidence,
                                                                       confidence_level = confidence_level,
                                                                       color_class = color_class,
                                                                       tooltip = tooltip,
                                                                       index = idx,
                                                                       is_hybrid_content = is_hybrid_content,
                                                                       metric_breakdown = breakdown,
                                                                       )
                                             )
            except Exception as e:
                # Per-sentence failures degrade to a neutral fallback instead of aborting the run
                logger.warning(f"Failed to process sentence {idx}: {e}")
                # Add fallback sentence
                highlighted_sentences.append(self._create_fallback_sentence(sentence, idx))
        return highlighted_sentences
    except Exception as e:
        # Catastrophic failure: emit a single low-confidence error sentence
        logger.error(f"Highlight generation failed: {e}")
        return self._create_error_fallback(text, metric_results)
def _handle_empty_text(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
    """
    Produce a single placeholder highlight when the input text is blank.

    If a document-level ensemble result is available its probabilities seed the
    placeholder; otherwise a neutral fallback is used.
    """
    if not ensemble_result:
        return [self._create_fallback_sentence("No text content", 0)]
    placeholder = self._create_fallback_sentence(text = "No text content",
                                                 index = 0,
                                                 synthetic_prob = ensemble_result.synthetic_probability,
                                                 authentic_prob = ensemble_result.authentic_probability)
    return [placeholder]
def _handle_no_sentences(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
    """
    Handle the case where no sentences could be extracted from the text.
    """
    stripped = text.strip() if text else ""
    if stripped:
        # Treat the entire text as a single sentence
        return [self._create_fallback_sentence(stripped, 0)]
    return [self._create_fallback_sentence("No processable content", 0)]
def _create_fallback_sentence(self, text: str, index: int, synthetic_prob: float = 0.5, authentic_prob: float = 0.5) -> HighlightedSentenceResult:
    """
    Build a neutral placeholder result for a sentence whose processing failed.
    """
    # Low fixed confidence signals that this entry carries no real evidence
    fallback_confidence = 0.3
    color_class, _, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
                                                                   is_hybrid_content = False,
                                                                   hybrid_prob = 0.0)
    return HighlightedSentenceResult(text = text,
                                     synthetic_probability = synthetic_prob,
                                     authentic_probability = authentic_prob,
                                     hybrid_probability = 0.0,
                                     confidence = fallback_confidence,
                                     confidence_level = get_confidence_level(fallback_confidence),
                                     color_class = color_class,
                                     tooltip = f"Fallback: {tooltip_base}\nProcessing failed for this sentence",
                                     index = index,
                                     is_hybrid_content = False,
                                     metric_breakdown = {"fallback": synthetic_prob})
def _create_error_fallback(self, text: str, metric_results: Dict[str, MetricResult]) -> List[HighlightedSentenceResult]:
    """
    Last-resort single-sentence result used when highlight generation fails entirely.
    """
    # Truncate long inputs so the error entry stays readable
    preview = f"{text[:100]}..." if len(text) > 100 else text
    error_sentence = HighlightedSentenceResult(text = preview,
                                               synthetic_probability = 0.5,
                                               authentic_probability = 0.5,
                                               hybrid_probability = 0.0,
                                               confidence = 0.1,
                                               confidence_level = get_confidence_level(0.1),
                                               color_class = "uncertain",
                                               tooltip = "Error in text processing",
                                               index = 0,
                                               is_hybrid_content = False,
                                               metric_breakdown = {"error": 0.5})
    return [error_sentence]
def _split_sentences_with_fallback(self, text: str) -> List[str]:
"""
Split text into sentences with comprehensive fallback handling
"""
try:
sentences = self.text_processor.split_sentences(text)
filtered_sentences = [s.strip() for s in sentences if len(s.strip()) >= 3]
if filtered_sentences:
return filtered_sentences
# Fallback: split by common sentence endings
fallback_sentences = re.split(r'[.!?]+', text)
fallback_sentences = [s.strip() for s in fallback_sentences if len(s.strip()) >= 3]
if fallback_sentences:
return fallback_sentences
# Ultimate fallback: treat as single sentence if meaningful
if text.strip():
return [text.strip()]
return []
except Exception as e:
logger.warning(f"Sentence splitting failed, using fallback: {e}")
# Return text as single sentence
return [text] if text.strip() else []
def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult], weights: Dict[str, float], ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Calculate sentence probabilities using ensemble methods with domain calibration.

    Returns a 5-tuple: (synthetic_prob, authentic_prob, hybrid_prob, confidence,
    per-metric breakdown dict).
    """
    sentence_length = len(sentence.split())
    # Handling very short sentences – do not force neutral, but reduce confidence.
    # The first enabled, error-free metric's document-level probability is reused as-is.
    if (sentence_length < 3):
        base_synthetic_prob = 0.5
        base_confidence = 0.2
        breakdown = {"short_sentence": base_synthetic_prob}
        for name, result in metric_results.items():
            if (result.error is None and weights.get(name, 0.0) > 0):
                base_synthetic_prob = result.synthetic_probability
                breakdown[name] = base_synthetic_prob
                break
        return (base_synthetic_prob,
                1.0 - base_synthetic_prob,
                0.0,
                base_confidence,
                breakdown
                )
    # Build sentence-level metric results (skipping metrics that errored at document level)
    sentence_metric_results = dict()
    breakdown = dict()
    for name, doc_result in metric_results.items():
        if doc_result.error is not None:
            continue
        try:
            sentence_prob = self._compute_sentence_metric(metric_name = name,
                                                          sentence = sentence,
                                                          result = doc_result,
                                                          weight = weights.get(name, 0.0),
                                                          )
            sentence_metric_results[name] = self._create_sentence_metric_result(metric_name = name,
                                                                                synthetic_prob = sentence_prob,
                                                                                doc_result = doc_result,
                                                                                sentence_length = sentence_length,
                                                                                )
            breakdown[name] = sentence_prob
        except Exception as e:
            # A failing metric falls back to its document-level probability in the breakdown
            logger.warning(f"Metric {name} failed for sentence: {e}")
            breakdown[name] = doc_result.synthetic_probability
    # Ensemble aggregation (PRIMARY PATH)
    if sentence_metric_results:
        try:
            ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                             domain = self.domain,
                                                             )
            return (ensemble_sentence_result.synthetic_probability,
                    ensemble_sentence_result.authentic_probability,
                    ensemble_sentence_result.hybrid_probability,
                    ensemble_sentence_result.overall_confidence,
                    breakdown,
                    )
        except Exception as e:
            logger.warning(f"Sentence ensemble failed: {e}")
    # Fallback: weighted average aggregation over the document-level metric results
    return self._fallback_weighted_probability(metric_results, weights, breakdown)
def _compute_sentence_metric(self, metric_name: str, sentence: str, result: MetricResult, weight: float) -> float:
    """
    Project a document-level metric probability onto a single sentence
    using domain-specific thresholds.
    """
    # No thresholds configured for this metric in this domain:
    # pass the document-level probability through unchanged
    metric_thresholds = getattr(self.domain_thresholds, metric_name, None)
    if not metric_thresholds:
        return result.synthetic_probability
    # Otherwise nudge the document-level probability with sentence-level heuristics
    return self._apply_metric_specific_adjustments(metric_name = metric_name,
                                                   sentence = sentence,
                                                   base_prob = result.synthetic_probability,
                                                   sentence_length = len(sentence.split()),
                                                   thresholds = metric_thresholds)
def _create_sentence_metric_result(self, metric_name: str, synthetic_prob: float, doc_result: MetricResult, sentence_length: int) -> MetricResult:
    """
    Derive a sentence-scoped MetricResult from a document-level one.
    """
    # Confidence is discounted relative to the document-level value
    scoped_confidence = self._calculate_sentence_confidence(doc_confidence = doc_result.confidence,
                                                            sentence_length = sentence_length)
    return MetricResult(metric_name = metric_name,
                        synthetic_probability = synthetic_prob,
                        authentic_probability = 1.0 - synthetic_prob,
                        hybrid_probability = 0.0,
                        confidence = scoped_confidence,
                        details = doc_result.details,
                        error = None)
def _calculate_sentence_confidence(self, doc_confidence: float, sentence_length: int) -> float:
"""
Calculate confidence for sentence-level analysis with length consideration
"""
base_reduction = 0.8
# Scale confidence penalty with sentence length
length_penalty = max(0.3, min(1.0, sentence_length / 12.0)) # Normalize around 12 words
return max(0.1, doc_confidence * base_reduction * length_penalty)
def _fallback_weighted_probability(self, metric_results: Dict[str, MetricResult], weights: Dict[str, float], breakdown: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Weighted-average aggregation used when the ensemble path is unavailable.

    Only error-free metrics with a strictly positive weight contribute.
    Returns neutral values (0.5/0.5) when nothing usable remains.
    """
    synthetic_sum = 0.0
    authentic_sum = 0.0
    confidence_values = []
    total_weight = 0.0
    for name, result in metric_results.items():
        if result.error is not None:
            continue
        weight = weights.get(name, 0.0)
        if weight <= 0:
            continue
        synthetic_sum += result.synthetic_probability * weight
        authentic_sum += result.authentic_probability * weight
        confidence_values.append(result.confidence)
        total_weight += weight
    if total_weight == 0:
        # No usable metric contributed: fall back to neutral probabilities
        return 0.5, 0.5, 0.0, 0.5, breakdown or {}
    avg_confidence = sum(confidence_values) / len(confidence_values) if confidence_values else 0.5
    # hybrid probability is always 0.0 on this fallback path
    return synthetic_sum / total_weight, authentic_sum / total_weight, 0.0, avg_confidence, breakdown
def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult], metric_results: Dict[str, MetricResult], weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Reuse the document-level ensemble result, or aggregate metrics when it is absent.
    """
    if not ensemble_result:
        # No precomputed ensemble result: aggregate directly from the metrics
        return self._fallback_weighted_probability(metric_results, weights, {})
    per_metric = {name: result.synthetic_probability for name, result in metric_results.items()}
    return (ensemble_result.synthetic_probability,
            ensemble_result.authentic_probability,
            ensemble_result.hybrid_probability,
            ensemble_result.overall_confidence,
            per_metric)
def _apply_domain_specific_adjustments(self, sentence: str, synthetic_prob: float, sentence_length: int) -> float:
    """
    Apply domain-specific adjustments to Synthetic probability with limits.

    Heuristic multipliers (>1 pushes towards synthetic, <1 towards authentic) are
    collected per domain, the two strongest are applied, and the final value may
    move at most 0.3 away from the incoming probability (then clamped to [0, 1]).
    """
    original_prob = synthetic_prob
    adjustments = list()
    sentence_lower = sentence.lower()
    # Technical & AI/ML domains
    # NOTE: each elif chain below appends at most ONE multiplier per domain branch,
    # so the "strongest 2" cap further down rarely binds as written.
    if self.domain in [Domain.AI_ML, Domain.SOFTWARE_DEV, Domain.TECHNICAL_DOC, Domain.ENGINEERING, Domain.SCIENCE]:
        if self._has_technical_terms(sentence_lower):
            adjustments.append(1.1)
        elif self._has_code_like_patterns(sentence):
            adjustments.append(1.15)
        elif (sentence_length > 35):
            adjustments.append(1.05)
    # Creative & informal domains (informal/emotional/short text reads more human)
    elif self.domain in [Domain.CREATIVE, Domain.SOCIAL_MEDIA, Domain.BLOG_PERSONAL]:
        if self._has_informal_language(sentence_lower):
            adjustments.append(0.7)
        elif self._has_emotional_language(sentence):
            adjustments.append(0.8)
        elif sentence_length < 10:
            adjustments.append(0.8)
    # Academic & formal domains (citations read human; dense technical prose less so)
    elif self.domain in [Domain.ACADEMIC, Domain.LEGAL, Domain.MEDICAL]:
        if self._has_citation_patterns(sentence):
            adjustments.append(0.8)
        elif self._has_technical_terms(sentence_lower):
            adjustments.append(1.1)
        elif (sentence_length > 40):
            adjustments.append(1.1)
    # Business & professional domains
    elif self.domain in [Domain.BUSINESS, Domain.MARKETING, Domain.JOURNALISM]:
        if self._has_business_jargon(sentence_lower):
            adjustments.append(1.05)
        elif self._has_ambiguous_phrasing(sentence_lower):
            adjustments.append(0.9)
        elif (15 <= sentence_length <= 25):
            adjustments.append(0.9)
    # Tutorial & educational domains
    elif (self.domain == Domain.TUTORIAL):
        if self._has_instructional_language(sentence_lower):
            adjustments.append(0.85)
        elif self._has_step_by_step_pattern(sentence):
            adjustments.append(0.8)
        elif self._has_examples(sentence):
            adjustments.append(0.9)
    # General domain - minimal adjustments
    elif (self.domain == Domain.GENERAL):
        if self._has_complex_structure(sentence):
            adjustments.append(0.9)
        elif self._has_repetition(sentence):
            adjustments.append(1.1)
    # Apply adjustments with limits - take strongest 2 adjustments maximum
    if adjustments:
        # Sort by impact (farthest from 1.0)
        adjustments.sort(key = lambda x: abs(x - 1.0), reverse = True)
        # Limit to 2 strongest
        strongest_adjustments = adjustments[:2]
        for adjustment in strongest_adjustments:
            synthetic_prob *= adjustment
    # Ensure probability stays within bounds and doesn't change too drastically
    max_change = 0.3  # Maximum 0.3 absolute change from the incoming probability
    bounded_prob = max(original_prob - max_change, min(original_prob + max_change, synthetic_prob))
    return max(0.0, min(1.0, bounded_prob))
def _apply_metric_specific_adjustments(self, metric_name: str, sentence: str, base_prob: float, sentence_length: int, thresholds: MetricThresholds) -> float:
    """
    Nudge a metric's base probability with cheap sentence-level heuristics.

    All multipliers are clamped into [0.0, 1.0]; unrecognised metric names
    pass the base probability through unchanged.
    NOTE(review): `thresholds` is currently unused here - kept for interface stability.
    """
    if metric_name == "perplexity":
        # Very short sentences look more synthetic; very long ones less so
        if sentence_length < 8:
            return min(1.0, base_prob * 1.2)
        if sentence_length > 25:
            return max(0.0, base_prob * 0.8)
    elif metric_name == "entropy":
        tokens = sentence.split()
        if len(tokens) > 3:
            # Lexical diversity: low diversity reads synthetic, high reads authentic
            diversity = len(set(tokens)) / len(tokens)
            if diversity < 0.6:
                return min(1.0, base_prob * 1.2)
            if diversity > 0.8:
                return max(0.0, base_prob * 0.8)
    elif metric_name == "linguistic":
        complexity = self._analyze_sentence_complexity(sentence)
        if complexity < 0.3:
            return min(1.0, base_prob * 1.1)
        if complexity > 0.7:
            return max(0.0, base_prob * 0.9)
    elif metric_name == "structural":
        # Extreme lengths are down-weighted; mid-length sentences up-weighted
        if sentence_length < 5 or sentence_length > 40:
            return max(0.0, base_prob * 0.8)
        if 8 <= sentence_length <= 20:
            return min(1.0, base_prob * 1.1)
    elif metric_name == "semantic_analysis":
        if self._has_repetition(sentence):
            return min(1.0, base_prob * 1.2)
    elif metric_name == "multi_perturbation_stability":
        # MultiPerturbationStability adjustment for sentence level
        if sentence_length > 15:
            return min(1.0, base_prob * 1.1)
    return base_prob
def _is_hybrid_content(self, synthetic_prob: float, hybrid_prob: float, confidence: float) -> bool:
"""
Determine if content should be classified as hybrid
"""
# Case 1: Explicit high hybrid probability from ensemble
if (hybrid_prob > self.HYBRID_PROB_THRESHOLD):
return True
# Case 2: High uncertainty combined with ambiguous synthetic probability
if (confidence < 0.3 and 0.4 <= synthetic_prob <= 0.7):
return True
# Case 3: Synthetic probability in hybrid range (0.6-0.8)
if (0.6 <= synthetic_prob < 0.8):
return True
return False
def _get_color_for_probability(self, synthetic_prob: float, is_hybrid_content: bool = False, hybrid_prob: float = 0.0) -> Tuple[str, str, str]:
"""
Get color class with simplified 4-category system
"""
# Handle hybrid content first
if is_hybrid_content:
return "hybrid", "#e9d5ff", f"Mixed synthetic/authentic content ({hybrid_prob:.1%} hybrid)"
# Iterate through simplified thresholds
for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
if (min_thresh <= synthetic_prob < max_thresh):
return color_class, color_hex, tooltip
# Fallback for edge cases
return "uncertain", "#fef3c7", "Uncertain authorship"
def _generate_ensemble_tooltip(self, sentence: str, synthetic_prob: float, authentic_prob: float, hybrid_prob: float, confidence: float, confidence_level: ConfidenceLevel,
                               tooltip_base: str, breakdown: Optional[Dict[str, float]] = None, is_hybrid_content: bool = False) -> str:
    """
    Assemble the hover tooltip shown for a highlighted sentence, including
    ensemble probabilities and an optional per-metric breakdown.
    """
    lines = [tooltip_base]
    if is_hybrid_content:
        lines.append("🔀 HYBRID CONTENT DETECTED")
    lines.append(f"Synthetic Probability: {synthetic_prob:.1%}")
    lines.append(f"Authentic Probability: {authentic_prob:.1%}")
    lines.append(f"Hybrid Probability: {hybrid_prob:.1%}")
    lines.append(f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})")
    lines.append(f"Domain: {self.domain.value.replace('_', ' ').title()}")
    lines.append(f"Length: {len(sentence.split())} words")
    tooltip = "\n".join(lines)
    if breakdown:
        # Only the first four metrics are surfaced, to keep the tooltip compact
        bullet_lines = [f"• {metric}: {prob:.1%}" for metric, prob in list(breakdown.items())[:4]]
        tooltip += "\n\nMetric Breakdown:\n" + "\n".join(bullet_lines)
        # NOTE(review): the ensemble-method footer is emitted only when a breakdown
        # exists - confirm against the original formatting intent
        tooltip += f"\n\nEnsemble Method: {getattr(self.ensemble, 'primary_method', 'fallback')}"
    return tooltip
def _has_citation_patterns(self, sentence: str) -> bool:
"""
Check for academic citation patterns
"""
citation_indicators = ['et al.', 'ibid.', 'cf.', 'e.g.', 'i.e.', 'vol.', 'pp.', 'ed.', 'trans.', 'reference', 'cited', 'according to']
return any(indicator in sentence.lower() for indicator in citation_indicators)
def _has_informal_language(self, sentence: str) -> bool:
"""
Check for informal language patterns
"""
informal_indicators = ['lol', 'omg', 'btw', 'imo', 'tbh', 'afaik', 'smh', '👋', '😂', '❤️', 'haha', 'wow', 'awesome']
return any(indicator in sentence.lower() for indicator in informal_indicators)
def _has_technical_terms(self, sentence: str) -> bool:
"""
Check for domain-specific technical terms
"""
technical_indicators = ['hereinafter', 'whereas', 'aforementioned', 'diagnosis', 'prognosis', 'etiology',
'algorithm', 'neural network', 'machine learning', 'api', 'endpoint', 'database',
'quantum', 'thermodynamics', 'hypothesis', 'methodology']
return any(indicator in sentence.lower() for indicator in technical_indicators)
def _has_ambiguous_phrasing(self, sentence: str) -> bool:
"""
Check for ambiguous phrasing that might indicate human writing
"""
ambiguous_indicators = ['perhaps', 'maybe', 'possibly', 'likely', 'appears to', 'seems to', 'might be', 'could be']
return any(indicator in sentence.lower() for indicator in ambiguous_indicators)
def _has_complex_structure(self, sentence: str) -> bool:
"""
Check if sentence has complex linguistic structure
"""
words = sentence.split()
if (len(words) < 8):
return False
complex_indicators = ['which', 'that', 'although', 'because', 'while', 'when', 'if', 'however', 'therefore']
return any(indicator in sentence.lower() for indicator in complex_indicators)
def _has_emotional_language(self, sentence: str) -> bool:
"""
Check for emotional or subjective language
"""
emotional_indicators = ['feel', 'believe', 'think', 'wonder', 'hope', 'wish', 'love', 'hate', 'frustrating', 'exciting']
return any(indicator in sentence.lower() for indicator in emotional_indicators)
def _has_business_jargon(self, sentence: str) -> bool:
"""
Check for business jargon
"""
jargon_indicators = ['synergy', 'leverage', 'bandwidth', 'circle back', 'touch base', 'value add', 'core competency']
return any(indicator in sentence.lower() for indicator in jargon_indicators)
def _has_instructional_language(self, sentence: str) -> bool:
"""
Check for instructional language patterns
"""
instructional_indicators = ['step by step', 'firstly', 'secondly', 'finally', 'note that', 'remember to', 'make sure']
return any(indicator in sentence.lower() for indicator in instructional_indicators)
def _has_step_by_step_pattern(self, sentence: str) -> bool:
"""
Check for step-by-step instructions
"""
step_patterns = ['step 1', 'step 2', 'step 3', 'step one', 'step two', 'first step', 'next step']
return any(pattern in sentence.lower() for pattern in step_patterns)
def _has_examples(self, sentence: str) -> bool:
"""
Check for example indicators
"""
example_indicators = ['for example', 'for instance', 'such as', 'e.g.', 'as an example']
return any(indicator in sentence.lower() for indicator in example_indicators)
def _has_code_like_patterns(self, sentence: str) -> bool:
"""
Check for code-like patterns in technical domains
"""
code_patterns = ['function', 'variable', 'class', 'method', 'import', 'def ', 'void ', 'public ', 'private ']
return any(pattern in sentence for pattern in code_patterns)
def _analyze_sentence_complexity(self, sentence: str) -> float:
"""
Analyze sentence complexity (0 = simple, 1 = complex)
"""
words = sentence.split()
if (len(words) < 5):
return 0.2
complexity_indicators = ['although', 'because', 'while', 'when', 'if', 'since', 'unless', 'until', 'which', 'that', 'who', 'whom', 'whose', 'and', 'but', 'or', 'yet', 'so', 'however', 'therefore', 'moreover', 'furthermore', 'nevertheless', ',', ';', ':', '—']
score = 0.0
if (len(words) > 15):
score += 0.3
elif (len(words) > 25):
score += 0.5
indicator_count = sum(1 for indicator in complexity_indicators if indicator in sentence.lower())
score += min(0.5, indicator_count * 0.1)
clause_indicators = [',', ';', 'and', 'but', 'or', 'because', 'although']
clause_count = sum(1 for indicator in clause_indicators if indicator in sentence.lower())
score += min(0.2, clause_count * 0.05)
return min(1.0, score)
def _has_repetition(self, sentence: str) -> bool:
"""
Check if sentence has word repetition (common in Synthetic text)
"""
words = sentence.lower().split()
if (len(words) < 6):
return False
word_counts = dict()
for word in words:
if (len(word) > 3):
word_counts[word] = word_counts.get(word, 0) + 1
repeated_words = [word for word, count in word_counts.items() if count > 2]
return (len(repeated_words) > 0)
def generate_html(self, highlighted_sentences: List[HighlightedSentenceResult], include_legend: bool = True) -> str:
"""
Generate HTML with highlighted sentences
Arguments:
----------
highlighted_sentences { List[HighlightedSentenceResult] } : Sentences with highlighting data
include_legend { bool } : Whether to include legend
Returns:
--------
{ str } : HTML content
"""
html_parts = list()
# Add CSS
html_parts.append(self._generate_css())
# Include legend if requested
if include_legend:
html_parts.append(self._generate_legend_html())
# Add highlighted text container
html_parts.append('<div class="highlighted-text">')
for sent in highlighted_sentences:
extra_class = " hybrid-highlight" if sent.is_hybrid_content else ""
html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" '
f'data-synthetic-prob="{sent.synthetic_probability:.4f}" '
f'data-authentic-prob="{sent.authentic_probability:.4f}" '
f'data-hybrid-prob="{sent.hybrid_probability:.4f}" '
f'data-confidence="{sent.confidence:.4f}" '
f'data-confidence-level="{sent.confidence_level.value}" '
f'data-domain="{self.domain.value}" '
f'data-sentence-idx="{sent.index}" '
f'data-is-hybrid="{str(sent.is_hybrid_content).lower()}" '
f'title="{sent.tooltip}">'
f'{sent.text}'
f'</span> ')
html_parts.append('</div>')
return '\n'.join(html_parts)
def _generate_css(self) -> str:
    """
    Generate CSS for highlighting for better readability with 4 color types
    (authentic / uncertain / hybrid / synthetic).

    Returns:
    --------
    { str } : A <style> block to embed ahead of the highlighted text; the colour
              values mirror COLOR_THRESHOLDS.
    """
    return """
    <style>
        .highlighted-text {
            line-height: 1.8;
            font-size: 16px;
            font-family: 'Georgia', serif;
            padding: 20px;
            background: #ffffff;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }
        .highlight {
            padding: 2px 4px;
            margin: 0 1px;
            border-radius: 3px;
            transition: all 0.2s ease;
            cursor: help;
            border-bottom: 2px solid transparent;
            color: #000000 !important;
            font-weight: 500;
            position: relative;
        }
        .highlight:hover {
            transform: translateY(-1px);
            box-shadow: 0 4px 12px rgba(0,0,0,0.15);
            z-index: 10;
            text-shadow: 0 1px 1px rgba(255,255,255,0.8);
        }
        /* Authentic - Green tones */
        .authentic {
            background-color: #d1fae5;
            border-bottom-color: #10b981;
        }
        /* Uncertain - Yellow tones */
        .uncertain {
            background-color: #fef3c7;
            border-bottom-color: #f59e0b;
        }
        /* Hybrid - Purple tones */
        .hybrid {
            background-color: #e9d5ff;
            border-bottom-color: #a855f7;
        }
        .hybrid-highlight:hover {
            border: 2px dashed #a855f7;
        }
        /* Synthetic - Red tones */
        .synthetic {
            background-color: #fee2e2;
            border-bottom-color: #ef4444;
        }
    </style>
    """
def _generate_legend_html(self) -> str:
    """
    Generate legend HTML for the 4-category system.

    Returns:
    --------
    { str } : Self-contained (inline-styled) legend block matching the colours
              and probability bands in COLOR_THRESHOLDS.
    """
    return """
    <div class="highlight-legend" style="margin-bottom: 20px; padding: 15px; background: #f8fafc; border-radius: 8px; border: 1px solid #e2e8f0;">
        <h4 style="margin: 0 0 10px 0; font-size: 14px; font-weight: 600; color: #374151;">Text Analysis Legend</h4>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 8px;">
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #d1fae5; border: 1px solid #10b981; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Authentic (0-40% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #fef3c7; border: 1px solid #f59e0b; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Uncertain (40-60% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #e9d5ff; border: 1px solid #a855f7; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Hybrid (60-80% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #fee2e2; border: 1px solid #ef4444; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Synthetic (80-100% synthetic)</span>
            </div>
        </div>
    </div>
    """
# Export: the public API of this module is the highlighter class only
__all__ = ["TextHighlighter"]