# Text_Authenticator/services/highlighter.py
# (source page metadata: author satyaki-mitra, "Architecture updated", commit 44d0409)
# DEPENDENCIES
import re
from typing import List
from typing import Dict
from typing import Tuple
from loguru import logger
from typing import Optional
from config.enums import Domain
from config.schemas import MetricResult
from config.schemas import EnsembleResult
from processors.text_processor import TextProcessor
from config.threshold_config import ConfidenceLevel
from config.schemas import HighlightedSentenceResult
from config.threshold_config import MetricThresholds
from config.threshold_config import get_confidence_level
from services.ensemble_classifier import EnsembleClassifier
from config.threshold_config import get_threshold_for_domain
from config.threshold_config import get_active_metric_weights
class TextHighlighter:
    """
    Generates sentence-level highlighting with ensemble results integration.

    FEATURES:
        - Sentence-level highlighting with confidence scores
        - Domain-aware calibration
        - Ensemble-assisted probability aggregation
        - Hybrid content detection
        - Explainable tooltips
    """
    # Color thresholds - 4 categories. Each entry is:
    # (min_prob_inclusive, max_prob_exclusive, css_class, hex_color, base_tooltip).
    # The last band's upper bound is 1.01 so that a probability of exactly 1.0 still matches.
    COLOR_THRESHOLDS = [(0.00, 0.40, "authentic", "#d1fae5", "Likely authentically written"),   # Authentic: Synthetic probability < 0.4
                        (0.40, 0.60, "uncertain", "#fef3c7", "Uncertain authorship"),           # Uncertain: 0.4 ≤ Synthetic probability < 0.6
                        (0.60, 0.80, "hybrid", "#e9d5ff", "Mixed synthetic/authentic content"), # Hybrid: 0.6 ≤ Synthetic probability < 0.8 OR explicit hybrid detection
                        (0.80, 1.01, "synthetic", "#fee2e2", "Likely synthetically generated"), # Synthetic: Synthetic probability ≥ 0.8
                        ]
    # Hybrid detection threshold: minimum ensemble hybrid probability to classify as hybrid
    HYBRID_PROB_THRESHOLD = 0.25
def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
    """
    Set up the highlighter for a given domain, wiring in an ensemble classifier.

    Arguments:
    ----------
    domain              { Domain }             : Text domain for adaptive thresholding
    ensemble_classifier { EnsembleClassifier } : Optional ensemble for sentence-level analysis;
                                                 a default one is built when omitted
    """
    self.domain = domain
    self.domain_thresholds = get_threshold_for_domain(domain)
    self.text_processor = TextProcessor()
    # Fall back to a lazily-built default ensemble when none is supplied
    self.ensemble = ensemble_classifier or self._create_default_ensemble()
def _create_default_ensemble(self) -> EnsembleClassifier:
    """
    Build the default ensemble classifier, degrading to a simpler method pair on failure.
    """
    try:
        return EnsembleClassifier(primary_method = "confidence_calibrated", fallback_method = "domain_weighted")
    except Exception as exc:
        # Degraded mode: simpler aggregation strategies that should always construct
        logger.warning(f"Failed to create default ensemble: {exc}. Using fallback mode.")
        return EnsembleClassifier(primary_method = "domain_weighted", fallback_method = "simple_average")
def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult] = None,
                        enabled_metrics: Optional[Dict[str, bool]] = None, use_sentence_level: bool = True) -> List[HighlightedSentenceResult]:
    """
    Generate sentence-level highlights with ensemble integration.

    Arguments:
    ----------
    text               { str }            : Original text
    metric_results     { dict }           : Results from all metrics
    ensemble_result    { EnsembleResult } : Optional document-level ensemble result
    enabled_metrics    { dict }           : Dict of metric_name -> is_enabled
    use_sentence_level { bool }           : Whether to compute sentence-level probabilities

    Returns:
    --------
    { list } : List of HighlightedSentenceResult objects (one per sentence; never raises -
               failures degrade to fallback sentences or a single error sentence)
    """
    try:
        # Validate inputs: blank text is delegated to a dedicated handler
        if not text or not text.strip():
            return self._handle_empty_text(text = text,
                                           metric_results = metric_results,
                                           ensemble_result = ensemble_result,
                                           )
        # Get domain-appropriate weights for enabled metrics
        # (default: every metric that produced a result counts as enabled)
        if enabled_metrics is None:
            enabled_metrics = {name: True for name in metric_results.keys()}
        weights = get_active_metric_weights(self.domain, enabled_metrics)
        # Split text into sentences with error handling
        sentences = self._split_sentences_with_fallback(text = text)
        if not sentences:
            return self._handle_no_sentences(text, metric_results, ensemble_result)
        # Calculate probabilities for each sentence using ENSEMBLE METHODS
        highlighted_sentences = list()
        for idx, sentence in enumerate(sentences):
            try:
                if use_sentence_level:
                    # Use ensemble for sentence-level analysis
                    synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._calculate_sentence_ensemble_probability(sentence = sentence,
                                                                                                                                      metric_results = metric_results,
                                                                                                                                      weights = weights,
                                                                                                                                      ensemble_result = ensemble_result,
                                                                                                                                      )
                else:
                    # Use document-level ensemble probabilities (same values for every sentence)
                    synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._get_document_ensemble_probability(ensemble_result = ensemble_result,
                                                                                                                                metric_results = metric_results,
                                                                                                                                weights = weights,
                                                                                                                                )
                # Apply domain-specific adjustments with limits (bounded to +/-0.3 of the raw value)
                synthetic_prob = self._apply_domain_specific_adjustments(sentence = sentence,
                                                                         synthetic_prob = synthetic_prob,
                                                                         sentence_length = len(sentence.split()),
                                                                         )
                # Determine if this is hybrid content
                is_hybrid_content = self._is_hybrid_content(synthetic_prob = synthetic_prob,
                                                            hybrid_prob = hybrid_prob,
                                                            confidence = confidence,
                                                            )
                # Get confidence level
                confidence_level = get_confidence_level(confidence)
                # Get color class (hybrid flag overrides the plain probability bands)
                color_class, color_hex, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
                                                                                      is_hybrid_content = is_hybrid_content,
                                                                                      hybrid_prob = hybrid_prob,
                                                                                      )
                # Generate enhanced tooltip
                tooltip = self._generate_ensemble_tooltip(sentence = sentence,
                                                          synthetic_prob = synthetic_prob,
                                                          authentic_prob = authentic_prob,
                                                          hybrid_prob = hybrid_prob,
                                                          confidence = confidence,
                                                          confidence_level = confidence_level,
                                                          tooltip_base = tooltip_base,
                                                          breakdown = breakdown,
                                                          is_hybrid_content = is_hybrid_content,
                                                          )
                highlighted_sentences.append(HighlightedSentenceResult(text = sentence,
                                                                       synthetic_probability = synthetic_prob,
                                                                       authentic_probability = authentic_prob,
                                                                       hybrid_probability = hybrid_prob,
                                                                       confidence = confidence,
                                                                       confidence_level = confidence_level,
                                                                       color_class = color_class,
                                                                       tooltip = tooltip,
                                                                       index = idx,
                                                                       is_hybrid_content = is_hybrid_content,
                                                                       metric_breakdown = breakdown,
                                                                       )
                                             )
            except Exception as e:
                # Per-sentence failures degrade to a neutral fallback instead of aborting the run
                logger.warning(f"Failed to process sentence {idx}: {e}")
                # Add fallback sentence
                highlighted_sentences.append(self._create_fallback_sentence(sentence, idx))
        return highlighted_sentences
    except Exception as e:
        # Catastrophic failure: emit a single low-confidence error sentence
        logger.error(f"Highlight generation failed: {e}")
        return self._create_error_fallback(text, metric_results)
def _handle_empty_text(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
    """
    Produce a single placeholder highlight when the input text is blank.

    If a document-level ensemble result is available its probabilities seed the
    placeholder; otherwise a neutral fallback is used.
    """
    if not ensemble_result:
        return [self._create_fallback_sentence("No text content", 0)]
    placeholder = self._create_fallback_sentence(text = "No text content",
                                                 index = 0,
                                                 synthetic_prob = ensemble_result.synthetic_probability,
                                                 authentic_prob = ensemble_result.authentic_probability)
    return [placeholder]
def _handle_no_sentences(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
    """
    Handle the case where no sentences could be extracted from the text.
    """
    stripped = text.strip() if text else ""
    if stripped:
        # Treat the entire text as a single sentence
        return [self._create_fallback_sentence(stripped, 0)]
    return [self._create_fallback_sentence("No processable content", 0)]
def _create_fallback_sentence(self, text: str, index: int, synthetic_prob: float = 0.5, authentic_prob: float = 0.5) -> HighlightedSentenceResult:
    """
    Build a neutral placeholder result for a sentence whose processing failed.
    """
    # Low fixed confidence signals that this entry carries no real evidence
    fallback_confidence = 0.3
    color_class, _, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
                                                                   is_hybrid_content = False,
                                                                   hybrid_prob = 0.0)
    return HighlightedSentenceResult(text = text,
                                     synthetic_probability = synthetic_prob,
                                     authentic_probability = authentic_prob,
                                     hybrid_probability = 0.0,
                                     confidence = fallback_confidence,
                                     confidence_level = get_confidence_level(fallback_confidence),
                                     color_class = color_class,
                                     tooltip = f"Fallback: {tooltip_base}\nProcessing failed for this sentence",
                                     index = index,
                                     is_hybrid_content = False,
                                     metric_breakdown = {"fallback": synthetic_prob})
def _create_error_fallback(self, text: str, metric_results: Dict[str, MetricResult]) -> List[HighlightedSentenceResult]:
    """
    Last-resort single-sentence result used when highlight generation fails entirely.
    """
    # Truncate long inputs so the error entry stays readable
    preview = f"{text[:100]}..." if len(text) > 100 else text
    error_sentence = HighlightedSentenceResult(text = preview,
                                               synthetic_probability = 0.5,
                                               authentic_probability = 0.5,
                                               hybrid_probability = 0.0,
                                               confidence = 0.1,
                                               confidence_level = get_confidence_level(0.1),
                                               color_class = "uncertain",
                                               tooltip = "Error in text processing",
                                               index = 0,
                                               is_hybrid_content = False,
                                               metric_breakdown = {"error": 0.5})
    return [error_sentence]
def _split_sentences_with_fallback(self, text: str) -> List[str]:
"""
Split text into sentences with comprehensive fallback handling
"""
try:
sentences = self.text_processor.split_sentences(text)
filtered_sentences = [s.strip() for s in sentences if len(s.strip()) >= 3]
if filtered_sentences:
return filtered_sentences
# Fallback: split by common sentence endings
fallback_sentences = re.split(r'[.!?]+', text)
fallback_sentences = [s.strip() for s in fallback_sentences if len(s.strip()) >= 3]
if fallback_sentences:
return fallback_sentences
# Ultimate fallback: treat as single sentence if meaningful
if text.strip():
return [text.strip()]
return []
except Exception as e:
logger.warning(f"Sentence splitting failed, using fallback: {e}")
# Return text as single sentence
return [text] if text.strip() else []
def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult], weights: Dict[str, float], ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Calculate sentence probabilities using ensemble methods with domain calibration.

    Returns a 5-tuple: (synthetic_prob, authentic_prob, hybrid_prob, confidence,
    per-metric breakdown dict).
    """
    sentence_length = len(sentence.split())
    # Handling very short sentences – do not force neutral, but reduce confidence.
    # The first enabled, error-free metric's document-level probability is reused as-is.
    if (sentence_length < 3):
        base_synthetic_prob = 0.5
        base_confidence = 0.2
        breakdown = {"short_sentence": base_synthetic_prob}
        for name, result in metric_results.items():
            if (result.error is None and weights.get(name, 0.0) > 0):
                base_synthetic_prob = result.synthetic_probability
                breakdown[name] = base_synthetic_prob
                break
        return (base_synthetic_prob,
                1.0 - base_synthetic_prob,
                0.0,
                base_confidence,
                breakdown
                )
    # Build sentence-level metric results (skipping metrics that errored at document level)
    sentence_metric_results = dict()
    breakdown = dict()
    for name, doc_result in metric_results.items():
        if doc_result.error is not None:
            continue
        try:
            sentence_prob = self._compute_sentence_metric(metric_name = name,
                                                          sentence = sentence,
                                                          result = doc_result,
                                                          weight = weights.get(name, 0.0),
                                                          )
            sentence_metric_results[name] = self._create_sentence_metric_result(metric_name = name,
                                                                                synthetic_prob = sentence_prob,
                                                                                doc_result = doc_result,
                                                                                sentence_length = sentence_length,
                                                                                )
            breakdown[name] = sentence_prob
        except Exception as e:
            # A failing metric falls back to its document-level probability in the breakdown
            logger.warning(f"Metric {name} failed for sentence: {e}")
            breakdown[name] = doc_result.synthetic_probability
    # Ensemble aggregation (PRIMARY PATH)
    if sentence_metric_results:
        try:
            ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                             domain = self.domain,
                                                             )
            return (ensemble_sentence_result.synthetic_probability,
                    ensemble_sentence_result.authentic_probability,
                    ensemble_sentence_result.hybrid_probability,
                    ensemble_sentence_result.overall_confidence,
                    breakdown,
                    )
        except Exception as e:
            logger.warning(f"Sentence ensemble failed: {e}")
    # Fallback: weighted average aggregation over the document-level metric results
    return self._fallback_weighted_probability(metric_results, weights, breakdown)
def _compute_sentence_metric(self, metric_name: str, sentence: str, result: MetricResult, weight: float) -> float:
    """
    Project a document-level metric probability onto a single sentence
    using domain-specific thresholds.
    """
    # No thresholds configured for this metric in this domain:
    # pass the document-level probability through unchanged
    metric_thresholds = getattr(self.domain_thresholds, metric_name, None)
    if not metric_thresholds:
        return result.synthetic_probability
    # Otherwise nudge the document-level probability with sentence-level heuristics
    return self._apply_metric_specific_adjustments(metric_name = metric_name,
                                                   sentence = sentence,
                                                   base_prob = result.synthetic_probability,
                                                   sentence_length = len(sentence.split()),
                                                   thresholds = metric_thresholds)
def _create_sentence_metric_result(self, metric_name: str, synthetic_prob: float, doc_result: MetricResult, sentence_length: int) -> MetricResult:
    """
    Derive a sentence-scoped MetricResult from a document-level one.
    """
    # Confidence is discounted relative to the document-level value
    scoped_confidence = self._calculate_sentence_confidence(doc_confidence = doc_result.confidence,
                                                            sentence_length = sentence_length)
    return MetricResult(metric_name = metric_name,
                        synthetic_probability = synthetic_prob,
                        authentic_probability = 1.0 - synthetic_prob,
                        hybrid_probability = 0.0,
                        confidence = scoped_confidence,
                        details = doc_result.details,
                        error = None)
def _calculate_sentence_confidence(self, doc_confidence: float, sentence_length: int) -> float:
"""
Calculate confidence for sentence-level analysis with length consideration
"""
base_reduction = 0.8
# Scale confidence penalty with sentence length
length_penalty = max(0.3, min(1.0, sentence_length / 12.0)) # Normalize around 12 words
return max(0.1, doc_confidence * base_reduction * length_penalty)
def _fallback_weighted_probability(self, metric_results: Dict[str, MetricResult], weights: Dict[str, float], breakdown: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Weighted-average aggregation used when the ensemble path is unavailable.

    Only error-free metrics with a strictly positive weight contribute.
    Returns neutral values (0.5/0.5) when nothing usable remains.
    """
    synthetic_sum = 0.0
    authentic_sum = 0.0
    confidence_values = []
    total_weight = 0.0
    for name, result in metric_results.items():
        if result.error is not None:
            continue
        weight = weights.get(name, 0.0)
        if weight <= 0:
            continue
        synthetic_sum += result.synthetic_probability * weight
        authentic_sum += result.authentic_probability * weight
        confidence_values.append(result.confidence)
        total_weight += weight
    if total_weight == 0:
        # No usable metric contributed: fall back to neutral probabilities
        return 0.5, 0.5, 0.0, 0.5, breakdown or {}
    avg_confidence = sum(confidence_values) / len(confidence_values) if confidence_values else 0.5
    # hybrid probability is always 0.0 on this fallback path
    return synthetic_sum / total_weight, authentic_sum / total_weight, 0.0, avg_confidence, breakdown
def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult], metric_results: Dict[str, MetricResult], weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
    """
    Reuse the document-level ensemble result, or aggregate metrics when it is absent.
    """
    if not ensemble_result:
        # No precomputed ensemble result: aggregate directly from the metrics
        return self._fallback_weighted_probability(metric_results, weights, {})
    per_metric = {name: result.synthetic_probability for name, result in metric_results.items()}
    return (ensemble_result.synthetic_probability,
            ensemble_result.authentic_probability,
            ensemble_result.hybrid_probability,
            ensemble_result.overall_confidence,
            per_metric)
def _apply_domain_specific_adjustments(self, sentence: str, synthetic_prob: float, sentence_length: int) -> float:
    """
    Apply domain-specific adjustments to Synthetic probability with limits.

    Heuristic multipliers (>1 pushes towards synthetic, <1 towards authentic) are
    collected per domain, the two strongest are applied, and the final value may
    move at most 0.3 away from the incoming probability (then clamped to [0, 1]).
    """
    original_prob = synthetic_prob
    adjustments = list()
    sentence_lower = sentence.lower()
    # Technical & AI/ML domains
    # NOTE: each elif chain below appends at most ONE multiplier per domain branch,
    # so the "strongest 2" cap further down rarely binds as written.
    if self.domain in [Domain.AI_ML, Domain.SOFTWARE_DEV, Domain.TECHNICAL_DOC, Domain.ENGINEERING, Domain.SCIENCE]:
        if self._has_technical_terms(sentence_lower):
            adjustments.append(1.1)
        elif self._has_code_like_patterns(sentence):
            adjustments.append(1.15)
        elif (sentence_length > 35):
            adjustments.append(1.05)
    # Creative & informal domains (informal/emotional/short text reads more human)
    elif self.domain in [Domain.CREATIVE, Domain.SOCIAL_MEDIA, Domain.BLOG_PERSONAL]:
        if self._has_informal_language(sentence_lower):
            adjustments.append(0.7)
        elif self._has_emotional_language(sentence):
            adjustments.append(0.8)
        elif sentence_length < 10:
            adjustments.append(0.8)
    # Academic & formal domains (citations read human; dense technical prose less so)
    elif self.domain in [Domain.ACADEMIC, Domain.LEGAL, Domain.MEDICAL]:
        if self._has_citation_patterns(sentence):
            adjustments.append(0.8)
        elif self._has_technical_terms(sentence_lower):
            adjustments.append(1.1)
        elif (sentence_length > 40):
            adjustments.append(1.1)
    # Business & professional domains
    elif self.domain in [Domain.BUSINESS, Domain.MARKETING, Domain.JOURNALISM]:
        if self._has_business_jargon(sentence_lower):
            adjustments.append(1.05)
        elif self._has_ambiguous_phrasing(sentence_lower):
            adjustments.append(0.9)
        elif (15 <= sentence_length <= 25):
            adjustments.append(0.9)
    # Tutorial & educational domains
    elif (self.domain == Domain.TUTORIAL):
        if self._has_instructional_language(sentence_lower):
            adjustments.append(0.85)
        elif self._has_step_by_step_pattern(sentence):
            adjustments.append(0.8)
        elif self._has_examples(sentence):
            adjustments.append(0.9)
    # General domain - minimal adjustments
    elif (self.domain == Domain.GENERAL):
        if self._has_complex_structure(sentence):
            adjustments.append(0.9)
        elif self._has_repetition(sentence):
            adjustments.append(1.1)
    # Apply adjustments with limits - take strongest 2 adjustments maximum
    if adjustments:
        # Sort by impact (farthest from 1.0)
        adjustments.sort(key = lambda x: abs(x - 1.0), reverse = True)
        # Limit to 2 strongest
        strongest_adjustments = adjustments[:2]
        for adjustment in strongest_adjustments:
            synthetic_prob *= adjustment
    # Ensure probability stays within bounds and doesn't change too drastically
    max_change = 0.3  # Maximum 0.3 absolute change from the incoming probability
    bounded_prob = max(original_prob - max_change, min(original_prob + max_change, synthetic_prob))
    return max(0.0, min(1.0, bounded_prob))
def _apply_metric_specific_adjustments(self, metric_name: str, sentence: str, base_prob: float, sentence_length: int, thresholds: MetricThresholds) -> float:
    """
    Nudge a metric's base probability with cheap sentence-level heuristics.

    All multipliers are clamped into [0.0, 1.0]; unrecognised metric names
    pass the base probability through unchanged.
    NOTE(review): `thresholds` is currently unused here - kept for interface stability.
    """
    if metric_name == "perplexity":
        # Very short sentences look more synthetic; very long ones less so
        if sentence_length < 8:
            return min(1.0, base_prob * 1.2)
        if sentence_length > 25:
            return max(0.0, base_prob * 0.8)
    elif metric_name == "entropy":
        tokens = sentence.split()
        if len(tokens) > 3:
            # Lexical diversity: low diversity reads synthetic, high reads authentic
            diversity = len(set(tokens)) / len(tokens)
            if diversity < 0.6:
                return min(1.0, base_prob * 1.2)
            if diversity > 0.8:
                return max(0.0, base_prob * 0.8)
    elif metric_name == "linguistic":
        complexity = self._analyze_sentence_complexity(sentence)
        if complexity < 0.3:
            return min(1.0, base_prob * 1.1)
        if complexity > 0.7:
            return max(0.0, base_prob * 0.9)
    elif metric_name == "structural":
        # Extreme lengths are down-weighted; mid-length sentences up-weighted
        if sentence_length < 5 or sentence_length > 40:
            return max(0.0, base_prob * 0.8)
        if 8 <= sentence_length <= 20:
            return min(1.0, base_prob * 1.1)
    elif metric_name == "semantic_analysis":
        if self._has_repetition(sentence):
            return min(1.0, base_prob * 1.2)
    elif metric_name == "multi_perturbation_stability":
        # MultiPerturbationStability adjustment for sentence level
        if sentence_length > 15:
            return min(1.0, base_prob * 1.1)
    return base_prob
def _is_hybrid_content(self, synthetic_prob: float, hybrid_prob: float, confidence: float) -> bool:
"""
Determine if content should be classified as hybrid
"""
# Case 1: Explicit high hybrid probability from ensemble
if (hybrid_prob > self.HYBRID_PROB_THRESHOLD):
return True
# Case 2: High uncertainty combined with ambiguous synthetic probability
if (confidence < 0.3 and 0.4 <= synthetic_prob <= 0.7):
return True
# Case 3: Synthetic probability in hybrid range (0.6-0.8)
if (0.6 <= synthetic_prob < 0.8):
return True
return False
def _get_color_for_probability(self, synthetic_prob: float, is_hybrid_content: bool = False, hybrid_prob: float = 0.0) -> Tuple[str, str, str]:
"""
Get color class with simplified 4-category system
"""
# Handle hybrid content first
if is_hybrid_content:
return "hybrid", "#e9d5ff", f"Mixed synthetic/authentic content ({hybrid_prob:.1%} hybrid)"
# Iterate through simplified thresholds
for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
if (min_thresh <= synthetic_prob < max_thresh):
return color_class, color_hex, tooltip
# Fallback for edge cases
return "uncertain", "#fef3c7", "Uncertain authorship"
def _generate_ensemble_tooltip(self, sentence: str, synthetic_prob: float, authentic_prob: float, hybrid_prob: float, confidence: float, confidence_level: ConfidenceLevel,
                               tooltip_base: str, breakdown: Optional[Dict[str, float]] = None, is_hybrid_content: bool = False) -> str:
    """
    Assemble the hover tooltip shown for a highlighted sentence, including
    ensemble probabilities and an optional per-metric breakdown.
    """
    lines = [tooltip_base]
    if is_hybrid_content:
        lines.append("🔀 HYBRID CONTENT DETECTED")
    lines.append(f"Synthetic Probability: {synthetic_prob:.1%}")
    lines.append(f"Authentic Probability: {authentic_prob:.1%}")
    lines.append(f"Hybrid Probability: {hybrid_prob:.1%}")
    lines.append(f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})")
    lines.append(f"Domain: {self.domain.value.replace('_', ' ').title()}")
    lines.append(f"Length: {len(sentence.split())} words")
    tooltip = "\n".join(lines)
    if breakdown:
        # Only the first four metrics are surfaced, to keep the tooltip compact
        bullet_lines = [f"• {metric}: {prob:.1%}" for metric, prob in list(breakdown.items())[:4]]
        tooltip += "\n\nMetric Breakdown:\n" + "\n".join(bullet_lines)
        # NOTE(review): the ensemble-method footer is emitted only when a breakdown
        # exists - confirm against the original formatting intent
        tooltip += f"\n\nEnsemble Method: {getattr(self.ensemble, 'primary_method', 'fallback')}"
    return tooltip
def _has_citation_patterns(self, sentence: str) -> bool:
"""
Check for academic citation patterns
"""
citation_indicators = ['et al.', 'ibid.', 'cf.', 'e.g.', 'i.e.', 'vol.', 'pp.', 'ed.', 'trans.', 'reference', 'cited', 'according to']
return any(indicator in sentence.lower() for indicator in citation_indicators)
def _has_informal_language(self, sentence: str) -> bool:
"""
Check for informal language patterns
"""
informal_indicators = ['lol', 'omg', 'btw', 'imo', 'tbh', 'afaik', 'smh', '👋', '😂', '❤️', 'haha', 'wow', 'awesome']
return any(indicator in sentence.lower() for indicator in informal_indicators)
def _has_technical_terms(self, sentence: str) -> bool:
"""
Check for domain-specific technical terms
"""
technical_indicators = ['hereinafter', 'whereas', 'aforementioned', 'diagnosis', 'prognosis', 'etiology',
'algorithm', 'neural network', 'machine learning', 'api', 'endpoint', 'database',
'quantum', 'thermodynamics', 'hypothesis', 'methodology']
return any(indicator in sentence.lower() for indicator in technical_indicators)
def _has_ambiguous_phrasing(self, sentence: str) -> bool:
"""
Check for ambiguous phrasing that might indicate human writing
"""
ambiguous_indicators = ['perhaps', 'maybe', 'possibly', 'likely', 'appears to', 'seems to', 'might be', 'could be']
return any(indicator in sentence.lower() for indicator in ambiguous_indicators)
def _has_complex_structure(self, sentence: str) -> bool:
"""
Check if sentence has complex linguistic structure
"""
words = sentence.split()
if (len(words) < 8):
return False
complex_indicators = ['which', 'that', 'although', 'because', 'while', 'when', 'if', 'however', 'therefore']
return any(indicator in sentence.lower() for indicator in complex_indicators)
def _has_emotional_language(self, sentence: str) -> bool:
"""
Check for emotional or subjective language
"""
emotional_indicators = ['feel', 'believe', 'think', 'wonder', 'hope', 'wish', 'love', 'hate', 'frustrating', 'exciting']
return any(indicator in sentence.lower() for indicator in emotional_indicators)
def _has_business_jargon(self, sentence: str) -> bool:
"""
Check for business jargon
"""
jargon_indicators = ['synergy', 'leverage', 'bandwidth', 'circle back', 'touch base', 'value add', 'core competency']
return any(indicator in sentence.lower() for indicator in jargon_indicators)
def _has_instructional_language(self, sentence: str) -> bool:
"""
Check for instructional language patterns
"""
instructional_indicators = ['step by step', 'firstly', 'secondly', 'finally', 'note that', 'remember to', 'make sure']
return any(indicator in sentence.lower() for indicator in instructional_indicators)
def _has_step_by_step_pattern(self, sentence: str) -> bool:
"""
Check for step-by-step instructions
"""
step_patterns = ['step 1', 'step 2', 'step 3', 'step one', 'step two', 'first step', 'next step']
return any(pattern in sentence.lower() for pattern in step_patterns)
def _has_examples(self, sentence: str) -> bool:
"""
Check for example indicators
"""
example_indicators = ['for example', 'for instance', 'such as', 'e.g.', 'as an example']
return any(indicator in sentence.lower() for indicator in example_indicators)
def _has_code_like_patterns(self, sentence: str) -> bool:
"""
Check for code-like patterns in technical domains
"""
code_patterns = ['function', 'variable', 'class', 'method', 'import', 'def ', 'void ', 'public ', 'private ']
return any(pattern in sentence for pattern in code_patterns)
def _analyze_sentence_complexity(self, sentence: str) -> float:
"""
Analyze sentence complexity (0 = simple, 1 = complex)
"""
words = sentence.split()
if (len(words) < 5):
return 0.2
complexity_indicators = ['although', 'because', 'while', 'when', 'if', 'since', 'unless', 'until', 'which', 'that', 'who', 'whom', 'whose', 'and', 'but', 'or', 'yet', 'so', 'however', 'therefore', 'moreover', 'furthermore', 'nevertheless', ',', ';', ':', '—']
score = 0.0
if (len(words) > 15):
score += 0.3
elif (len(words) > 25):
score += 0.5
indicator_count = sum(1 for indicator in complexity_indicators if indicator in sentence.lower())
score += min(0.5, indicator_count * 0.1)
clause_indicators = [',', ';', 'and', 'but', 'or', 'because', 'although']
clause_count = sum(1 for indicator in clause_indicators if indicator in sentence.lower())
score += min(0.2, clause_count * 0.05)
return min(1.0, score)
def _has_repetition(self, sentence: str) -> bool:
"""
Check if sentence has word repetition (common in Synthetic text)
"""
words = sentence.lower().split()
if (len(words) < 6):
return False
word_counts = dict()
for word in words:
if (len(word) > 3):
word_counts[word] = word_counts.get(word, 0) + 1
repeated_words = [word for word, count in word_counts.items() if count > 2]
return (len(repeated_words) > 0)
def generate_html(self, highlighted_sentences: List[HighlightedSentenceResult], include_legend: bool = True) -> str:
"""
Generate HTML with highlighted sentences
Arguments:
----------
highlighted_sentences { List[HighlightedSentenceResult] } : Sentences with highlighting data
include_legend { bool } : Whether to include legend
Returns:
--------
{ str } : HTML content
"""
html_parts = list()
# Add CSS
html_parts.append(self._generate_css())
# Include legend if requested
if include_legend:
html_parts.append(self._generate_legend_html())
# Add highlighted text container
html_parts.append('<div class="highlighted-text">')
for sent in highlighted_sentences:
extra_class = " hybrid-highlight" if sent.is_hybrid_content else ""
html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" '
f'data-synthetic-prob="{sent.synthetic_probability:.4f}" '
f'data-authentic-prob="{sent.authentic_probability:.4f}" '
f'data-hybrid-prob="{sent.hybrid_probability:.4f}" '
f'data-confidence="{sent.confidence:.4f}" '
f'data-confidence-level="{sent.confidence_level.value}" '
f'data-domain="{self.domain.value}" '
f'data-sentence-idx="{sent.index}" '
f'data-is-hybrid="{str(sent.is_hybrid_content).lower()}" '
f'title="{sent.tooltip}">'
f'{sent.text}'
f'</span> ')
html_parts.append('</div>')
return '\n'.join(html_parts)
def _generate_css(self) -> str:
    """
    Generate CSS for highlighting for better readability with 4 color types
    (authentic / uncertain / hybrid / synthetic).

    Returns:
    --------
    { str } : A <style> block to embed ahead of the highlighted text; the colour
              values mirror COLOR_THRESHOLDS.
    """
    return """
    <style>
        .highlighted-text {
            line-height: 1.8;
            font-size: 16px;
            font-family: 'Georgia', serif;
            padding: 20px;
            background: #ffffff;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }
        .highlight {
            padding: 2px 4px;
            margin: 0 1px;
            border-radius: 3px;
            transition: all 0.2s ease;
            cursor: help;
            border-bottom: 2px solid transparent;
            color: #000000 !important;
            font-weight: 500;
            position: relative;
        }
        .highlight:hover {
            transform: translateY(-1px);
            box-shadow: 0 4px 12px rgba(0,0,0,0.15);
            z-index: 10;
            text-shadow: 0 1px 1px rgba(255,255,255,0.8);
        }
        /* Authentic - Green tones */
        .authentic {
            background-color: #d1fae5;
            border-bottom-color: #10b981;
        }
        /* Uncertain - Yellow tones */
        .uncertain {
            background-color: #fef3c7;
            border-bottom-color: #f59e0b;
        }
        /* Hybrid - Purple tones */
        .hybrid {
            background-color: #e9d5ff;
            border-bottom-color: #a855f7;
        }
        .hybrid-highlight:hover {
            border: 2px dashed #a855f7;
        }
        /* Synthetic - Red tones */
        .synthetic {
            background-color: #fee2e2;
            border-bottom-color: #ef4444;
        }
    </style>
    """
def _generate_legend_html(self) -> str:
    """
    Generate legend HTML for the 4-category system.

    Returns:
    --------
    { str } : Self-contained (inline-styled) legend block matching the colours
              and probability bands in COLOR_THRESHOLDS.
    """
    return """
    <div class="highlight-legend" style="margin-bottom: 20px; padding: 15px; background: #f8fafc; border-radius: 8px; border: 1px solid #e2e8f0;">
        <h4 style="margin: 0 0 10px 0; font-size: 14px; font-weight: 600; color: #374151;">Text Analysis Legend</h4>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 8px;">
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #d1fae5; border: 1px solid #10b981; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Authentic (0-40% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #fef3c7; border: 1px solid #f59e0b; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Uncertain (40-60% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #e9d5ff; border: 1px solid #a855f7; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Hybrid (60-80% synthetic)</span>
            </div>
            <div style="display: flex; align-items: center; gap: 8px;">
                <div style="width: 16px; height: 16px; background: #fee2e2; border: 1px solid #ef4444; border-radius: 3px;"></div>
                <span style="font-size: 12px; color: #374151;">Synthetic (80-100% synthetic)</span>
            </div>
        </div>
    </div>
    """
# Export: the public API of this module is the highlighter class only
__all__ = ["TextHighlighter"]