import numpy as np


class BasicExplainer:
    """Build a markdown-friendly text explanation for one detector decision.

    The explanation combines the classifier output (probability and label),
    optional out-of-distribution information, up to three threshold-based
    forensic cues (``noiseprint_mismatch``, ``residual_energy_p95``,
    ``fft_peakiness``) and optional per-feature contributions.
    """

    def __init__(self, thresholds=None, triage_conf_threshold=0.8, enable_triage=True):
        """
        Args:
            thresholds (dict): e.g.
                {
                  "noiseprint_mismatch": 2.5,
                  "residual_energy_p95": 0.08,
                  "fft_peakiness": 3.0
                }
                A cue is only evaluated when its key is present both here
                and in the sample's feature dict.
            triage_conf_threshold (float): minimum confidence to avoid marking
                a conflicted case as UNCERTAIN.
            enable_triage (bool): if True, mark conflicted low-confidence cases
                as UNCERTAIN in the narrative.
        """
        self.thresholds = thresholds or {}
        self.triage_conf_threshold = triage_conf_threshold
        self.enable_triage = enable_triage

    @staticmethod
    def _first_scalar(values):
        """Return the first element of a scalar or array-like, or None if absent/empty."""
        if values is None:
            return None
        # ravel() makes 0-d (plain scalar) inputs indexable; plain [0] would raise.
        flat = np.asarray(values).ravel()
        if flat.size == 0:
            return None
        return flat[0]

    def _cue(self, features, name):
        """Return (value, threshold) as floats if both are available, else (None, None)."""
        if name in features and name in self.thresholds:
            return float(features[name]), float(self.thresholds[name])
        return None, None

    def explain(self, features, proba, prediction_label, ood_status=None,
                contributions=None, top_k_contributions=3):
        """
        Generate a text explanation.

        Args:
            features (dict): Feature dictionary for a single sample.
            proba (float): Probability of being fake (class 1).
            prediction_label (int): 0 (real) or 1 (fake).
            ood_status (dict, optional): output of
                SimpleClassifier.predict_uncertainty for this single sample, e.g.
                {
                  'probs': [p],
                  'dist_real': [..],
                  'dist_fake': [..],
                  'dist_min': [..],
                  'is_ood': [..]
                }
                Only 'is_ood', 'dist_real' and 'dist_fake' are read; each may
                be an array-like (first element is used) or a plain scalar.
            contributions (dict, optional): local feature contributions where
                positive values push toward FAKE and negative toward REAL.
            top_k_contributions (int): number of top-magnitude contributions
                to consider. At least 8 are always considered, and at most
                8 FAKE-pushing and 5 REAL-pushing entries are displayed.

        Returns:
            str: Explanation text (markdown-friendly).
        """
        explanation_parts = []

        # -------------------- OOD detection handling --------------------
        is_ood = False
        dist_real = None
        dist_fake = None

        if ood_status is not None:
            ood_flag = self._first_scalar(ood_status.get('is_ood'))
            if ood_flag is not None:
                is_ood = bool(ood_flag)

            first_real = self._first_scalar(ood_status.get('dist_real'))
            first_fake = self._first_scalar(ood_status.get('dist_fake'))
            if first_real is not None:
                dist_real = float(first_real)
            if first_fake is not None:
                dist_fake = float(first_fake)

        if is_ood:
            explanation_parts.append("⚠️ **UNCERTAIN / POTENTIALLY OUT-OF-DISTRIBUTION**")
            if dist_real is not None and dist_fake is not None:
                explanation_parts.append(
                    f"The feature vector lies far from both Real and Fake training clusters "
                    f"(dist_real={dist_real:.1f}, dist_fake={dist_fake:.1f}). "
                    f"Note: OOD detection cannot be validated without proper evaluation data."
                )
            explanation_parts.append(
                "The decision below should be treated with caution.\n"
            )

        # -------------------- Confidence / base label --------------------
        # proba is P(fake); P(real) = 1 - proba
        if prediction_label == 1:
            confidence = proba
            base_label_str = "FAKE"
        else:
            confidence = 1.0 - proba
            base_label_str = "REAL"

        if confidence > 0.8:
            confidence_str = "high"
        elif confidence > 0.6:
            confidence_str = "moderate"
        else:
            confidence_str = "low"

        # -------------------- Forensic cues: collect support --------------------
        supports_fake = 0
        supports_real = 0

        # Noiseprint mismatch: high mismatch ⇒ evidence for FAKE, low ⇒ evidence for REAL
        nm, thr_nm = self._cue(features, 'noiseprint_mismatch')
        if nm is not None:
            if nm > thr_nm:
                supports_fake += 1
            else:
                supports_real += 1

        # Residual energy p95: high residual energy ⇒ evidence for FAKE
        res_p95, thr_res = self._cue(features, 'residual_energy_p95')
        if res_p95 is not None:
            if res_p95 > thr_res:
                supports_fake += 1
            else:
                supports_real += 1

        # FFT peakiness: high peakiness ⇒ evidence for FAKE; otherwise neutral/weak
        fp, thr_fp = self._cue(features, 'fft_peakiness')
        if fp is not None and fp > thr_fp:
            supports_fake += 1

        conflict = (supports_fake > 0 and supports_real > 0)

        # -------------------- Suspiciously clean detection --------------------
        # If no cue supports FAKE, at least two cues support REAL and the
        # prediction is REAL without near-certain confidence, this could
        # indicate a modern generator that evades the forensic cues.
        suspiciously_clean = (
            supports_fake == 0
            and supports_real >= 2
            and prediction_label == 0
            and confidence < 0.98
        )

        # -------------------- Triage decision (narrative only) --------------------
        triage_label = base_label_str
        if self.enable_triage and conflict and confidence < self.triage_conf_threshold:
            triage_label = "UNCERTAIN"
        elif self.enable_triage and suspiciously_clean and confidence < 0.95:
            # Modern generators like Flux may evade all forensic cues
            triage_label = "UNCERTAIN"

        # Intro sentence
        if triage_label == "UNCERTAIN" and suspiciously_clean:
            explanation_parts.append(
                f"⚠️ **CAUTION**: The detector predicts this image is **{base_label_str}** "
                f"with {confidence_str} confidence ({confidence:.2f}), "
                f"but ALL forensic cues are below threshold. This could indicate a modern generator "
                f"(like Flux, DALL-E 3, or Midjourney v6) that evades traditional forensic detection. "
                f"**Manual review recommended.**"
            )
        elif triage_label == "UNCERTAIN":
            explanation_parts.append(
                f"The detector predicts this image is **{base_label_str}** "
                f"with {confidence_str} confidence ({confidence:.2f}), "
                f"but forensic cues conflict, so the case is marked **UNCERTAIN**."
            )
        else:
            explanation_parts.append(
                f"The model predicts this image is **{base_label_str}** "
                f"with {confidence_str} confidence ({confidence:.2f})."
            )

        # -------------------- Detailed cue explanations --------------------
        cues_used = 0

        # Noiseprint mismatch explanation
        if nm is not None and thr_nm is not None:
            if nm > thr_nm:
                # high mismatch → FAKE evidence
                if prediction_label == 1:
                    explanation_parts.append(
                        f"- **Noiseprint**: camera-model fingerprint is atypical for natural cameras "
                        f"(mismatch={nm:.2f} > {thr_nm:.2f}), supporting the FAKE hypothesis."
                    )
                else:
                    explanation_parts.append(
                        f"- **Noiseprint**: camera-model fingerprint is atypical for natural cameras "
                        f"(mismatch={nm:.2f} > {thr_nm:.2f}), which would usually suggest a FAKE; "
                        f"however, other cues push the detector towards REAL."
                    )
            else:
                # low mismatch → REAL evidence
                if prediction_label == 0:
                    explanation_parts.append(
                        f"- **Noiseprint**: fingerprint lies within the range seen in training real images "
                        f"(mismatch={nm:.2f} <= {thr_nm:.2f}), supporting the REAL hypothesis."
                    )
                else:
                    explanation_parts.append(
                        f"- **Noiseprint**: fingerprint lies within the range seen in training real images "
                        f"(mismatch={nm:.2f} <= {thr_nm:.2f}), but other forensic cues indicate synthesis."
                    )
            cues_used += 1

        # Residual energy explanation
        if res_p95 is not None and thr_res is not None:
            if res_p95 > thr_res:
                # high residual energy → FAKE evidence
                if prediction_label == 1:
                    explanation_parts.append(
                        f"- **Denoiser residual**: high 95th-percentile residual energy "
                        f"(p95={res_p95:.4f} > {thr_res:.4f}), supporting the FAKE hypothesis as "
                        f"strong high-frequency artifacts are typical for generated images."
                    )
                else:
                    explanation_parts.append(
                        f"- **Denoiser residual**: high 95th-percentile residual energy "
                        f"(p95={res_p95:.4f} > {thr_res:.4f}), which would usually suggest synthesis; "
                        f"here it conflicts with the REAL prediction."
                    )
            else:
                # low residual energy → REAL evidence
                if prediction_label == 0:
                    explanation_parts.append(
                        f"- **Denoiser residual**: residual energy (p95={res_p95:.4f}) is within the range "
                        f"observed for training real photos, consistent with a REAL image."
                    )
                else:
                    explanation_parts.append(
                        f"- **Denoiser residual**: residual energy (p95={res_p95:.4f}) is not strongly abnormal; "
                        f"the FAKE decision is driven more by other forensic cues."
                    )
            cues_used += 1

        # FFT peakiness explanation
        if fp is not None and thr_fp is not None:
            if fp > thr_fp:
                if prediction_label == 1:
                    explanation_parts.append(
                        f"- **Frequency spectrum**: the Fourier magnitude has unusually sharp peaks "
                        f"(peakiness={fp:.2f} > {thr_fp:.2f}), often linked to upsampling patterns "
                        f"of generative models."
                    )
                else:
                    explanation_parts.append(
                        f"- **Frequency spectrum**: unusually sharp peaks in the Fourier magnitude "
                        f"(peakiness={fp:.2f} > {thr_fp:.2f}), which is more typical for generated images "
                        f"and conflicts with the REAL prediction."
                    )
                cues_used += 1
            elif prediction_label == 1 and fp > thr_fp * 0.8:
                # Below threshold but within 80% of it: still worth mentioning
                # when the prediction is FAKE.
                explanation_parts.append(
                    f"- **Frequency spectrum**: peakiness ({fp:.2f}) is moderately elevated "
                    f"(threshold: {thr_fp:.2f}), contributing to the FAKE classification."
                )
                cues_used += 1

        # -------------------- Data-driven drivers --------------------
        if contributions:
            sorted_contribs = sorted(
                contributions.items(), key=lambda item: abs(item[1]), reverse=True
            )
            # Always consider at least the 8 largest-magnitude contributions.
            top = sorted_contribs[:max(top_k_contributions, 8)]
            pos = [(name, val) for name, val in top if val > 0]
            neg = [(name, val) for name, val in top if val < 0]

            if pos:
                explanation_parts.append("\n**Features driving FAKE classification:**")
                pos_display = [f"{name} ({val:+.3f})" for name, val in pos[:8]]
                explanation_parts.append(f"- {', '.join(pos_display)}")

            if neg:
                explanation_parts.append("\n**Features supporting REAL classification:**")
                neg_display = [f"{name} ({val:+.3f})" for name, val in neg[:5]]
                explanation_parts.append(f"- {', '.join(neg_display)}")
        elif cues_used == 0 or (prediction_label == 1 and cues_used < 2):
            # Few or no primary cues were reported: explain that the decision
            # rests on the wider feature set. The conclusion must match the
            # predicted label; this branch is also reached for REAL
            # predictions when no cue was reported.
            if prediction_label == 1:
                verdict = "indicate synthetic generation"
            else:
                verdict = "point to an authentic image"

            explanation_parts.append(
                f"\n**Note**: While the primary forensic cues (Noiseprint, Residuals, FFT) don't individually "
                f"strongly indicate synthesis, the model's decision is based on a combination of many features "
                f"including DCT coefficients, FFT radial profiles, residual statistics, and other frequency-domain "
                f"characteristics. The {confidence_str} confidence ({confidence:.1%}) suggests these subtle "
                f"patterns collectively {verdict}."
            )

            # List some of the other features that might be contributing
            other_features = []
            if 'dct_mean' in features:
                other_features.append("DCT coefficients")
            if 'fft_radial_mean' in features:
                other_features.append("FFT radial profiles")
            if 'residual_skew' in features:
                other_features.append("residual statistics")
            if 'residual_kurtosis' in features:
                other_features.append("residual distribution shape")

            if other_features:
                explanation_parts.append(
                    f"The model analyzes {', '.join(other_features)} and other frequency-domain patterns "
                    f"that collectively {verdict}, even when individual cues are subtle."
                )

        # Final triage note. UNCERTAIN arises either from conflicting cues or
        # from the "suspiciously clean" path; the two are mutually exclusive,
        # so the reason given must match the path taken.
        if triage_label == "UNCERTAIN" and not is_ood:
            if suspiciously_clean:
                explanation_parts.append(
                    "Because every evaluated forensic cue is below threshold while the confidence "
                    "falls short of near-certainty, "
                    "this image should be flagged for manual review or stress-testing (e.g., recompression)."
                )
            else:
                explanation_parts.append(
                    "Because the forensic cues point in different directions at only moderate confidence, "
                    "this image should be flagged for manual review or stress-testing (e.g., recompression)."
                )

        return "\n".join(explanation_parts)