# Deploy DeepFake Detector API - 2026-03-07 09:12:00 (commit df4a21a, lukhsaankumar)
"""
LLM Service for generating human-readable explanations of model predictions.
Uses Google Gemini to translate model-space evidence (heatmaps, attention maps)
into human-understandable hypotheses with proper hedging language.
"""
import json
import base64
from typing import Any, Dict, List, Optional
from functools import lru_cache
from app.core.config import get_settings
from app.core.logging import get_logger
logger = get_logger(__name__)
# Model type descriptions for the LLM.
# Maps each internal model name to:
#   - "type": a machine-readable category tag echoed into evidence packets,
#   - "description": prose for the prompt describing what the model analyzes,
#   - "typical_cues": visual cues the model typically responds to.
# Consumed by LLMService.build_evidence_packet() and the single-model prompt
# builder; call sites substitute an "unknown" fallback for unlisted names.
MODEL_TYPE_DESCRIPTIONS = {
    "cnn-transfer": {
        "type": "rgb_texture_cnn",
        "description": "Analyzes RGB pixel textures, colors, and fine details at multiple scales",
        "typical_cues": ["skin texture uniformity", "shading gradients", "fine detail at boundaries"]
    },
    "vit-base": {
        "type": "patch_consistency_vit",
        "description": "Analyzes global consistency and relationships between image patches",
        "typical_cues": ["lighting consistency", "background blur patterns", "patch-level coherence"]
    },
    "deit-distilled": {
        "type": "patch_consistency_vit",
        "description": "Analyzes global consistency with knowledge distillation for refined attention",
        "typical_cues": ["global-local consistency", "texture repetition", "depth coherence"]
    },
    "gradfield-cnn": {
        "type": "edge_coherence_cnn",
        "description": "Analyzes edge patterns, boundary sharpness, and gradient field coherence",
        "typical_cues": ["edge smoothness", "boundary naturalness", "gradient consistency"]
    }
}
# User-facing display information for each model (used in frontend)
MODEL_DISPLAY_INFO = {
    "cnn-transfer": {
        "display_name": "Texture Analysis",
        "short_name": "CNN",
        "method_name": "Grad-CAM",
        "method_description": "Gradient-weighted Class Activation Mapping",
        "educational_text": (
            "This model examines fine-grained texture patterns and pixel-level details. "
            "The heatmap highlights regions where texture anomalies were detected. "
            "AI-generated images often have subtle texture inconsistencies - overly smooth skin, "
            "unnatural fabric patterns, or repetitive background textures that this model can detect."
        ),
        "what_it_looks_for": [
            "Skin texture uniformity vs natural variation",
            "Fine detail preservation at edges and boundaries",
            "Color gradient smoothness and shading realism"
        ]
    },
    "vit-base": {
        "display_name": "Patch Consistency",
        "short_name": "ViT",
        "method_name": "Attention Rollout",
        "method_description": "Aggregated attention across all transformer layers",
        "educational_text": (
            "This model analyzes how different parts of the image relate to each other. "
            "The heatmap shows which image patches drew the most attention. "
            "AI-generated images may have inconsistencies between regions - "
            "mismatched lighting, perspective errors, or elements that don't quite fit together."
        ),
        "what_it_looks_for": [
            "Consistency of lighting across the image",
            "Spatial relationships between objects",
            "Background-foreground coherence"
        ]
    },
    "deit-distilled": {
        "display_name": "Global Structure",
        "short_name": "DeiT",
        "method_name": "Attention Rollout",
        "method_description": "Distilled attention patterns from teacher model",
        "educational_text": (
            "This model uses knowledge distillation to detect global structural anomalies. "
            "The heatmap reveals areas where the overall image structure seems inconsistent. "
            "AI-generated images sometimes have subtle global issues - "
            "like depth inconsistencies or anatomical improbabilities."
        ),
        "what_it_looks_for": [
            "Global-to-local consistency",
            "Depth and perspective coherence",
            "Structural plausibility of objects"
        ]
    },
    "gradfield-cnn": {
        "display_name": "Edge Coherence",
        "short_name": "GradField",
        "method_name": "Gradient Field Analysis",
        "method_description": "Analysis of image gradient patterns and edge transitions",
        "educational_text": (
            "This model analyzes edge patterns and how colors transition across boundaries. "
            "The heatmap highlights areas with unusual edge characteristics. "
            "AI-generated images often have telltale edge artifacts - "
            "unnaturally sharp or blurry boundaries, inconsistent edge directions, or gradient anomalies."
        ),
        "what_it_looks_for": [
            "Edge sharpness consistency",
            "Natural boundary transitions",
            "Gradient flow coherence"
        ]
    }
}


def get_model_display_info(model_name: str) -> Dict[str, Any]:
    """Look up frontend display metadata for *model_name*.

    Known models come straight from MODEL_DISPLAY_INFO; unknown names get a
    generic fallback derived from the name itself, so callers never have to
    handle a missing entry.
    """
    fallback = {
        "display_name": model_name.replace("-", " ").title(),
        "short_name": model_name[:3].upper(),
        "method_name": "Analysis",
        "method_description": "Model-specific analysis",
        "educational_text": f"This model ({model_name}) analyzes the image for signs of AI generation.",
        "what_it_looks_for": ["Image anomalies", "Generation artifacts"]
    }
    return MODEL_DISPLAY_INFO.get(model_name, fallback)
# System prompt for the multi-model explanation request.
# Encodes the voice contract (mandatory hedging language, per-model evidence
# citation) and pins the exact JSON schema the model must return; the keys
# named here ("per_model_insights", "consensus_summary") are the same ones
# LLMService._parse_response() validates and back-fills.
SYSTEM_PROMPT = """You are an AI image analysis interpreter for a deepfake detection system. Your role is to translate model evidence into human-understandable hypotheses.
CRITICAL RULES:
1. NEVER claim certainty. Always use hedging language: "may", "suggests", "possible", "could indicate", "might show"
2. ALWAYS cite which model's evidence supports each statement (e.g., "based on CNN heatmap focus")
3. If evidence is diffuse or unclear, say so explicitly: "Evidence is spread across the image; interpretation is less certain"
4. Provide user-checkable observations, not definitive claims about what IS fake
5. Remember: you are explaining what the MODEL focused on, not proving the image is fake
MODEL TYPES AND WHAT THEY ANALYZE:
- CNN (rgb_texture_cnn): Pixel textures, colors, fine details - looks for texture anomalies
- ViT/DeiT (patch_consistency_vit): Global consistency, patch relationships - looks for coherence issues
- GradField (edge_coherence_cnn): Edge patterns, boundaries, gradient fields - looks for edge artifacts
OUTPUT FORMAT:
You must respond with valid JSON matching this exact structure:
{
"per_model_insights": {
"<model_name>": {
"what_model_relied_on": "One sentence describing the model's focus area",
"possible_cues": ["Cue 1 with hedging (based on evidence)", "Cue 2...", "Cue 3..."],
"confidence_note": "Note about confidence level"
}
},
"consensus_summary": [
"Bullet 1 about model agreement/disagreement",
"Bullet 2 about overall evidence pattern"
]
}"""
class LLMService:
    """Service for generating LLM-powered explanations of model predictions.

    Thin wrapper around the Google Gemini API (``google-genai`` package).
    The service is fail-soft: if the package or the API key is missing it
    stays disabled and the generation methods return ``None`` instead of
    raising, so callers can always fall back to non-LLM output.
    """

    def __init__(self):
        # Defaults mean "disabled"; _initialize() upgrades them when possible.
        self._client = None        # genai.Client once initialized
        self._model_name = None    # Gemini model id from settings
        self._enabled = False
        self._initialize()

    def _initialize(self):
        """Initialize the Gemini client if an API key is available."""
        settings = get_settings()
        if not settings.llm_enabled:
            logger.info("LLM explanations disabled: No GOOGLE_API_KEY configured")
            return
        try:
            # Imported lazily so the app still runs without the optional package.
            from google import genai
            self._client = genai.Client(api_key=settings.GOOGLE_API_KEY)
            self._model_name = settings.GEMINI_MODEL
            self._enabled = True
            logger.info(f"LLM service initialized with model: {settings.GEMINI_MODEL}")
        except ImportError:
            logger.warning("google-genai package not installed. LLM explanations disabled.")
        except Exception as e:
            logger.error(f"Failed to initialize LLM service: {e}")

    @property
    def enabled(self) -> bool:
        """Check if LLM explanations are available."""
        return self._enabled

    def build_evidence_packet(
        self,
        model_name: str,
        model_output: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Build a structured evidence packet from model output.

        Args:
            model_name: Name of the model (e.g., "cnn-transfer")
            model_output: Raw output from the model's predict() method

        Returns:
            Structured evidence packet for LLM consumption
        """
        # Unknown models get a neutral descriptor rather than a KeyError.
        model_info = MODEL_TYPE_DESCRIPTIONS.get(model_name, {
            "type": "unknown",
            "description": "Unknown model type",
            "typical_cues": []
        })
        return {
            "model_name": model_name,
            "model_type": model_info["type"],
            "model_description": model_info["description"],
            "prob_fake": model_output.get("prob_fake", 0.0),
            "prediction": model_output.get("pred", "unknown"),
            "focus_summary": model_output.get("focus_summary", "focus pattern not available"),
            "explainability_type": model_output.get("explainability_type", "unknown"),
            "typical_cues_for_this_model": model_info["typical_cues"]
        }

    def generate_explanation(
        self,
        original_image_b64: Optional[str],
        submodel_outputs: Dict[str, Dict[str, Any]],
        include_images: bool = True
    ) -> Optional[Dict[str, Any]]:
        """
        Generate LLM explanation for model predictions.

        Args:
            original_image_b64: Base64-encoded original image (optional)
            submodel_outputs: Dict mapping model names to their outputs
            include_images: Whether to include images in the prompt (uses vision model)

        Returns:
            ExplanationResult dict or None if generation fails
        """
        if not self._enabled:
            logger.warning("LLM explanations requested but service not enabled")
            return None
        try:
            from google.genai import types

            # Build evidence packets for all models
            evidence_packets = {
                name: self.build_evidence_packet(name, output)
                for name, output in submodel_outputs.items()
            }
            user_prompt = self._build_user_prompt(evidence_packets, submodel_outputs)

            # Assemble the multimodal content: optional images, each followed
            # by a caption, then the main text prompt last.
            parts = []
            if include_images:
                if original_image_b64:
                    parts.append(types.Part.from_bytes(
                        data=base64.b64decode(original_image_b64),
                        mime_type="image/png"
                    ))
                    parts.append(types.Part.from_text(text="Original image shown above.\n\n"))
                # Add heatmap overlays for each model that produced one
                for model_name, output in submodel_outputs.items():
                    if output.get("heatmap_base64"):
                        parts.append(types.Part.from_bytes(
                            data=base64.b64decode(output["heatmap_base64"]),
                            mime_type="image/png"
                        ))
                        parts.append(types.Part.from_text(
                            text=f"Heatmap overlay for {model_name} shown above.\n\n"
                        ))
            parts.append(types.Part.from_text(text=user_prompt))

            logger.info("Generating LLM explanation...")
            response = self._client.models.generate_content(
                model=self._model_name,
                contents=[SYSTEM_PROMPT] + parts,
                config=types.GenerateContentConfig(
                    temperature=0.3,  # low temperature keeps explanations stable
                    top_p=0.8,
                    max_output_tokens=2048,
                )
            )
            return self._parse_response(response.text, list(submodel_outputs.keys()))
        except Exception as e:
            logger.error(f"Failed to generate LLM explanation: {e}")
            return None

    def _build_user_prompt(
        self,
        evidence_packets: Dict[str, Dict],
        submodel_outputs: Dict[str, Dict]
    ) -> str:
        """Build the user prompt with evidence data.

        Note: ``submodel_outputs`` is accepted for interface compatibility;
        the prompt is built entirely from ``evidence_packets``.
        """
        prob_fakes = [p["prob_fake"] for p in evidence_packets.values()]
        # Guard the empty case so min()/max() below cannot raise.
        if prob_fakes:
            avg_prob = sum(prob_fakes) / len(prob_fakes)
            low, high = min(prob_fakes), max(prob_fakes)
        else:
            avg_prob = low = high = 0.0
        # 0.3 is the probability-spread threshold for calling it "agreement".
        agreement = "Models generally agree" if high - low < 0.3 else "Models show disagreement"
        prompt = f"""I have {len(evidence_packets)} deepfake detection models analyzing an image.
EVIDENCE FROM EACH MODEL:
{json.dumps(evidence_packets, indent=2)}
AGGREGATE ANALYSIS:
- Average fake probability: {avg_prob:.1%}
- Model agreement: {agreement}
- Probability range: {low:.1%} to {high:.1%}
TASK:
For each model, provide:
1. "what_model_relied_on": One sentence describing where the model focused (cite the focus_summary)
2. "possible_cues": 2-4 possible visual cues a human could check, phrased as hypotheses with hedging language
3. "confidence_note": Assessment based on prob_fake value and focus pattern
Then provide "consensus_summary": 2-3 bullets about where models agreed/disagreed and overall evidence quality.
Remember: Use hedging language ("may", "suggests", "possible"). Never claim certainty.
Respond with valid JSON only, no markdown formatting."""
        return prompt

    def _parse_response(
        self,
        response_text: str,
        expected_models: List[str]
    ) -> Optional[Dict[str, Any]]:
        """Parse and validate the LLM response.

        Tolerates markdown code fences around the JSON, back-fills missing
        top-level keys, and guarantees an entry for every expected model.
        On unparseable output, returns a complete fallback structure rather
        than None so the caller always gets a renderable result.
        """
        try:
            # Sometimes the model wraps the JSON in a markdown code block.
            text = response_text.strip()
            if text.startswith("```"):
                lines = text.split("\n")
                # Drop the opening fence, and the closing fence if present.
                text = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
                text = text.strip()
            result = json.loads(text)
            # Validate structure; back-fill anything missing.
            if "per_model_insights" not in result:
                logger.warning("LLM response missing per_model_insights")
                result["per_model_insights"] = {}
            if "consensus_summary" not in result:
                logger.warning("LLM response missing consensus_summary")
                result["consensus_summary"] = ["Model analysis completed."]
            # Ensure all expected models have entries (fill with defaults if missing)
            for model_name in expected_models:
                if model_name not in result["per_model_insights"]:
                    result["per_model_insights"][model_name] = {
                        "what_model_relied_on": f"The {model_name} model analyzed the image.",
                        "possible_cues": ["Evidence details not available for this model."],
                        "confidence_note": "Unable to generate detailed analysis."
                    }
            return result
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.debug(f"Raw response: {response_text[:500]}...")
            # Return a fallback response covering every expected model.
            return {
                "per_model_insights": {
                    model: {
                        "what_model_relied_on": f"The {model} model analyzed the image.",
                        "possible_cues": ["Unable to generate detailed explanation."],
                        "confidence_note": "LLM response parsing failed."
                    }
                    for model in expected_models
                },
                "consensus_summary": ["Model analysis completed but detailed explanation unavailable."]
            }

    @staticmethod
    def _escape_newlines_in_strings(s: str) -> str:
        """Escape raw newline/CR characters occurring inside JSON string
        literals so the text can be re-parsed with json.loads().

        This repairs a common LLM failure mode: otherwise-valid JSON whose
        string values contain literal line breaks.
        """
        out = []
        in_string = False
        escape_next = False
        for c in s:
            if escape_next:
                # Previous char was a backslash: copy this char verbatim.
                out.append(c)
                escape_next = False
                continue
            if c == '\\':
                escape_next = True
                out.append(c)
                continue
            if c == '"':
                # Unescaped quote toggles whether we are inside a string.
                in_string = not in_string
                out.append(c)
                continue
            if in_string and c == '\n':
                out.append('\\n')
            elif in_string and c == '\r':
                out.append('\\r')
            else:
                out.append(c)
        return ''.join(out)

    def generate_single_model_explanation(
        self,
        model_name: str,
        prob_fake: float,
        original_image_b64: Optional[str] = None,
        heatmap_b64: Optional[str] = None,
        focus_summary: Optional[str] = None,
        contribution_percentage: Optional[float] = None
    ) -> Optional[Dict[str, Any]]:
        """
        Generate LLM explanation for a single model's prediction.

        This is more token-efficient than generating all explanations at once,
        and allows users to request explanations on-demand per model.

        Args:
            model_name: Name of the model (e.g., "cnn-transfer")
            prob_fake: The model's fake probability
            original_image_b64: Base64-encoded original image
            heatmap_b64: Base64-encoded heatmap overlay
            focus_summary: Text summary of where model focused
            contribution_percentage: How much this model contributed to fusion decision

        Returns:
            Dict with insight for this model or None if generation fails
        """
        if not self._enabled:
            logger.warning("LLM explanations requested but service not enabled")
            return None
        try:
            from google.genai import types

            display_info = get_model_display_info(model_name)
            model_type_info = MODEL_TYPE_DESCRIPTIONS.get(model_name, {
                "type": "unknown",
                "description": "Unknown model type",
                "typical_cues": []
            })

            # `is not None` (not truthiness) so a legitimate 0.0% contribution
            # still appears in the prompt.
            contribution_line = (
                f"- Contribution to Final Decision: {contribution_percentage:.1f}%"
                if contribution_percentage is not None else ""
            )

            # Build focused prompt for single model
            prompt = f"""You are analyzing a single model's output from a deepfake detection system.
MODEL INFORMATION:
- Display Name: {display_info['display_name']}
- Analysis Method: {display_info['method_name']} ({display_info['method_description']})
- What It Analyzes: {model_type_info['description']}
- Typical Cues It Detects: {', '.join(model_type_info['typical_cues'])}
DETECTION RESULTS:
- Fake Probability: {prob_fake:.1%}
- Prediction: {"Likely AI-Generated" if prob_fake >= 0.5 else "Likely Real"}
- Focus Summary: {focus_summary or "Not available"}
{contribution_line}
The heatmap shows where this model focused its attention. Brighter/warmer colors indicate higher attention.
TASK:
Analyze the image and heatmap to explain what this specific model detected. Provide:
1. A clear explanation of what the model focused on and why it might indicate AI generation (or authenticity)
2. 2-4 specific visual cues a human could verify, phrased as hypotheses with hedging language
3. A confidence assessment based on the probability and focus pattern
CRITICAL: Use hedging language - "may", "suggests", "possible", "could indicate". Never claim certainty.
Respond with valid JSON matching this exact structure:
{{
"key_finding": "One sentence main finding about what the model detected",
"what_model_saw": "2-3 sentences explaining what the model detected and why it matters",
"important_regions": ["Region 1 with hedging language", "Region 2...", "Region 3..."],
"confidence_qualifier": "Assessment of reliability with appropriate hedging"
}}
Respond with valid JSON only, no markdown formatting."""

            # Build content parts: optional images with captions, prompt last.
            content_parts = []
            if original_image_b64:
                content_parts.append(types.Part.from_bytes(
                    data=base64.b64decode(original_image_b64),
                    mime_type="image/png"
                ))
                content_parts.append(types.Part.from_text(text="Original image shown above.\n\n"))
            if heatmap_b64:
                content_parts.append(types.Part.from_bytes(
                    data=base64.b64decode(heatmap_b64),
                    mime_type="image/png"
                ))
                content_parts.append(types.Part.from_text(text=f"{display_info['method_name']} heatmap shown above.\n\n"))
            content_parts.append(types.Part.from_text(text=prompt))

            # Call the LLM with JSON response mode
            logger.info(f"Generating LLM explanation for {model_name}...")
            response = self._client.models.generate_content(
                model=self._model_name,
                contents=content_parts,
                config=types.GenerateContentConfig(
                    temperature=0.3,
                    top_p=0.8,
                    max_output_tokens=2048,  # Increased to avoid truncation
                    response_mime_type="application/json",
                )
            )

            # Parse response - even with JSON mode, sometimes there are issues
            text = response.text.strip()
            try:
                result = json.loads(text)
            except json.JSONDecodeError as parse_err:
                # Log the problematic text, then retry after repairing raw
                # newlines embedded in string values.
                logger.warning(f"Initial JSON parse failed: {parse_err}")
                logger.warning(f"Raw text (first 500 chars): {repr(text[:500])}")
                result = json.loads(self._escape_newlines_in_strings(text))

            # Add model metadata to result
            result["model_name"] = model_name
            return result
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse single model LLM response: {e}")
            return {
                "model_name": model_name,
                "key_finding": f"The {display_info['display_name']} detected potential signs of manipulation.",
                "what_model_saw": f"The model analyzed the image but detailed analysis could not be parsed. The fake probability was {prob_fake:.1%}.",
                "important_regions": ["Unable to identify specific regions."],
                "confidence_qualifier": "Analysis completed but detailed explanation unavailable due to parsing error."
            }
        except Exception as e:
            logger.error(f"Failed to generate single model explanation: {e}")
            return None
# Module-level singleton instance, created lazily by get_llm_service().
_llm_service: Optional[LLMService] = None


def get_llm_service() -> LLMService:
    """Return the shared LLMService, constructing it on first use."""
    global _llm_service
    service = _llm_service
    if service is None:
        service = LLMService()
        _llm_service = service
    return service