# explainer.py
# Generates forensic explanations using HuggingFace chat router
# Falls back to a structured template if API fails

import os
import json

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# Module-level singleton; populated lazily by _get_client() so importing
# this module never requires HF_TOKEN to be set.
_client = None


def _get_client() -> OpenAI:
    """Lazy-init the HF chat client.

    Returns:
        A cached OpenAI client pointed at the HuggingFace router.

    Raises:
        RuntimeError: if HF_TOKEN is missing from the environment/.env.
    """
    global _client
    if _client is None:
        token = os.getenv("HF_TOKEN")
        if not token:
            raise RuntimeError("HF_TOKEN not set in .env file")
        _client = OpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=token,
        )
    return _client


def explain_detection(detection: dict, input_type: str) -> dict:
    """
    Generate a three-audience forensic explanation for a detection result.

    Args:
        detection: Detector output; keys read include "verdict",
            "confidence", "severity" and (for video) "fake_probability",
            "frames_analyzed", "most_suspicious_timestamp".
        input_type: e.g. "image" or "video" — controls extra prompt context.

    Returns:
        A dict with technical_signals, plain_english, manipulation_areas,
        recommended_action and mitre_technique.

    Falls back gracefully to a static template if the LLM call fails
    for any reason (network, auth, unparseable reply).
    """
    try:
        return _call_llm(detection, input_type)
    except Exception as e:
        # Broad on purpose: any failure path must still yield a usable
        # explanation rather than crash the pipeline.
        print(f" ⚠️ Explainer LLM failed ({e}), using fallback template.")
        return _fallback(detection)


def _call_llm(detection: dict, input_type: str) -> dict:
    """Build the forensics prompt, query the model and parse its JSON reply.

    Raises on any API or parse failure; explain_detection() handles fallback.
    """
    verdict = detection.get("verdict", "UNKNOWN")
    confidence = detection.get("confidence", 0)
    severity = detection.get("severity", "LOW")

    # Video inputs carry per-frame statistics worth surfacing to the model.
    extra = ""
    if input_type == "video":
        extra = f"""
- Fake frame ratio: {detection.get('fake_probability', 'N/A')}%
- Frames analyzed: {detection.get('frames_analyzed', 'N/A')}
- Most suspicious timestamp: {detection.get('most_suspicious_timestamp', 'N/A')}s"""

    prompt = f"""You are a deepfake forensics expert for SENTINEL, an AI-powered cybersecurity platform.

Detection result:
- Input type: {input_type}
- Verdict: {verdict}
- Confidence: {confidence}%
- Severity: {severity}{extra}

Return ONLY a valid JSON object — no markdown, no explanation, no extra text.
{{
  "technical_signals": [
    "specific forensic artifact 1 (e.g. GAN grid pattern at 512px boundary)",
    "specific forensic artifact 2 (e.g. facial blending seam visible at jaw line)",
    "specific forensic artifact 3 (e.g. unnatural specular reflection in left eye)"
  ],
  "plain_english": "2 clear sentences explaining this to a non-technical person.",
  "manipulation_areas": ["facial region 1", "facial region 2"],
  "recommended_action": "One specific action the user should take right now.",
  "mitre_technique": "T1565.001 - Stored Data Manipulation"
}}

Rules:
- If DEEPFAKE: name real GAN artifacts — boundary blending, texture inconsistency, eye reflection anomalies, lighting direction mismatch, hair edge artifacts, temporal flickering.
- If AUTHENTIC: name the positive signals — consistent EXIF metadata, natural skin texture variance, coherent lighting, authentic noise patterns.
- Be specific. Never use generic phrases like "image looks suspicious"."""

    client = _get_client()
    completion = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.3",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=600,
        temperature=0.3,  # Lower temp = more consistent JSON output
    )

    raw = completion.choices[0].message.content.strip()
    print(f" 🤖 Explainer raw output: {raw[:100]}...")
    return _parse_json_reply(raw)


def _parse_json_reply(raw: str) -> dict:
    """Best-effort extraction of a JSON object from an LLM reply.

    Handles two common failure modes of instruct models:
    1. The JSON wrapped in markdown code fences (``` or ```json).
    2. The JSON surrounded by prose WITHOUT fences — previously this case
       raised and needlessly triggered the static fallback template; we now
       retry on the outermost {...} span before giving up.

    Raises:
        json.JSONDecodeError: if no parseable JSON object can be found.
    """
    # Strip markdown code fences if present
    if "```" in raw:
        for part in raw.split("```"):
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                raw = part
                break
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # No fences, but the object may still be embedded in prose:
        # try the outermost brace-delimited span.
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            return json.loads(raw[start:end + 1])
        raise


def _fallback(detection: dict) -> dict:
    """Structured fallback when LLM is unavailable.

    Mirrors the schema produced by _call_llm() so callers never need to
    distinguish LLM output from template output.
    """
    verdict = detection.get("verdict", "UNKNOWN")
    confidence = detection.get("confidence", 0)

    if verdict == "DEEPFAKE":
        signals = [
            f"Model confidence {confidence}% indicates high likelihood of synthetic generation",
            "GAN-based artifacts detected in facial texture regions",
            "Boundary blending inconsistencies identified near facial edges",
        ]
        plain = (
            f"This content appears to be AI-generated or manipulated with {confidence}% confidence. "
            "It shows technical patterns characteristic of deepfake generation tools."
        )
        action = "Do not share or use this content. Verify the original source independently."
    else:
        signals = [
            f"Authenticity confidence: {confidence}%",
            "Natural noise distribution consistent with real camera capture",
            "No GAN fingerprint patterns detected",
        ]
        plain = (
            f"This content appears authentic with {confidence}% confidence. "
            "No deepfake manipulation signatures were detected."
        )
        action = "Content appears authentic. Standard verification still recommended for sensitive use cases."

    return {
        "technical_signals": signals,
        "plain_english": plain,
        "manipulation_areas": [],
        "recommended_action": action,
        "mitre_technique": "T1565.001 - Stored Data Manipulation",
    }