Crison11 committed on
Commit
5de4db9
·
1 Parent(s): dba94fc

feat: enhance detector agent with fast screening

Browse files
Files changed (4) hide show
  1. README.md +2 -5
  2. app.py +4 -4
  3. detector_agent.py +510 -12
  4. requirements.txt +2 -0
README.md CHANGED
@@ -20,9 +20,10 @@ This Space now provides a LangGraph-based AI-image detection agent:
20
  3. The tool layer combines:
21
  - common-sense consistency probe (agent-callable tool)
22
  - image metadata inspection (via `exiftool`)
 
23
  - low-level forensic heuristics
24
  - external vision specialist probe with auto multi-region discovery and zoomed crop analysis
25
- 4. The agent synthesizes the evidence and returns:
26
  - `classification`: `Real` / `Fake`
27
  - `confidence`: `0-100`
28
 
@@ -34,10 +35,6 @@ Set the following in your Space settings:
34
  - `OPENAI_MODEL` (optional, default: `gpt-5-mini`)
35
  - `OPENAI_BASE_URL` (optional, for OpenAI-compatible third-party services)
36
  - `APP_API_TOKEN` (optional, used to protect API endpoint calls)
37
- - `DETECTOR_CACHE_ENABLED` (optional, default: `1`)
38
- - `DETECTOR_CACHE_TTL_SECONDS` (optional, default: `21600`)
39
- - `DETECTOR_CACHE_DIR` (optional, default: `/tmp/aifo_detector_cache`)
40
- - `DETECTOR_GRAPH_RECURSION_LIMIT` (optional, default: `24`)
41
 
42
  ## UI Features
43
 
 
20
  3. The tool layer combines:
21
  - common-sense consistency probe (agent-callable tool)
22
  - image metadata inspection (via `exiftool`)
23
+ - two pretrained Hugging Face AI-image detectors
24
  - low-level forensic heuristics
25
  - external vision specialist probe with auto multi-region discovery and zoomed crop analysis
26
+ 4. The agent synthesizes the evidence and returns:
27
  - `classification`: `Real` / `Fake`
28
  - `confidence`: `0-100`
29
 
 
35
  - `OPENAI_MODEL` (optional, default: `gpt-5-mini`)
36
  - `OPENAI_BASE_URL` (optional, for OpenAI-compatible third-party services)
37
  - `APP_API_TOKEN` (optional, used to protect API endpoint calls)
 
 
 
 
38
 
39
  ## UI Features
40
 
app.py CHANGED
@@ -62,9 +62,6 @@ Before running, configure these Hugging Face Space Secrets:
62
  - `OPENAI_MODEL` (optional, default: `gpt-5-mini`)
63
  - `OPENAI_BASE_URL` (optional, for compatible third-party endpoints)
64
  - `APP_API_TOKEN` (optional, protect API endpoint calls)
65
- - `DETECTOR_CACHE_ENABLED` (optional, default: `1`)
66
- - `DETECTOR_CACHE_TTL_SECONDS` (optional, default: `21600`)
67
- - `DETECTOR_CACHE_DIR` (optional, default: `/tmp/aifo_detector_cache`)
68
  """
69
  )
70
 
@@ -80,7 +77,10 @@ Before running, configure these Hugging Face Space Secrets:
80
  run_btn.click(
81
  fn=analyze_image_ui,
82
  inputs=[image_input],
83
- outputs=[classification_out, confidence_out],
 
 
 
84
  api_name="analyze_ui",
85
  )
86
 
 
62
  - `OPENAI_MODEL` (optional, default: `gpt-5-mini`)
63
  - `OPENAI_BASE_URL` (optional, for compatible third-party endpoints)
64
  - `APP_API_TOKEN` (optional, protect API endpoint calls)
 
 
 
65
  """
66
  )
67
 
 
77
  run_btn.click(
78
  fn=analyze_image_ui,
79
  inputs=[image_input],
80
+ outputs=[
81
+ classification_out,
82
+ confidence_out,
83
+ ],
84
  api_name="analyze_ui",
85
  )
86
 
detector_agent.py CHANGED
@@ -7,6 +7,7 @@ import shutil
7
  import subprocess
8
  import tempfile
9
  import time
 
10
  from pathlib import Path
11
  from typing import Any, Literal
12
 
@@ -20,6 +21,17 @@ from openai import OpenAI
20
  from PIL import ExifTags, Image, ImageChops, ImageStat
21
  from pydantic import BaseModel, Field, field_validator
22
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  AGENT_SYSTEM_PROMPT = """
25
  You are an image-forensics coordinator.
@@ -28,7 +40,7 @@ You do not see the image directly. Your job is to decide which tools to call.
28
  Rules:
29
  - Prefer gathering evidence before reaching a conclusion.
30
  - Prefer calling the common-sense consistency tool early when image logic/plausibility matters.
31
- - Usually call inspect_image_metadata first, then at least one deterministic local forensic tool and one vision probe.
32
  - For vision probe calls, prefer multi-region inspection to cover diverse clues (faces, text, boundaries, limbs, animals, reflections).
33
  - Stop calling tools once you have enough evidence. Do not provide the final verdict yourself.
34
  - Keep the process efficient. Usually 2-5 tool calls are enough.
@@ -39,13 +51,47 @@ SYNTHESIS_SYSTEM_PROMPT = """
39
  You are a senior image-forensics judge.
40
  Review the collected tool evidence and return a final verdict.
41
 
42
- Output requirements:
43
  - classification must be Real or Fake
44
  - confidence must be an integer between 0 and 100
45
- - output only these two fields
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- Fake means likely AI-generated or not a real camera photo.
48
- Real means likely a real camera photo.
 
 
 
 
 
 
 
 
 
49
  """.strip()
50
 
51
 
@@ -185,9 +231,21 @@ AI_METADATA_KEYWORDS = [
185
  ]
186
 
187
 
188
- CACHE_SCHEMA_VERSION = "detector_cache_v1"
189
  CACHE_DEFAULT_TTL_SECONDS = 6 * 60 * 60
190
  GRAPH_RECURSION_LIMIT_DEFAULT = 24
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
 
193
  def _normalize_final_classification(value: Any) -> str:
@@ -209,9 +267,22 @@ def _normalize_final_confidence(value: Any) -> int:
209
  return max(0, min(100, numeric))
210
 
211
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  class DetectorVerdict(BaseModel):
213
  classification: Literal["Real", "Fake"]
214
  confidence: int = Field(ge=0, le=100)
 
215
 
216
  @field_validator("classification", mode="before")
217
  @classmethod
@@ -223,6 +294,14 @@ class DetectorVerdict(BaseModel):
223
  def normalize_confidence(cls, value: Any) -> int:
224
  return _normalize_final_confidence(value)
225
 
 
 
 
 
 
 
 
 
226
 
227
  def _build_openai_client() -> OpenAI:
228
  api_key = os.getenv("OPENAI_API_KEY")
@@ -344,6 +423,7 @@ def _build_detector_cache_key(image: Image.Image, image_path: str | None) -> str
344
  [
345
  AGENT_SYSTEM_PROMPT,
346
  SYNTHESIS_SYSTEM_PROMPT,
 
347
  VISION_TOOL_PROMPT,
348
  VISION_REGION_DISCOVERY_PROMPT,
349
  COMMON_SENSE_SYSTEM_PROMPT,
@@ -356,6 +436,9 @@ def _build_detector_cache_key(image: Image.Image, image_path: str | None) -> str
356
  "image_digest": image_digest,
357
  "model": model,
358
  "base_url": base_url,
 
 
 
359
  "prompt_fingerprint": prompt_fingerprint,
360
  }
361
  raw = json.dumps(payload, ensure_ascii=False, sort_keys=True)
@@ -386,9 +469,26 @@ def _load_cached_result(cache_key: str, ttl_seconds: int) -> dict[str, Any] | No
386
  except Exception:
387
  pass
388
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  return {
390
- "classification": _normalize_final_classification(result.get("classification")),
391
- "confidence": _normalize_final_confidence(result.get("confidence")),
 
392
  }
393
 
394
 
@@ -836,6 +936,333 @@ def _to_float(value: Any, default: float) -> float:
836
  return default
837
 
838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
  def _normalize_discovered_regions(payload: dict[str, Any], max_regions: int) -> list[dict[str, Any]]:
840
  raw_regions = payload.get("regions", [])
841
  if not isinstance(raw_regions, list):
@@ -1095,7 +1522,11 @@ def _messages_to_text(messages: list[Any]) -> str:
1095
  return "\n\n".join(lines)
1096
 
1097
 
1098
- def _build_tools(image: Image.Image, image_path: str | None = None) -> list[Any]:
 
 
 
 
1099
  @tool
1100
  def common_sense_consistency_probe() -> str:
1101
  """Analyze full-image common-sense consistency (anatomy, text, physics, geometry, lighting, reflections)."""
@@ -1106,6 +1537,13 @@ def _build_tools(image: Image.Image, image_path: str | None = None) -> list[Any]
1106
  """Use exiftool to extract selected metadata tags and summarize forensic clues."""
1107
  return _json_dumps(_metadata_report(image, image_path=image_path))
1108
 
 
 
 
 
 
 
 
1109
  @tool
1110
  def run_low_level_forensics() -> str:
1111
  """Run low-level image heuristics such as texture smoothness, saturation, clipping, and ELA artifacts."""
@@ -1126,13 +1564,22 @@ def _build_tools(image: Image.Image, image_path: str | None = None) -> list[Any]
1126
  return [
1127
  common_sense_consistency_probe,
1128
  inspect_image_metadata,
 
1129
  run_low_level_forensics,
1130
  vision_specialist_probe,
1131
  ]
1132
 
1133
 
1134
- def _build_graph(image: Image.Image, image_path: str | None = None):
1135
- tools = _build_tools(image, image_path=image_path)
 
 
 
 
 
 
 
 
1136
  llm = _build_langchain_model()
1137
  llm_with_tools = llm.bind_tools(tools)
1138
  tool_node = ToolNode(tools, handle_tool_errors=True)
@@ -1230,7 +1677,54 @@ def run_detector_agent(
1230
  if cached is not None:
1231
  return dict(cached)
1232
 
1233
- graph = _build_graph(image, image_path=image_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  initial_message = HumanMessage(
1235
  content=(
1236
  "Analyze the uploaded image with the available tools. "
@@ -1244,6 +1738,10 @@ def run_detector_agent(
1244
  result = {
1245
  "classification": _normalize_final_classification(verdict.classification),
1246
  "confidence": _normalize_final_confidence(verdict.confidence),
 
 
 
 
1247
  }
1248
  if cache_enabled and cache_key:
1249
  try:
 
7
  import subprocess
8
  import tempfile
9
  import time
10
+ from functools import lru_cache
11
  from pathlib import Path
12
  from typing import Any, Literal
13
 
 
21
  from PIL import ExifTags, Image, ImageChops, ImageStat
22
  from pydantic import BaseModel, Field, field_validator
23
 
24
+ try:
25
+ import torch
26
+ from transformers import AutoImageProcessor, AutoModelForImageClassification
27
+
28
+ HF_DETECTOR_RUNTIME_AVAILABLE = True
29
+ except Exception:
30
+ torch = None
31
+ AutoImageProcessor = None
32
+ AutoModelForImageClassification = None
33
+ HF_DETECTOR_RUNTIME_AVAILABLE = False
34
+
35
 
36
  AGENT_SYSTEM_PROMPT = """
37
  You are an image-forensics coordinator.
 
40
  Rules:
41
  - Prefer gathering evidence before reaching a conclusion.
42
  - Prefer calling the common-sense consistency tool early when image logic/plausibility matters.
43
+ - Usually call inspect_image_metadata and run_pretrained_hf_detectors early, then at least one deterministic local forensic tool and one vision probe.
44
  - For vision probe calls, prefer multi-region inspection to cover diverse clues (faces, text, boundaries, limbs, animals, reflections).
45
  - Stop calling tools once you have enough evidence. Do not provide the final verdict yourself.
46
  - Keep the process efficient. Usually 2-5 tool calls are enough.
 
51
  You are a senior image-forensics judge.
52
  Review the collected tool evidence and return a final verdict.
53
 
54
+ Output requirements (JSON only):
55
  - classification must be Real or Fake
56
  - confidence must be an integer between 0 and 100
57
+ - summary must be one concise paragraph that combines:
58
+ 1) the final analysis,
59
+ 2) evidence basis for the verdict,
60
+ 3) image-specific forensic reasons
61
+
62
+ Evidence weighting guidance:
63
+ - Treat pretrained detector outputs as auxiliary signals.
64
+ - If detector signals conflict with visual/metadata evidence, lower confidence and reflect uncertainty.
65
+
66
+ Fake means likely AI-generated or not a real photo.
67
+ Real means likely a real photo.
68
+ """.strip()
69
+
70
+
71
+ FAST_SCREENING_SYSTEM_PROMPT = """
72
+ You are a fast image-authenticity screener.
73
+ Task:
74
+ - Determine whether the image is likely a real photo or likely AI-generated synthetic imagery.
75
+ - Use only visible evidence in the image.
76
+
77
+ Return strict JSON only:
78
+ {
79
+ "assessment": "REAL|AI_GENERATED|UNSURE",
80
+ "confidence": 0-100,
81
+ "summary": "one concise paragraph"
82
+ }
83
 
84
+ Definitions:
85
+ - REAL: likely captured by a real camera from the physical world (normal edits/compression are allowed).
86
+ - AI_GENERATED: likely primarily synthesized by a generative model.
87
+
88
+ Decision policy:
89
+ - Prefer speed over exhaustive analysis.
90
+ - Use strong visual cues only: anatomy coherence, text fidelity, geometry/perspective, reflections/shadows,
91
+ object boundaries, repeated texture patterns, and local artifacts.
92
+ - Output REAL or AI_GENERATED only if evidence is clearly one-sided.
93
+ - Output UNSURE when evidence is mixed/weak/insufficient.
94
+ - summary must include main cues and uncertainty in one short paragraph.
95
  """.strip()
96
 
97
 
 
231
  ]
232
 
233
 
234
+ CACHE_SCHEMA_VERSION = "detector_cache_v5"
235
  CACHE_DEFAULT_TTL_SECONDS = 6 * 60 * 60
236
  GRAPH_RECURSION_LIMIT_DEFAULT = 24
237
+ FAST_SCREENING_ENABLED = True
238
+ FAST_SCREENING_CONFIDENCE_THRESHOLD = 80
239
+ HF_PRETRAINED_DETECTOR_MODELS = [
240
+ {
241
+ "name": "organika_sdxl_detector",
242
+ "repo_id": "Organika/sdxl-detector",
243
+ },
244
+ {
245
+ "name": "haywoodsloan_ai_image_detector_dev_deploy",
246
+ "repo_id": "haywoodsloan/ai-image-detector-dev-deploy",
247
+ },
248
+ ]
249
 
250
 
251
  def _normalize_final_classification(value: Any) -> str:
 
267
  return max(0, min(100, numeric))
268
 
269
 
270
+ def _normalize_non_empty_text(value: Any, fallback: str) -> str:
271
+ text = str(value or "").strip()
272
+ if not text:
273
+ return fallback
274
+ return text
275
+
276
+
277
+ def _is_real_like_label(label: str) -> bool:
278
+ normalized = label.strip().lower()
279
+ return normalized in {"real", "authentic", "natural", "photo", "photograph", "camera"}
280
+
281
+
282
  class DetectorVerdict(BaseModel):
283
  classification: Literal["Real", "Fake"]
284
  confidence: int = Field(ge=0, le=100)
285
+ summary: str = ""
286
 
287
  @field_validator("classification", mode="before")
288
  @classmethod
 
294
  def normalize_confidence(cls, value: Any) -> int:
295
  return _normalize_final_confidence(value)
296
 
297
+ @field_validator("summary", mode="before")
298
+ @classmethod
299
+ def normalize_summary(cls, value: Any) -> str:
300
+ return _normalize_non_empty_text(
301
+ value,
302
+ "Insufficient evidence for a detailed summary.",
303
+ )
304
+
305
 
306
  def _build_openai_client() -> OpenAI:
307
  api_key = os.getenv("OPENAI_API_KEY")
 
423
  [
424
  AGENT_SYSTEM_PROMPT,
425
  SYNTHESIS_SYSTEM_PROMPT,
426
+ FAST_SCREENING_SYSTEM_PROMPT,
427
  VISION_TOOL_PROMPT,
428
  VISION_REGION_DISCOVERY_PROMPT,
429
  COMMON_SENSE_SYSTEM_PROMPT,
 
436
  "image_digest": image_digest,
437
  "model": model,
438
  "base_url": base_url,
439
+ "fast_screening_enabled": FAST_SCREENING_ENABLED,
440
+ "fast_screening_confidence_threshold": FAST_SCREENING_CONFIDENCE_THRESHOLD,
441
+ "hf_pretrained_detector_repos": [item["repo_id"] for item in HF_PRETRAINED_DETECTOR_MODELS],
442
  "prompt_fingerprint": prompt_fingerprint,
443
  }
444
  raw = json.dumps(payload, ensure_ascii=False, sort_keys=True)
 
469
  except Exception:
470
  pass
471
  return None
472
+ classification = _normalize_final_classification(result.get("classification"))
473
+ confidence = _normalize_final_confidence(result.get("confidence"))
474
+ summary_value = result.get("summary")
475
+ if not str(summary_value or "").strip():
476
+ legacy_parts = [
477
+ str(result.get("final_analysis") or "").strip(),
478
+ str(result.get("judgment_basis") or "").strip(),
479
+ str(result.get("image_analysis_reason") or "").strip(),
480
+ ]
481
+ legacy_parts = [part for part in legacy_parts if part]
482
+ if legacy_parts:
483
+ summary_value = " ".join(legacy_parts)
484
+ summary = _normalize_non_empty_text(
485
+ summary_value,
486
+ f"Cached verdict: {classification} (confidence={confidence}).",
487
+ )
488
  return {
489
+ "classification": classification,
490
+ "confidence": confidence,
491
+ "summary": summary,
492
  }
493
 
494
 
 
936
  return default
937
 
938
 
939
+ def _normalize_fast_screening_result(data: dict[str, Any]) -> dict[str, Any]:
940
+ assessment = str(data.get("assessment", "UNSURE")).upper()
941
+ if assessment not in {"REAL", "AI_GENERATED", "UNSURE"}:
942
+ assessment = "UNSURE"
943
+
944
+ confidence = _normalize_final_confidence(data.get("confidence"))
945
+ summary = _normalize_non_empty_text(
946
+ data.get("summary"),
947
+ "Fast screening was inconclusive.",
948
+ )
949
+ return {
950
+ "assessment": assessment,
951
+ "confidence": confidence,
952
+ "summary": summary,
953
+ }
954
+
955
+
956
+ def _fast_screening_detector_signal_text(pretrained_signal: dict[str, Any] | None) -> str:
957
+ if not isinstance(pretrained_signal, dict):
958
+ return "No pretrained detector signal available."
959
+
960
+ aggregate = pretrained_signal.get("aggregate", {})
961
+ if not isinstance(aggregate, dict):
962
+ aggregate = {}
963
+
964
+ lines: list[str] = [
965
+ "Pretrained detector auxiliary signals (label 'artificial' => AI-generated):",
966
+ (
967
+ f"- aggregate.overall_hint={aggregate.get('overall_hint', 'UNCERTAIN')}, "
968
+ f"aggregate.confidence={aggregate.get('confidence', 0)}, "
969
+ f"aggregate.mean_artificial_probability={aggregate.get('mean_artificial_probability', None)}"
970
+ ),
971
+ ]
972
+
973
+ detectors = pretrained_signal.get("detectors", [])
974
+ if isinstance(detectors, list):
975
+ for item in detectors[:4]:
976
+ if not isinstance(item, dict):
977
+ continue
978
+ lines.append(
979
+ (
980
+ f"- {item.get('name', 'unknown')}: "
981
+ f"label={item.get('predicted_label', 'N/A')}, "
982
+ f"confidence={item.get('predicted_confidence', 'N/A')}, "
983
+ f"artificial_probability={item.get('artificial_probability', 'N/A')}, "
984
+ f"signal={item.get('signal', 'UNCERTAIN')}"
985
+ )
986
+ )
987
+
988
+ failures = pretrained_signal.get("load_failures", [])
989
+ if isinstance(failures, list) and failures:
990
+ lines.append(f"- load_failures={len(failures)}")
991
+ return "\n".join(lines)
992
+
993
+
994
+ def _run_fast_screening(
995
+ image: Image.Image,
996
+ pretrained_signal: dict[str, Any] | None = None,
997
+ ) -> dict[str, Any]:
998
+ client = _build_openai_client()
999
+ model = os.getenv("OPENAI_MODEL", "gpt-5-mini")
1000
+ data_url = _image_to_data_url(image)
1001
+ detector_signal_text = _fast_screening_detector_signal_text(pretrained_signal)
1002
+
1003
+ response = client.responses.create(
1004
+ model=model,
1005
+ input=[
1006
+ {"role": "system", "content": [{"type": "input_text", "text": FAST_SCREENING_SYSTEM_PROMPT}]},
1007
+ {
1008
+ "role": "user",
1009
+ "content": [
1010
+ {
1011
+ "type": "input_text",
1012
+ "text": (
1013
+ "Classify image authenticity in one quick pass: REAL vs AI_GENERATED vs UNSURE. "
1014
+ "This is a real-photo-vs-AI-generated decision task.\n"
1015
+ "Use pretrained detector signals below as auxiliary evidence (not as absolute truth):\n"
1016
+ f"{detector_signal_text}\n"
1017
+ "Return strict JSON only."
1018
+ ),
1019
+ },
1020
+ {"type": "input_image", "image_url": data_url},
1021
+ ],
1022
+ },
1023
+ ],
1024
+ )
1025
+
1026
+ raw_text = _extract_output_text(response)
1027
+ parsed = _safe_parse_json(raw_text)
1028
+ if parsed is None:
1029
+ return {
1030
+ "assessment": "UNSURE",
1031
+ "confidence": 35,
1032
+ "summary": "Fast screening returned non-JSON output; escalate to full forensic workflow.",
1033
+ "error": raw_text or "(empty response)",
1034
+ }
1035
+ return _normalize_fast_screening_result(parsed)
1036
+
1037
+
1038
+ @lru_cache(maxsize=1)
1039
+ def _load_hf_pretrained_detectors() -> dict[str, Any]:
1040
+ if not HF_DETECTOR_RUNTIME_AVAILABLE or torch is None:
1041
+ raise RuntimeError("transformers/torch runtime is unavailable")
1042
+
1043
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1044
+ loaded: list[dict[str, Any]] = []
1045
+ failures: list[dict[str, str]] = []
1046
+ for item in HF_PRETRAINED_DETECTOR_MODELS:
1047
+ repo_id = item["repo_id"]
1048
+ name = item["name"]
1049
+ try:
1050
+ processor = AutoImageProcessor.from_pretrained(repo_id) # type: ignore[union-attr]
1051
+ model = AutoModelForImageClassification.from_pretrained(repo_id) # type: ignore[union-attr]
1052
+ model.to(device)
1053
+ model.eval()
1054
+
1055
+ id2label_raw = getattr(model.config, "id2label", {}) or {}
1056
+ id2label: dict[int, str] = {}
1057
+ for key, value in id2label_raw.items():
1058
+ try:
1059
+ index = int(key)
1060
+ except Exception:
1061
+ index = int(_to_float(key, -1))
1062
+ if index < 0:
1063
+ continue
1064
+ id2label[index] = str(value)
1065
+ loaded.append(
1066
+ {
1067
+ "name": name,
1068
+ "repo_id": repo_id,
1069
+ "processor": processor,
1070
+ "model": model,
1071
+ "id2label": id2label,
1072
+ }
1073
+ )
1074
+ except Exception as exc:
1075
+ failures.append({"name": name, "repo_id": repo_id, "error": f"{type(exc).__name__}: {exc}"})
1076
+
1077
+ return {"device": device, "detectors": loaded, "load_failures": failures}
1078
+
1079
+
1080
+ def _predict_single_hf_detector(runtime: dict[str, Any], image: Image.Image) -> dict[str, Any]:
1081
+ if not HF_DETECTOR_RUNTIME_AVAILABLE or torch is None:
1082
+ return {
1083
+ "name": runtime.get("name", "unknown"),
1084
+ "repo_id": runtime.get("repo_id", ""),
1085
+ "available": False,
1086
+ "error": "transformers/torch runtime is unavailable",
1087
+ }
1088
+
1089
+ name = runtime["name"]
1090
+ repo_id = runtime["repo_id"]
1091
+ processor = runtime["processor"]
1092
+ model = runtime["model"]
1093
+ id2label = runtime["id2label"]
1094
+ device = next(model.parameters()).device
1095
+
1096
+ inputs = processor(images=image.convert("RGB"), return_tensors="pt")
1097
+ inputs = {key: value.to(device) for key, value in inputs.items()}
1098
+
1099
+ with torch.no_grad():
1100
+ logits = model(**inputs).logits
1101
+ probs = torch.softmax(logits, dim=-1)[0]
1102
+
1103
+ top_index = int(torch.argmax(probs).item())
1104
+ top_confidence = float(probs[top_index].item())
1105
+ top_label = str(id2label.get(top_index, str(top_index)))
1106
+ top_label_normalized = top_label.strip().lower()
1107
+
1108
+ artificial_index: int | None = None
1109
+ for idx, label in id2label.items():
1110
+ if str(label).strip().lower() == "artificial":
1111
+ artificial_index = idx
1112
+ break
1113
+
1114
+ artificial_probability: float | None = None
1115
+ real_probability: float | None = None
1116
+ if artificial_index is not None:
1117
+ artificial_probability = float(probs[artificial_index].item())
1118
+ real_probability = max(0.0, min(1.0, 1.0 - artificial_probability))
1119
+ elif top_label_normalized == "artificial":
1120
+ artificial_probability = top_confidence
1121
+ real_probability = max(0.0, min(1.0, 1.0 - artificial_probability))
1122
+ elif _is_real_like_label(top_label_normalized):
1123
+ real_probability = top_confidence
1124
+ artificial_probability = max(0.0, min(1.0, 1.0 - real_probability))
1125
+
1126
+ signal = "UNCERTAIN"
1127
+ if artificial_probability is not None:
1128
+ if artificial_probability >= 0.75:
1129
+ signal = "AI_HINT"
1130
+ elif artificial_probability <= 0.25:
1131
+ signal = "REAL_HINT"
1132
+ elif top_label_normalized == "artificial" and top_confidence >= 0.75:
1133
+ signal = "AI_HINT"
1134
+ elif _is_real_like_label(top_label_normalized) and top_confidence >= 0.75:
1135
+ signal = "REAL_HINT"
1136
+
1137
+ class_probabilities: list[dict[str, Any]] = []
1138
+ for idx in range(int(probs.shape[-1])):
1139
+ label = str(id2label.get(idx, str(idx)))
1140
+ class_probabilities.append(
1141
+ {
1142
+ "label": label,
1143
+ "probability": round(float(probs[idx].item()) * 100, 2),
1144
+ }
1145
+ )
1146
+ class_probabilities.sort(key=lambda item: item["probability"], reverse=True)
1147
+
1148
+ return {
1149
+ "name": name,
1150
+ "repo_id": repo_id,
1151
+ "available": True,
1152
+ "predicted_label": top_label,
1153
+ "predicted_confidence": round(top_confidence * 100, 2),
1154
+ "artificial_probability": None if artificial_probability is None else round(artificial_probability * 100, 2),
1155
+ "real_probability": None if real_probability is None else round(real_probability * 100, 2),
1156
+ "signal": signal,
1157
+ "class_probabilities": class_probabilities[:5],
1158
+ "label_interpretation": "Label 'artificial' is interpreted as AI-generated imagery.",
1159
+ }
1160
+
1161
+
1162
+ def _aggregate_hf_detector_predictions(predictions: list[dict[str, Any]]) -> dict[str, Any]:
1163
+ valid = [item for item in predictions if item.get("available") is True]
1164
+ if not valid:
1165
+ return {
1166
+ "available_detector_count": 0,
1167
+ "overall_hint": "UNCERTAIN",
1168
+ "confidence": 0,
1169
+ "mean_artificial_probability": None,
1170
+ "observation": "No pretrained detector predictions are available.",
1171
+ }
1172
+
1173
+ artificial_probs: list[float] = []
1174
+ ai_hint_count = 0
1175
+ real_hint_count = 0
1176
+
1177
+ for item in valid:
1178
+ ap_raw = item.get("artificial_probability")
1179
+ if ap_raw is not None:
1180
+ artificial_probs.append(max(0.0, min(100.0, float(ap_raw))) / 100.0)
1181
+ signal = str(item.get("signal", "UNCERTAIN"))
1182
+ if signal == "AI_HINT":
1183
+ ai_hint_count += 1
1184
+ elif signal == "REAL_HINT":
1185
+ real_hint_count += 1
1186
+
1187
+ mean_ap: float | None = None
1188
+ if artificial_probs:
1189
+ mean_ap = float(sum(artificial_probs) / len(artificial_probs))
1190
+
1191
+ if mean_ap is not None:
1192
+ if mean_ap >= 0.65:
1193
+ overall_hint = "AI_GENERATED"
1194
+ elif mean_ap <= 0.35:
1195
+ overall_hint = "REAL"
1196
+ else:
1197
+ overall_hint = "UNCERTAIN"
1198
+ confidence = int(round(min(100.0, max(0.0, abs(mean_ap - 0.5) * 200.0))))
1199
+ else:
1200
+ if ai_hint_count > real_hint_count:
1201
+ overall_hint = "AI_GENERATED"
1202
+ confidence = 60
1203
+ elif real_hint_count > ai_hint_count:
1204
+ overall_hint = "REAL"
1205
+ confidence = 60
1206
+ else:
1207
+ overall_hint = "UNCERTAIN"
1208
+ confidence = 40
1209
+
1210
+ return {
1211
+ "available_detector_count": len(valid),
1212
+ "overall_hint": overall_hint,
1213
+ "confidence": confidence,
1214
+ "mean_artificial_probability": None if mean_ap is None else round(mean_ap * 100, 2),
1215
+ "ai_hint_count": ai_hint_count,
1216
+ "real_hint_count": real_hint_count,
1217
+ "observation": (
1218
+ "Both pretrained detectors treat label 'artificial' as the AI-image signal. "
1219
+ "Use this output as auxiliary evidence, not standalone proof."
1220
+ ),
1221
+ }
1222
+
1223
+
1224
+ def _run_pretrained_hf_detectors(image: Image.Image) -> dict[str, Any]:
1225
+ if not HF_DETECTOR_RUNTIME_AVAILABLE:
1226
+ return {
1227
+ "runtime_available": False,
1228
+ "detectors": [],
1229
+ "aggregate": {
1230
+ "available_detector_count": 0,
1231
+ "overall_hint": "UNCERTAIN",
1232
+ "confidence": 0,
1233
+ "mean_artificial_probability": None,
1234
+ "observation": "transformers/torch is not available in runtime.",
1235
+ },
1236
+ "load_failures": [
1237
+ {"name": "runtime", "repo_id": "", "error": "transformers/torch is not installed or import failed"}
1238
+ ],
1239
+ }
1240
+
1241
+ runtime = _load_hf_pretrained_detectors()
1242
+ detectors = runtime.get("detectors", [])
1243
+ predictions: list[dict[str, Any]] = []
1244
+ for detector in detectors:
1245
+ try:
1246
+ predictions.append(_predict_single_hf_detector(detector, image=image))
1247
+ except Exception as exc:
1248
+ predictions.append(
1249
+ {
1250
+ "name": detector.get("name", "unknown"),
1251
+ "repo_id": detector.get("repo_id", ""),
1252
+ "available": False,
1253
+ "error": f"{type(exc).__name__}: {exc}",
1254
+ }
1255
+ )
1256
+ aggregate = _aggregate_hf_detector_predictions(predictions)
1257
+ return {
1258
+ "runtime_available": True,
1259
+ "device": runtime.get("device"),
1260
+ "detectors": predictions,
1261
+ "aggregate": aggregate,
1262
+ "load_failures": runtime.get("load_failures", []),
1263
+ }
1264
+
1265
+
1266
  def _normalize_discovered_regions(payload: dict[str, Any], max_regions: int) -> list[dict[str, Any]]:
1267
  raw_regions = payload.get("regions", [])
1268
  if not isinstance(raw_regions, list):
 
1522
  return "\n\n".join(lines)
1523
 
1524
 
1525
+ def _build_tools(
1526
+ image: Image.Image,
1527
+ image_path: str | None = None,
1528
+ pretrained_hf_signal: dict[str, Any] | None = None,
1529
+ ) -> list[Any]:
1530
  @tool
1531
  def common_sense_consistency_probe() -> str:
1532
  """Analyze full-image common-sense consistency (anatomy, text, physics, geometry, lighting, reflections)."""
 
1537
  """Use exiftool to extract selected metadata tags and summarize forensic clues."""
1538
  return _json_dumps(_metadata_report(image, image_path=image_path))
1539
 
1540
+ @tool
1541
+ def run_pretrained_hf_detectors() -> str:
1542
+ """Run two pretrained Hugging Face AI-image detectors and return their predictions as auxiliary forensic signals."""
1543
+ if isinstance(pretrained_hf_signal, dict):
1544
+ return _json_dumps(pretrained_hf_signal)
1545
+ return _json_dumps(_run_pretrained_hf_detectors(image=image))
1546
+
1547
  @tool
1548
  def run_low_level_forensics() -> str:
1549
  """Run low-level image heuristics such as texture smoothness, saturation, clipping, and ELA artifacts."""
 
1564
  return [
1565
  common_sense_consistency_probe,
1566
  inspect_image_metadata,
1567
+ run_pretrained_hf_detectors,
1568
  run_low_level_forensics,
1569
  vision_specialist_probe,
1570
  ]
1571
 
1572
 
1573
+ def _build_graph(
1574
+ image: Image.Image,
1575
+ image_path: str | None = None,
1576
+ pretrained_hf_signal: dict[str, Any] | None = None,
1577
+ ):
1578
+ tools = _build_tools(
1579
+ image,
1580
+ image_path=image_path,
1581
+ pretrained_hf_signal=pretrained_hf_signal,
1582
+ )
1583
  llm = _build_langchain_model()
1584
  llm_with_tools = llm.bind_tools(tools)
1585
  tool_node = ToolNode(tools, handle_tool_errors=True)
 
1677
  if cached is not None:
1678
  return dict(cached)
1679
 
1680
+ pretrained_hf_signal: dict[str, Any] | None = None
1681
+
1682
+ if FAST_SCREENING_ENABLED:
1683
+ try:
1684
+ pretrained_hf_signal = _run_pretrained_hf_detectors(image=image)
1685
+ except Exception:
1686
+ pretrained_hf_signal = None
1687
+
1688
+ try:
1689
+ fast = _run_fast_screening(
1690
+ image=image,
1691
+ pretrained_signal=pretrained_hf_signal,
1692
+ )
1693
+ threshold = FAST_SCREENING_CONFIDENCE_THRESHOLD
1694
+ fast_assessment = str(fast.get("assessment", "UNSURE")).upper()
1695
+ fast_confidence = _normalize_final_confidence(fast.get("confidence"))
1696
+ if fast_assessment in {"REAL", "AI_GENERATED"} and fast_confidence >= threshold:
1697
+ detector_hint_text = ""
1698
+ if isinstance(pretrained_hf_signal, dict):
1699
+ aggregate = pretrained_hf_signal.get("aggregate", {})
1700
+ if isinstance(aggregate, dict):
1701
+ detector_hint_text = (
1702
+ f" Auxiliary pretrained-detector hint="
1703
+ f"{aggregate.get('overall_hint', 'UNCERTAIN')} "
1704
+ f"(confidence={aggregate.get('confidence', 0)})."
1705
+ )
1706
+ result = {
1707
+ "classification": "Real" if fast_assessment == "REAL" else "Fake",
1708
+ "confidence": fast_confidence,
1709
+ "summary": _normalize_non_empty_text(
1710
+ f"{fast.get('summary', '')}{detector_hint_text}".strip(),
1711
+ "Fast screening reached a decisive result.",
1712
+ ),
1713
+ }
1714
+ if cache_enabled and cache_key:
1715
+ try:
1716
+ _save_cached_result(cache_key=cache_key, result=result)
1717
+ except Exception:
1718
+ pass
1719
+ return result
1720
+ except Exception:
1721
+ pass
1722
+
1723
+ graph = _build_graph(
1724
+ image,
1725
+ image_path=image_path,
1726
+ pretrained_hf_signal=pretrained_hf_signal,
1727
+ )
1728
  initial_message = HumanMessage(
1729
  content=(
1730
  "Analyze the uploaded image with the available tools. "
 
1738
  result = {
1739
  "classification": _normalize_final_classification(verdict.classification),
1740
  "confidence": _normalize_final_confidence(verdict.confidence),
1741
+ "summary": _normalize_non_empty_text(
1742
+ verdict.summary,
1743
+ "Insufficient evidence for a detailed summary.",
1744
+ ),
1745
  }
1746
  if cache_enabled and cache_key:
1747
  try:
requirements.txt CHANGED
@@ -6,3 +6,5 @@ numpy>=1.26.0
6
  openai>=1.40.0
7
  Pillow>=10.0.0
8
  pydantic>=2.7.0
 
 
 
6
  openai>=1.40.0
7
  Pillow>=10.0.0
8
  pydantic>=2.7.0
9
+ torch>=2.3.0
10
+ transformers>=4.44.0