Spyderzz committed on
Commit
4c9e797
·
1 Parent(s): e2bfbf6

fix analyze cache and language options

Browse files
Files changed (1) hide show
  1. api/v1/analyze.py +463 -84
api/v1/analyze.py CHANGED
@@ -1,12 +1,10 @@
1
- from __future__ import annotations
2
-
3
  import json
4
  import os
5
  import time
6
  import uuid
7
  from datetime import datetime, timezone
8
 
9
- from fastapi import APIRouter, Body, Depends, File, UploadFile
10
  from pydantic import BaseModel
11
  from loguru import logger
12
  from sqlalchemy.orm import Session
@@ -40,11 +38,10 @@ from services.screenshot_service import (
40
  )
41
  from services.ela_service import generate_ela_base64
42
  from services.exif_service import extract_exif
43
- from services.image_service import load_image_from_bytes
44
  from services.llm_explainer import generate_llm_summary
45
  from schemas.common import ProcessingSummary, Verdict
46
  from services.artifact_detector import scan_artifacts
47
- from services.image_service import preprocess_and_classify
48
  from services.news_lookup import search_news_full
49
  from services.vlm_breakdown import generate_vlm_breakdown
50
  from services.text_service import (
@@ -55,8 +52,22 @@ from services.text_service import (
55
  score_sensationalism,
56
  )
57
  from services.video_service import analyze_video
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
59
- from utils.scoring import compute_authenticity_score, get_verdict_label
60
 
61
  router = APIRouter(prefix="/analyze", tags=["analyze"])
62
 
@@ -64,9 +75,34 @@ IMAGE_MAX_MB = 20
64
  VIDEO_MAX_MB = 100
65
  VIDEO_NUM_FRAMES = 16
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  @router.post("/image", response_model=ImageAnalysisResponse)
 
 
69
  async def analyze_image(
 
 
 
 
70
  file: UploadFile = File(...),
71
  db: Session = Depends(get_db),
72
  user: User | None = Depends(optional_current_user),
@@ -79,8 +115,16 @@ async def analyze_image(
79
  )
80
  stages.append("validation")
81
 
82
- pil, clf = preprocess_and_classify(raw)
83
- stages.append("classification")
 
 
 
 
 
 
 
 
84
 
85
  indicators = scan_artifacts(pil, raw)
86
  stages.append("artifact_scanning")
@@ -89,7 +133,10 @@ async def analyze_image(
89
  heatmap_status = "success"
90
  heatmap = ""
91
  try:
92
- heatmap = generate_heatmap_base64(pil)
 
 
 
93
  stages.append("heatmap_generation")
94
  except Exception as e: # noqa: BLE001
95
  logger.warning(f"Heatmap generation failed, continuing: {e}")
@@ -119,18 +166,37 @@ async def analyze_image(
119
  except Exception as e: # noqa: BLE001
120
  logger.warning(f"EXIF extraction failed, continuing: {e}")
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  score = compute_authenticity_score(clf.confidence, clf.label)
123
 
124
- # Apply EXIF trust adjustment to the score
125
- if exif_summary and exif_summary.trust_adjustment != 0:
126
- score = int(round(max(0, min(100, score + exif_summary.trust_adjustment))))
 
 
127
 
128
  label, severity = get_verdict_label(score)
129
  duration_ms = int((time.perf_counter() - start) * 1000)
130
 
131
- analysis_id = str(uuid.uuid4())
132
-
133
- response = ImageAnalysisResponse(
134
  analysis_id=analysis_id,
135
  media_type="image",
136
  timestamp=datetime.now(timezone.utc).isoformat(),
@@ -148,6 +214,8 @@ async def analyze_image(
148
  heatmap_status=heatmap_status,
149
  artifact_indicators=indicators,
150
  exif=exif_summary,
 
 
151
  ),
152
  trusted_sources=[],
153
  contradicting_evidence=[],
@@ -155,51 +223,81 @@ async def analyze_image(
155
  stages_completed=stages,
156
  total_duration_ms=duration_ms,
157
  model_used=settings.IMAGE_MODEL_ID,
 
 
158
  ),
159
  )
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  record = AnalysisRecord(
162
  user_id=user.id if user else None,
163
  media_type="image",
164
  verdict=label,
165
  authenticity_score=float(score),
166
- result_json=json.dumps(response.model_dump(
167
  exclude={"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
168
  )),
 
 
 
169
  )
170
  db.add(record)
171
  db.commit()
172
  db.refresh(record)
173
- response.record_id = record.id
174
  logger.info(f"Saved AnalysisRecord id={record.id} score={score} verdict={label}")
175
 
176
- # ── Phase 12: LLM explainability card (runs after DB save so we have record_id) ──
177
- try:
178
- llm_summary = generate_llm_summary(
179
- payload=response.model_dump(
180
- exclude={"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
181
- ),
182
- record_id=str(record.id),
183
- )
184
- response.explainability.llm_summary = llm_summary
185
  stages.append("llm_explanation")
186
- except Exception as e: # noqa: BLE001
187
- logger.warning(f"LLM explainer failed, continuing: {e}")
188
 
189
- # ── Phase 14: VLM detailed breakdown (vision LLM scores 6 perceptual components) ──
190
- try:
191
- vlm_bd = generate_vlm_breakdown(pil, record_id=str(record.id))
192
- if vlm_bd:
193
- response.explainability.vlm_breakdown = vlm_bd
194
- stages.append("vlm_breakdown")
195
- except Exception as e: # noqa: BLE001
196
- logger.warning(f"VLM breakdown failed, continuing: {e}")
197
 
198
- return response
199
 
200
 
201
  @router.post("/video", response_model=VideoAnalysisResponse)
 
 
202
  async def analyze_video_endpoint(
 
 
 
 
203
  file: UploadFile = File(...),
204
  db: Session = Depends(get_db),
205
  user: User | None = Depends(optional_current_user),
@@ -213,27 +311,67 @@ async def analyze_video_endpoint(
213
  )
214
  stages.append("validation")
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  try:
217
  agg = analyze_video(path, num_frames=VIDEO_NUM_FRAMES)
218
  stages.append("frame_extraction")
219
  stages.append("frame_classification")
220
  stages.append("aggregation")
221
- finally:
 
 
222
  try:
223
  os.unlink(path)
224
  except OSError:
225
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- if agg.insufficient_faces:
228
- score = 50
229
- label = "Insufficient face content"
230
- severity = "warning"
231
- else:
232
- score = int(round(max(0.0, min(100.0, (1.0 - agg.mean_suspicious_prob) * 100.0))))
233
- label, severity = get_verdict_label(score)
234
  duration_ms = int((time.perf_counter() - start) * 1000)
235
 
236
- response = VideoAnalysisResponse(
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  analysis_id=str(uuid.uuid4()),
238
  media_type="video",
239
  timestamp=datetime.now(timezone.utc).isoformat(),
@@ -266,47 +404,86 @@ async def analyze_video_endpoint(
266
  )
267
  for f in agg.frames
268
  ],
 
 
 
 
 
269
  ),
270
  processing_summary=ProcessingSummary(
271
  stages_completed=stages,
272
  total_duration_ms=duration_ms,
273
  model_used=settings.IMAGE_MODEL_ID,
 
 
274
  ),
275
  )
276
 
 
 
 
 
 
 
 
 
 
277
  record = AnalysisRecord(
278
  user_id=user.id if user else None,
279
  media_type="video",
280
  verdict=label,
281
  authenticity_score=float(score),
282
- result_json=json.dumps(response.model_dump()),
 
 
 
283
  )
284
  db.add(record)
285
  db.commit()
286
  db.refresh(record)
287
- response.record_id = record.id
288
  logger.info(
289
  f"Saved AnalysisRecord id={record.id} video score={score} verdict={label} "
290
  f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
291
  )
292
 
293
- # Phase 12: LLM explainability card
294
  try:
295
- response.llm_summary = generate_llm_summary(
296
- payload=response.model_dump(), record_id=str(record.id),
 
 
 
 
297
  )
298
  except Exception as e: # noqa: BLE001
299
- logger.warning(f"LLM explainer failed for video: {e}")
 
 
 
 
 
 
 
 
 
 
300
 
301
- return response
302
 
303
 
304
  class TextAnalyzeBody(BaseModel):
305
  text: str
 
 
306
 
307
 
308
  @router.post("/text", response_model=TextAnalysisResponse)
 
 
309
  async def analyze_text_endpoint(
 
 
310
  body: TextAnalyzeBody = Body(...),
311
  db: Session = Depends(get_db),
312
  user: User | None = Depends(optional_current_user),
@@ -315,7 +492,7 @@ async def analyze_text_endpoint(
315
  stages: list[str] = []
316
 
317
  # Phase 13: language detection — routes to multilang model when non-English
318
- lang = detect_language(body.text)
319
  stages.append("language_detection")
320
 
321
  clf = classify_text(body.text, language=lang)
@@ -345,10 +522,12 @@ async def analyze_text_endpoint(
345
  effective_fake_prob = news.truth_override.fake_prob_after
346
  stages.append("truth_override_applied")
347
 
348
- # Weighted score: 70% classifier + 20% inverse sensationalism + 10% manipulation penalty
 
349
  manip_penalty = min(len(manip) * 5, 30)
350
  raw_score = (1.0 - effective_fake_prob) * 100.0
351
- weighted = raw_score * 0.70 + max(0, 100 - sens.score) * 0.20 + max(0, 100 - manip_penalty) * 0.10
 
352
  score = int(round(max(0.0, min(100.0, weighted))))
353
  label, severity = get_verdict_label(score)
354
  duration_ms = int((time.perf_counter() - start) * 1000)
@@ -358,7 +537,7 @@ async def analyze_text_endpoint(
358
  else settings.TEXT_MODEL_ID
359
  )
360
 
361
- response = TextAnalysisResponse(
362
  analysis_id=str(uuid.uuid4()),
363
  media_type="text",
364
  timestamp=datetime.now(timezone.utc).isoformat(),
@@ -403,6 +582,7 @@ async def analyze_text_endpoint(
403
  stages_completed=stages,
404
  total_duration_ms=duration_ms,
405
  model_used=model_used,
 
406
  ),
407
  )
408
 
@@ -411,27 +591,30 @@ async def analyze_text_endpoint(
411
  media_type="text",
412
  verdict=label,
413
  authenticity_score=float(score),
414
- result_json=json.dumps(response.model_dump()),
415
  )
416
  db.add(record)
417
  db.commit()
418
  db.refresh(record)
419
- response.record_id = record.id
420
  logger.info(f"Saved AnalysisRecord id={record.id} text score={score} verdict={label}")
421
 
422
- # Phase 12: LLM explainability card
423
- try:
424
- response.llm_summary = generate_llm_summary(
425
- payload=response.model_dump(), record_id=str(record.id),
426
- )
427
- except Exception as e: # noqa: BLE001
428
- logger.warning(f"LLM explainer failed for text: {e}")
429
 
430
- return response
431
 
432
 
433
  @router.post("/screenshot", response_model=ScreenshotAnalysisResponse)
 
 
434
  async def analyze_screenshot_endpoint(
 
 
 
 
435
  file: UploadFile = File(...),
436
  db: Session = Depends(get_db),
437
  user: User | None = Depends(optional_current_user),
@@ -444,6 +627,15 @@ async def analyze_screenshot_endpoint(
444
  )
445
  stages.append("validation")
446
 
 
 
 
 
 
 
 
 
 
447
  pil = load_image_from_bytes(raw)
448
  ocr_boxes = run_ocr(pil)
449
  stages.append("ocr")
@@ -451,7 +643,7 @@ async def analyze_screenshot_endpoint(
451
  full_text = extract_full_text(ocr_boxes)
452
 
453
  # Phase 13: language detection on extracted OCR text
454
- lang = detect_language(full_text) if full_text else "en"
455
  stages.append("language_detection")
456
 
457
  clf = classify_text(full_text, language=lang) if full_text else None
@@ -493,12 +685,12 @@ async def analyze_screenshot_endpoint(
493
  manip_penalty = min(len(manip) * 5, 30)
494
  layout_penalty = min(len(layout) * 5, 15)
495
  raw_score = (1.0 - effective_fake_prob) * 100.0
496
- weighted = (
497
- raw_score * 0.65
498
- + max(0, 100 - sens.score) * 0.20
499
- + max(0, 100 - manip_penalty) * 0.10
500
- + max(0, 100 - layout_penalty) * 0.05
501
  )
 
502
  if not full_text.strip():
503
  weighted = 50
504
  score = int(round(max(0.0, min(100.0, weighted))))
@@ -511,7 +703,7 @@ async def analyze_screenshot_endpoint(
511
  else f"{settings.TEXT_MODEL_ID} + EasyOCR"
512
  )
513
 
514
- response = ScreenshotAnalysisResponse(
515
  analysis_id=str(uuid.uuid4()),
516
  media_type="screenshot",
517
  timestamp=datetime.now(timezone.utc).isoformat(),
@@ -554,28 +746,215 @@ async def analyze_screenshot_endpoint(
554
  stages_completed=stages,
555
  total_duration_ms=duration_ms,
556
  model_used=model_used_str,
 
557
  ),
558
  )
559
 
 
 
 
 
 
 
 
 
 
 
560
  record = AnalysisRecord(
561
  user_id=user.id if user else None,
562
  media_type="screenshot",
563
  verdict=label,
564
  authenticity_score=float(score),
565
- result_json=json.dumps(response.model_dump()),
 
 
 
566
  )
567
  db.add(record)
568
  db.commit()
569
  db.refresh(record)
570
- response.record_id = record.id
571
  logger.info(f"Saved AnalysisRecord id={record.id} screenshot score={score} verdict={label}")
572
 
573
- # Phase 12: LLM explainability card
574
- try:
575
- response.llm_summary = generate_llm_summary(
576
- payload=response.model_dump(), record_id=str(record.id),
577
- )
578
- except Exception as e: # noqa: BLE001
579
- logger.warning(f"LLM explainer failed for screenshot: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
 
581
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
  import time
4
  import uuid
5
  from datetime import datetime, timezone
6
 
7
+ from fastapi import APIRouter, BackgroundTasks, Body, Depends, File, HTTPException, Query, Request, Response, UploadFile, status
8
  from pydantic import BaseModel
9
  from loguru import logger
10
  from sqlalchemy.orm import Session
 
38
  )
39
  from services.ela_service import generate_ela_base64
40
  from services.exif_service import extract_exif
41
+ from services.image_service import classify_image, load_image_from_bytes
42
  from services.llm_explainer import generate_llm_summary
43
  from schemas.common import ProcessingSummary, Verdict
44
  from services.artifact_detector import scan_artifacts
 
45
  from services.news_lookup import search_news_full
46
  from services.vlm_breakdown import generate_vlm_breakdown
47
  from services.text_service import (
 
52
  score_sensationalism,
53
  )
54
  from services.video_service import analyze_video
55
+ from services.audio_service import analyze_audio, AudioAnalysis
56
+ from services.metadata_writer import write_verdict_metadata
57
+ from services.rate_limit import ANON_ANALYZE, AUTH_ANALYZE, is_anon, is_authed, limiter
58
+ from services.dedup_cache import lookup_cached, cached_payload
59
+ from services.storage import (
60
+ make_image_thumbnail,
61
+ make_video_thumbnail,
62
+ save_bytes,
63
+ save_file,
64
+ save_overlay,
65
+ sha256_bytes,
66
+ sha256_file,
67
+ )
68
+ from services.job_queue import registry as job_registry, run_job
69
  from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
70
+ from utils.scoring import compute_authenticity_score, compute_video_authenticity_score, get_verdict_label
71
 
72
  router = APIRouter(prefix="/analyze", tags=["analyze"])
73
 
 
75
  VIDEO_MAX_MB = 100
76
  VIDEO_NUM_FRAMES = 16
77
 
78
+ _IMAGE_EXCLUDE = {"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
79
+
80
+
81
+ def _resolve_language_hint(text: str, language_hint: str | None) -> str:
82
+ hint = (language_hint or "auto").strip().lower()
83
+ if hint and hint != "auto":
84
+ return hint
85
+ return detect_language(text)
86
+
87
+
88
def _compute_llm_summary(resp, *, record_id: int, user, media_kind: str, exclude: dict | None = None):
    """Return the LLM explainability summary for `resp`, or None on failure.

    `resp` is dumped with `model_dump` (honouring `exclude` when it is
    truthy) and handed to `generate_llm_summary` together with the
    stringified `record_id`. Any exception raised along the way is logged
    as a warning tagged with `media_kind` and swallowed, so a provider
    outage never fails the surrounding request.

    NOTE(review): `user` is accepted but never read in this helper, while
    the call sites describe this step as authed-users-only. Confirm whether
    an anonymous-user guard is intended here.
    """
    try:
        # Only forward `exclude` when one was actually supplied.
        dump_kwargs = {"exclude": exclude} if exclude else {}
        return generate_llm_summary(
            payload=resp.model_dump(**dump_kwargs),
            record_id=str(record_id),
        )
    except Exception as e:  # noqa: BLE001
        logger.warning(f"LLM explainer failed for {media_kind}: {e}")
        return None
96
+
97
 
98
  @router.post("/image", response_model=ImageAnalysisResponse)
99
+ @limiter.limit(ANON_ANALYZE, exempt_when=is_authed)
100
+ @limiter.limit(AUTH_ANALYZE, exempt_when=is_anon)
101
  async def analyze_image(
102
+ request: Request,
103
+ response: Response,
104
+ cache: bool = Query(default=True),
105
+ language_hint: str = Query(default="auto"),
106
  file: UploadFile = File(...),
107
  db: Session = Depends(get_db),
108
  user: User | None = Depends(optional_current_user),
 
115
  )
116
  stages.append("validation")
117
 
118
+ # Phase 19.1 — SHA-256 dedup cache
119
+ media_hash = sha256_bytes(raw)
120
+ cached = lookup_cached(db, media_hash=media_hash, media_type="image", user_id=user.id if user else None) if cache else None
121
+ if cached is not None:
122
+ payload = cached_payload(cached)
123
+ if payload is not None:
124
+ logger.info(f"cache hit image sha={media_hash[:12]} record={cached.id}")
125
+ return ImageAnalysisResponse.model_validate(payload)
126
+
127
+ pil = load_image_from_bytes(raw)
128
 
129
  indicators = scan_artifacts(pil, raw)
130
  stages.append("artifact_scanning")
 
133
  heatmap_status = "success"
134
  heatmap = ""
135
  try:
136
+ model_family = "efficientnet" if settings.ENSEMBLE_MODE else "vit"
137
+ heatmap, heatmap_source = generate_heatmap_base64(pil, model_family=model_family)
138
+ if not heatmap:
139
+ heatmap_status = heatmap_source # "none" or "fallback"
140
  stages.append("heatmap_generation")
141
  except Exception as e: # noqa: BLE001
142
  logger.warning(f"Heatmap generation failed, continuing: {e}")
 
166
  except Exception as e: # noqa: BLE001
167
  logger.warning(f"EXIF extraction failed, continuing: {e}")
168
 
169
+ clf = classify_image(pil, artifact_indicators=indicators, exif=exif_summary)
170
+ stages.append("classification")
171
+
172
+ analysis_id = str(uuid.uuid4())
173
+ vlm_bd = None
174
+ if user is not None and clf.no_face_analysis is not None:
175
+ try:
176
+ vlm_bd = generate_vlm_breakdown(pil, record_id=analysis_id)
177
+ if vlm_bd:
178
+ clf = classify_image(
179
+ pil,
180
+ artifact_indicators=indicators,
181
+ exif=exif_summary,
182
+ vlm_breakdown=vlm_bd,
183
+ )
184
+ stages.append("vlm_no_face_fusion")
185
+ except Exception as e: # noqa: BLE001
186
+ logger.warning(f"VLM no-face fusion failed, continuing: {e}")
187
+
188
  score = compute_authenticity_score(clf.confidence, clf.label)
189
 
190
+ # Apply EXIF trust adjustment.
191
+ # trust_adjustment convention: negative = more real → subtract to RAISE authenticity score.
192
+ # positive = more fake → subtract to LOWER authenticity score.
193
+ if clf.no_face_analysis is None and exif_summary and exif_summary.trust_adjustment != 0:
194
+ score = int(round(max(0, min(100, score - exif_summary.trust_adjustment))))
195
 
196
  label, severity = get_verdict_label(score)
197
  duration_ms = int((time.perf_counter() - start) * 1000)
198
 
199
+ resp = ImageAnalysisResponse(
 
 
200
  analysis_id=analysis_id,
201
  media_type="image",
202
  timestamp=datetime.now(timezone.utc).isoformat(),
 
214
  heatmap_status=heatmap_status,
215
  artifact_indicators=indicators,
216
  exif=exif_summary,
217
+ no_face_analysis=clf.no_face_analysis,
218
+ vlm_breakdown=vlm_bd,
219
  ),
220
  trusted_sources=[],
221
  contradicting_evidence=[],
 
223
  stages_completed=stages,
224
  total_duration_ms=duration_ms,
225
  model_used=settings.IMAGE_MODEL_ID,
226
+ models_used=clf.models_used,
227
+ calibrator_applied=clf.calibrator_applied,
228
  ),
229
  )
230
 
231
+ # Phase 19.2 — persist original bytes + thumbnail under content-address
232
+ ext = (mime.split("/")[-1] if mime else "jpg").replace("jpeg", "jpg")
233
+ try:
234
+ media_path = save_bytes(raw, media_hash, ext)
235
+ except Exception as e: # noqa: BLE001
236
+ logger.warning(f"media save failed: {e}")
237
+ media_path = None
238
+ thumbnail_url = make_image_thumbnail(pil, media_hash)
239
+ resp.thumbnail_url = thumbnail_url
240
+ if media_path:
241
+ resp.media_path = media_path
242
+
243
+ # Persist overlay images so they survive page reloads (base64 excluded from DB)
244
+ if heatmap:
245
+ url = save_overlay(heatmap, media_hash, "heatmap")
246
+ if url:
247
+ resp.explainability.heatmap_url = url
248
+ if ela_b64:
249
+ url = save_overlay(ela_b64, media_hash, "ela")
250
+ if url:
251
+ resp.explainability.ela_url = url
252
+ if boxes_b64:
253
+ url = save_overlay(boxes_b64, media_hash, "boxes")
254
+ if url:
255
+ resp.explainability.boxes_url = url
256
+
257
  record = AnalysisRecord(
258
  user_id=user.id if user else None,
259
  media_type="image",
260
  verdict=label,
261
  authenticity_score=float(score),
262
+ result_json=json.dumps(resp.model_dump(
263
  exclude={"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
264
  )),
265
+ media_hash=media_hash,
266
+ media_path=media_path,
267
+ thumbnail_url=thumbnail_url,
268
  )
269
  db.add(record)
270
  db.commit()
271
  db.refresh(record)
272
+ resp.record_id = record.id
273
  logger.info(f"Saved AnalysisRecord id={record.id} score={score} verdict={label}")
274
 
275
+ # ── Phase 12+14: LLM + VLM cards (authed users only — conserves LLM quota) ──
276
+ llm_summary = _compute_llm_summary(resp, record_id=record.id, user=user, media_kind="image", exclude=_IMAGE_EXCLUDE)
277
+ if llm_summary:
278
+ resp.explainability.llm_summary = llm_summary
 
 
 
 
 
279
  stages.append("llm_explanation")
 
 
280
 
281
+ if user is not None and vlm_bd is None:
282
+ try:
283
+ vlm_bd = generate_vlm_breakdown(pil, record_id=str(record.id))
284
+ if vlm_bd:
285
+ resp.explainability.vlm_breakdown = vlm_bd
286
+ stages.append("vlm_breakdown")
287
+ except Exception as e: # noqa: BLE001
288
+ logger.warning(f"VLM breakdown failed, continuing: {e}")
289
 
290
+ return resp
291
 
292
 
293
  @router.post("/video", response_model=VideoAnalysisResponse)
294
+ @limiter.limit(ANON_ANALYZE, exempt_when=is_authed)
295
+ @limiter.limit(AUTH_ANALYZE, exempt_when=is_anon)
296
  async def analyze_video_endpoint(
297
+ request: Request,
298
+ response: Response,
299
+ cache: bool = Query(default=True),
300
+ language_hint: str = Query(default="auto"),
301
  file: UploadFile = File(...),
302
  db: Session = Depends(get_db),
303
  user: User | None = Depends(optional_current_user),
 
311
  )
312
  stages.append("validation")
313
 
314
+ # Phase 19.1 — dedup cache (hash temp file before running pipeline)
315
+ media_hash = sha256_file(path)
316
+ cached = lookup_cached(db, media_hash=media_hash, media_type="video", user_id=user.id if user else None) if cache else None
317
+ if cached is not None:
318
+ payload = cached_payload(cached)
319
+ if payload is not None:
320
+ try:
321
+ os.unlink(path)
322
+ except OSError:
323
+ pass
324
+ logger.info(f"cache hit video sha={media_hash[:12]} record={cached.id}")
325
+ return VideoAnalysisResponse.model_validate(payload)
326
+
327
  try:
328
  agg = analyze_video(path, num_frames=VIDEO_NUM_FRAMES)
329
  stages.append("frame_extraction")
330
  stages.append("frame_classification")
331
  stages.append("aggregation")
332
+ if agg.temporal:
333
+ stages.append("temporal_analysis")
334
+ except Exception:
335
  try:
336
  os.unlink(path)
337
  except OSError:
338
  pass
339
+ raise
340
+
341
+ # Phase 17.2 — audio analysis (needs file path, runs before cleanup)
342
+ audio_result: AudioAnalysis | None = None
343
+ try:
344
+ audio_result = analyze_audio(path)
345
+ if audio_result:
346
+ stages.append("audio_analysis")
347
+ except Exception as _ae: # noqa: BLE001
348
+ logger.warning(f"Audio analysis failed, continuing: {_ae}")
349
+
350
+ # Phase 17.3 — combined verdict formula
351
+ score, label, severity = compute_video_authenticity_score(
352
+ mean_suspicious_prob=agg.mean_suspicious_prob,
353
+ insufficient_faces=agg.insufficient_faces,
354
+ temporal_score=agg.temporal.temporal_score if agg.temporal else None,
355
+ audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
356
+ has_audio=bool(audio_result and audio_result.has_audio),
357
+ )
358
 
 
 
 
 
 
 
 
359
  duration_ms = int((time.perf_counter() - start) * 1000)
360
 
361
+ from schemas.analyze import AudioExplainability
362
+ audio_ex = None
363
+ if audio_result:
364
+ audio_ex = AudioExplainability(
365
+ audio_authenticity_score=audio_result.audio_authenticity_score,
366
+ has_audio=audio_result.has_audio,
367
+ duration_s=audio_result.duration_s,
368
+ silence_ratio=audio_result.silence_ratio,
369
+ spectral_variance=audio_result.spectral_variance,
370
+ rms_consistency=audio_result.rms_consistency,
371
+ notes=audio_result.notes,
372
+ )
373
+
374
+ resp = VideoAnalysisResponse(
375
  analysis_id=str(uuid.uuid4()),
376
  media_type="video",
377
  timestamp=datetime.now(timezone.utc).isoformat(),
 
404
  )
405
  for f in agg.frames
406
  ],
407
+ temporal_score=agg.temporal.temporal_score if agg.temporal else None,
408
+ optical_flow_variance=agg.temporal.optical_flow_variance if agg.temporal else None,
409
+ flicker_score=agg.temporal.flicker_score if agg.temporal else None,
410
+ blink_rate_anomaly=agg.temporal.blink_rate_anomaly if agg.temporal else None,
411
+ audio=audio_ex,
412
  ),
413
  processing_summary=ProcessingSummary(
414
  stages_completed=stages,
415
  total_duration_ms=duration_ms,
416
  model_used=settings.IMAGE_MODEL_ID,
417
+ models_used=agg.models_used,
418
+ calibrator_applied=agg.calibrator_applied,
419
  ),
420
  )
421
 
422
+ # Phase 19.2 — persist video + thumbnail frame
423
+ try:
424
+ media_path = save_file(path, media_hash, suffix.lstrip("."))
425
+ except Exception as e: # noqa: BLE001
426
+ logger.warning(f"video media save failed: {e}")
427
+ media_path = None
428
+ thumbnail_url = make_video_thumbnail(path, media_hash)
429
+ resp.thumbnail_url = thumbnail_url
430
+
431
  record = AnalysisRecord(
432
  user_id=user.id if user else None,
433
  media_type="video",
434
  verdict=label,
435
  authenticity_score=float(score),
436
+ result_json=json.dumps(resp.model_dump()),
437
+ media_hash=media_hash,
438
+ media_path=media_path,
439
+ thumbnail_url=thumbnail_url,
440
  )
441
  db.add(record)
442
  db.commit()
443
  db.refresh(record)
444
+ resp.record_id = record.id
445
  logger.info(
446
  f"Saved AnalysisRecord id={record.id} video score={score} verdict={label} "
447
  f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
448
  )
449
 
450
+ # Write verdict into video metadata (ExifTool, optional — gated by EXIFTOOL_PATH).
451
  try:
452
+ write_verdict_metadata(
453
+ file_path=path,
454
+ verdict=label,
455
+ authenticity_score=score,
456
+ models_used=agg.models_used,
457
+ analysis_id=str(record.id),
458
  )
459
  except Exception as e: # noqa: BLE001
460
+ logger.warning(f"Metadata write failed: {e}")
461
+ finally:
462
+ try:
463
+ os.unlink(path)
464
+ except OSError:
465
+ pass
466
+
467
+ # Phase 12: LLM explainability card (authed users only — conserves LLM quota)
468
+ llm = _compute_llm_summary(resp, record_id=record.id, user=user, media_kind="video")
469
+ if llm:
470
+ resp.llm_summary = llm
471
 
472
+ return resp
473
 
474
 
475
class TextAnalyzeBody(BaseModel):
    """JSON request body accepted by the POST /analyze/text endpoint."""

    # Raw text to analyse; passed to language resolution and the classifier.
    text: str
    # Not read in the visible part of the /text handler; presumably mirrors
    # the `cache` query flag on the upload endpoints — TODO confirm.
    cache: bool = True
    # "auto" lets the backend detect the language; any other value is used
    # as the language code after trimming and lower-casing.
    language_hint: str = "auto"
479
 
480
 
481
  @router.post("/text", response_model=TextAnalysisResponse)
482
+ @limiter.limit(ANON_ANALYZE, exempt_when=is_authed)
483
+ @limiter.limit(AUTH_ANALYZE, exempt_when=is_anon)
484
  async def analyze_text_endpoint(
485
+ request: Request,
486
+ response: Response,
487
  body: TextAnalyzeBody = Body(...),
488
  db: Session = Depends(get_db),
489
  user: User | None = Depends(optional_current_user),
 
492
  stages: list[str] = []
493
 
494
  # Phase 13: language detection — routes to multilang model when non-English
495
+ lang = _resolve_language_hint(body.text, body.language_hint)
496
  stages.append("language_detection")
497
 
498
  clf = classify_text(body.text, language=lang)
 
522
  effective_fake_prob = news.truth_override.fake_prob_after
523
  stages.append("truth_override_applied")
524
 
525
+ # Weighted score: keep classifier authoritative. Linguistic heuristics can
526
+ # lower confidence, but should not give a high floor when classifier is very fake.
527
  manip_penalty = min(len(manip) * 5, 30)
528
  raw_score = (1.0 - effective_fake_prob) * 100.0
529
+ heuristic_score = max(0, 100 - sens.score) * 0.60 + max(0, 100 - manip_penalty) * 0.40
530
+ weighted = raw_score * 0.90 + heuristic_score * 0.10
531
  score = int(round(max(0.0, min(100.0, weighted))))
532
  label, severity = get_verdict_label(score)
533
  duration_ms = int((time.perf_counter() - start) * 1000)
 
537
  else settings.TEXT_MODEL_ID
538
  )
539
 
540
+ resp = TextAnalysisResponse(
541
  analysis_id=str(uuid.uuid4()),
542
  media_type="text",
543
  timestamp=datetime.now(timezone.utc).isoformat(),
 
582
  stages_completed=stages,
583
  total_duration_ms=duration_ms,
584
  model_used=model_used,
585
+ calibrator_applied=False,
586
  ),
587
  )
588
 
 
591
  media_type="text",
592
  verdict=label,
593
  authenticity_score=float(score),
594
+ result_json=json.dumps(resp.model_dump()),
595
  )
596
  db.add(record)
597
  db.commit()
598
  db.refresh(record)
599
+ resp.record_id = record.id
600
  logger.info(f"Saved AnalysisRecord id={record.id} text score={score} verdict={label}")
601
 
602
+ # Phase 12: LLM explainability card (authed users only — conserves LLM quota)
603
+ llm = _compute_llm_summary(resp, record_id=record.id, user=user, media_kind="text")
604
+ if llm:
605
+ resp.llm_summary = llm
 
 
 
606
 
607
+ return resp
608
 
609
 
610
  @router.post("/screenshot", response_model=ScreenshotAnalysisResponse)
611
+ @limiter.limit(ANON_ANALYZE, exempt_when=is_authed)
612
+ @limiter.limit(AUTH_ANALYZE, exempt_when=is_anon)
613
  async def analyze_screenshot_endpoint(
614
+ request: Request,
615
+ response: Response,
616
+ cache: bool = Query(default=True),
617
+ language_hint: str = Query(default="auto"),
618
  file: UploadFile = File(...),
619
  db: Session = Depends(get_db),
620
  user: User | None = Depends(optional_current_user),
 
627
  )
628
  stages.append("validation")
629
 
630
+ # Phase 19.1 — dedup cache
631
+ media_hash = sha256_bytes(raw)
632
+ cached = lookup_cached(db, media_hash=media_hash, media_type="screenshot", user_id=user.id if user else None) if cache else None
633
+ if cached is not None:
634
+ payload = cached_payload(cached)
635
+ if payload is not None:
636
+ logger.info(f"cache hit screenshot sha={media_hash[:12]} record={cached.id}")
637
+ return ScreenshotAnalysisResponse.model_validate(payload)
638
+
639
  pil = load_image_from_bytes(raw)
640
  ocr_boxes = run_ocr(pil)
641
  stages.append("ocr")
 
643
  full_text = extract_full_text(ocr_boxes)
644
 
645
  # Phase 13: language detection on extracted OCR text
646
+ lang = _resolve_language_hint(full_text, language_hint) if full_text else "en"
647
  stages.append("language_detection")
648
 
649
  clf = classify_text(full_text, language=lang) if full_text else None
 
685
  manip_penalty = min(len(manip) * 5, 30)
686
  layout_penalty = min(len(layout) * 5, 15)
687
  raw_score = (1.0 - effective_fake_prob) * 100.0
688
+ heuristic_score = (
689
+ max(0, 100 - sens.score) * 0.45
690
+ + max(0, 100 - manip_penalty) * 0.35
691
+ + max(0, 100 - layout_penalty) * 0.20
 
692
  )
693
+ weighted = raw_score * 0.90 + heuristic_score * 0.10
694
  if not full_text.strip():
695
  weighted = 50
696
  score = int(round(max(0.0, min(100.0, weighted))))
 
703
  else f"{settings.TEXT_MODEL_ID} + EasyOCR"
704
  )
705
 
706
+ resp = ScreenshotAnalysisResponse(
707
  analysis_id=str(uuid.uuid4()),
708
  media_type="screenshot",
709
  timestamp=datetime.now(timezone.utc).isoformat(),
 
746
  stages_completed=stages,
747
  total_duration_ms=duration_ms,
748
  model_used=model_used_str,
749
+ calibrator_applied=False,
750
  ),
751
  )
752
 
753
+ # Phase 19.2 — object storage + thumbnail
754
+ ext = (mime.split("/")[-1] if mime else "jpg").replace("jpeg", "jpg")
755
+ try:
756
+ media_path = save_bytes(raw, media_hash, ext)
757
+ except Exception as e: # noqa: BLE001
758
+ logger.warning(f"screenshot media save failed: {e}")
759
+ media_path = None
760
+ thumbnail_url = make_image_thumbnail(pil, media_hash)
761
+ resp.thumbnail_url = thumbnail_url
762
+
763
  record = AnalysisRecord(
764
  user_id=user.id if user else None,
765
  media_type="screenshot",
766
  verdict=label,
767
  authenticity_score=float(score),
768
+ result_json=json.dumps(resp.model_dump()),
769
+ media_hash=media_hash,
770
+ media_path=media_path,
771
+ thumbnail_url=thumbnail_url,
772
  )
773
  db.add(record)
774
  db.commit()
775
  db.refresh(record)
776
+ resp.record_id = record.id
777
  logger.info(f"Saved AnalysisRecord id={record.id} screenshot score={score} verdict={label}")
778
 
779
+ # Phase 12: LLM explainability card (authed users only β€” conserves LLM quota)
780
+ llm = _compute_llm_summary(resp, record_id=record.id, user=user, media_kind="screenshot")
781
+ if llm:
782
+ resp.llm_summary = llm
783
+
784
+ return resp
785
+
786
+
787
+ # ───────────────────────── Phase 19.3 — async video + jobs ─────────────────────────
788
+
789
@router.post("/video/async", status_code=status.HTTP_202_ACCEPTED)
@limiter.limit(ANON_ANALYZE, exempt_when=is_authed)
@limiter.limit(AUTH_ANALYZE, exempt_when=is_anon)
async def analyze_video_async(
    request: Request,
    response: Response,
    background: BackgroundTasks,
    cache: bool = Query(default=True),
    language_hint: str = Query(default="auto"),
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
    user: User | None = Depends(optional_current_user),
):
    """Queue a video analysis and return a job_id. Poll GET /api/v1/jobs/{job_id}.

    Used by the PipelineVisualizer so it can read real backend stage/progress
    instead of guessing timing.

    Args:
        request: Not read by the body; presumably required by the rate-limit
            decorators (confirm against the limiter's documentation).
        response: Not read by the body; same assumption as ``request``.
        background: Task queue the analysis job is scheduled on.
        cache: When true, a previously stored result for the same media hash
            is returned immediately as an already-finished job.
        language_hint: Accepted for parity with the other endpoints; this
            handler does not use it.
        file: Uploaded video.
        db: Request-scoped session, used only for the cache probe.
        user: Authenticated user, or ``None`` for anonymous callers.

    Returns:
        ``{"job_id", "status", "cached"}`` where ``status`` is ``"done"`` on a
        cache hit and ``"queued"`` otherwise.
    """
    suffix = os.path.splitext(file.filename or "")[1].lower() or ".mp4"
    path, _mime = await save_upload_to_tempfile(
        file, settings.ALLOWED_VIDEO_TYPES, max_size_mb=VIDEO_MAX_MB, suffix=suffix
    )

    def _discard_upload() -> None:
        """Best-effort removal of the temp upload; an already-missing file is fine."""
        try:
            os.unlink(path)
        except OSError:
            pass

    user_id = user.id if user else None

    # Quick cache probe so callers don't wait for queue dispatch on repeats.
    try:
        media_hash = sha256_file(path)
        cached = (
            lookup_cached(db, media_hash=media_hash, media_type="video", user_id=user_id)
            if cache
            else None
        )
        payload = cached_payload(cached) if cached is not None else None
    except Exception:
        # Nothing will ever consume the upload if the probe blows up.
        _discard_upload()
        raise

    if payload is not None:
        # Only drop the upload when the cached payload is actually served.
        # A cache record with an unusable payload must fall through to a
        # fresh analysis, which still needs the file on disk.
        _discard_upload()
        job = job_registry.create()
        job_registry.update(job.id, status="done", stage="done", progress=100, result=payload)
        return {"job_id": job.id, "status": "done", "cached": True}

    job = job_registry.create()

    def _work(progress):
        """Run the full video pipeline; ``progress(stage, pct)`` reports status."""
        # The request-scoped ``db`` is not used here; the job opens its own session.
        from db.database import SessionLocal
        local_db = SessionLocal()
        try:
            progress("frame_extraction", 15)
            agg = analyze_video(path, num_frames=VIDEO_NUM_FRAMES)
            progress("aggregation", 60)

            # Audio analysis is optional: a failure is logged and scoring continues.
            audio_result = None
            try:
                audio_result = analyze_audio(path)
            except Exception as _ae:  # noqa: BLE001
                logger.warning(f"Audio analysis failed, continuing: {_ae}")
            progress("audio_analysis", 75)

            score_val, label_val, sev = compute_video_authenticity_score(
                mean_suspicious_prob=agg.mean_suspicious_prob,
                insufficient_faces=agg.insufficient_faces,
                temporal_score=agg.temporal.temporal_score if agg.temporal else None,
                audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
                has_audio=bool(audio_result and audio_result.has_audio),
            )

            from schemas.analyze import AudioExplainability
            audio_ex = None
            if audio_result:
                audio_ex = AudioExplainability(
                    audio_authenticity_score=audio_result.audio_authenticity_score,
                    has_audio=audio_result.has_audio,
                    duration_s=audio_result.duration_s,
                    silence_ratio=audio_result.silence_ratio,
                    spectral_variance=audio_result.spectral_variance,
                    rms_consistency=audio_result.rms_consistency,
                    notes=audio_result.notes,
                )

            resp = VideoAnalysisResponse(
                analysis_id=str(uuid.uuid4()),
                media_type="video",
                timestamp=datetime.now(timezone.utc).isoformat(),
                verdict=Verdict(
                    label=label_val, severity=sev,
                    authenticity_score=score_val,
                    model_confidence=float(agg.mean_suspicious_prob),
                    model_label="suspicious_mean" if not agg.insufficient_faces else "no_faces",
                ),
                explainability=VideoExplainability(
                    num_frames_sampled=agg.num_frames_sampled,
                    num_face_frames=agg.num_face_frames,
                    num_suspicious_frames=agg.num_suspicious_frames,
                    mean_suspicious_prob=agg.mean_suspicious_prob,
                    max_suspicious_prob=agg.max_suspicious_prob,
                    suspicious_ratio=agg.suspicious_ratio,
                    insufficient_faces=agg.insufficient_faces,
                    suspicious_timestamps=agg.suspicious_timestamps,
                    frames=[
                        FrameAnalysisOut(
                            index=f.index, timestamp_s=f.timestamp_s,
                            label=f.label, confidence=f.confidence,
                            suspicious_prob=f.suspicious_prob, is_suspicious=f.is_suspicious,
                            has_face=f.has_face, scored=f.scored,
                        ) for f in agg.frames
                    ],
                    temporal_score=agg.temporal.temporal_score if agg.temporal else None,
                    optical_flow_variance=agg.temporal.optical_flow_variance if agg.temporal else None,
                    flicker_score=agg.temporal.flicker_score if agg.temporal else None,
                    blink_rate_anomaly=agg.temporal.blink_rate_anomaly if agg.temporal else None,
                    audio=audio_ex,
                ),
                processing_summary=ProcessingSummary(
                    stages_completed=["frame_extraction", "classification", "aggregation"],
                    # Wall-clock duration is not measured on the async path.
                    total_duration_ms=0,
                    model_used=settings.IMAGE_MODEL_ID,
                    models_used=agg.models_used,
                    calibrator_applied=agg.calibrator_applied,
                ),
            )

            progress("storage", 85)
            # Storing the media is best-effort; the record is saved without a path on failure.
            try:
                media_path = save_file(path, media_hash, suffix.lstrip("."))
            except Exception as e:  # noqa: BLE001
                logger.warning(f"async video media save failed: {e}")
                media_path = None
            thumb = make_video_thumbnail(path, media_hash)
            resp.thumbnail_url = thumb

            rec = AnalysisRecord(
                user_id=user_id,
                media_type="video",
                verdict=label_val,
                authenticity_score=float(score_val),
                result_json=json.dumps(resp.model_dump()),
                media_hash=media_hash,
                media_path=media_path,
                thumbnail_url=thumb,
            )
            local_db.add(rec)
            local_db.commit()
            local_db.refresh(rec)
            # record_id is set after result_json was serialized, so the stored
            # JSON does not carry it; only the returned job result does.
            resp.record_id = rec.id
            progress("persist", 95)

            return resp.model_dump()
        finally:
            local_db.close()
            _discard_upload()

    stages = ["queued", "frame_extraction", "aggregation", "audio_analysis", "storage", "persist", "done"]
    background.add_task(run_job, job.id, stages, _work)
    return {"job_id": job.id, "status": "queued", "cached": False}
943
+
944
+
945
# Separate router for polling jobs created by the async analyze endpoints.
jobs_router = APIRouter(prefix="/jobs", tags=["jobs"])


@jobs_router.get("/{job_id}")
def get_job(job_id: str):
    """Report the current state of a queued analysis job.

    Looks the job up in ``job_registry`` and returns its id, status, stage,
    progress and error. ``result`` is included only once the status is
    ``"done"``; otherwise it is ``None``.

    Raises:
        HTTPException: 404 when the registry has no job for ``job_id``.
    """
    entry = job_registry.get(job_id)
    if entry:
        return dict(
            id=entry.id,
            status=entry.status,
            stage=entry.stage,
            progress=entry.progress,
            error=entry.error,
            result=entry.result if entry.status == "done" else None,
        )
    raise HTTPException(status_code=404, detail="job not found")