Spaces:
Runtime error
Runtime error
Sync from GitHub via hub-sync
Browse files
- api/v1/analyze.py +6 -3
- config.py +1 -1
- services/audio_service.py +5 -1
- services/video_service.py +2 -2
- utils/scoring.py +26 -1
api/v1/analyze.py
CHANGED
|
@@ -97,7 +97,6 @@ def _resize_for_vis(pil) -> "Image.Image":
|
|
| 97 |
scale = _VIS_MAX_PX / max(w, h)
|
| 98 |
return pil.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
|
| 99 |
VIDEO_MAX_MB = 100
|
| 100 |
-
VIDEO_NUM_FRAMES = 16
|
| 101 |
|
| 102 |
_IMAGE_EXCLUDE = {"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
|
| 103 |
|
|
@@ -575,7 +574,7 @@ async def analyze_video_endpoint(
|
|
| 575 |
return VideoAnalysisResponse.model_validate(payload)
|
| 576 |
|
| 577 |
try:
|
| 578 |
-
agg = analyze_video(path, num_frames=VIDEO_NUM_FRAMES)
|
| 579 |
stages.append("frame_extraction")
|
| 580 |
stages.append("frame_classification")
|
| 581 |
stages.append("aggregation")
|
|
@@ -600,6 +599,8 @@ async def analyze_video_endpoint(
|
|
| 600 |
# Phase 17.3 — combined verdict formula
|
| 601 |
score, label, severity = compute_video_authenticity_score(
|
| 602 |
mean_suspicious_prob=agg.mean_suspicious_prob,
|
|
|
|
|
|
|
| 603 |
insufficient_faces=agg.insufficient_faces,
|
| 604 |
temporal_score=agg.temporal.temporal_score if agg.temporal else None,
|
| 605 |
audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
|
|
@@ -1139,7 +1140,7 @@ async def analyze_video_async(
|
|
| 1139 |
local_db = SessionLocal()
|
| 1140 |
try:
|
| 1141 |
progress("frame_extraction", 15)
|
| 1142 |
-
agg = analyze_video(path, num_frames=VIDEO_NUM_FRAMES)
|
| 1143 |
progress("aggregation", 60)
|
| 1144 |
|
| 1145 |
audio_result = None
|
|
@@ -1151,6 +1152,8 @@ async def analyze_video_async(
|
|
| 1151 |
|
| 1152 |
score_val, label_val, sev = compute_video_authenticity_score(
|
| 1153 |
mean_suspicious_prob=agg.mean_suspicious_prob,
|
|
|
|
|
|
|
| 1154 |
insufficient_faces=agg.insufficient_faces,
|
| 1155 |
temporal_score=agg.temporal.temporal_score if agg.temporal else None,
|
| 1156 |
audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
|
|
|
|
| 97 |
scale = _VIS_MAX_PX / max(w, h)
|
| 98 |
return pil.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
|
| 99 |
VIDEO_MAX_MB = 100
|
|
|
|
| 100 |
|
| 101 |
_IMAGE_EXCLUDE = {"explainability": {"heatmap_base64", "ela_base64", "boxes_base64"}}
|
| 102 |
|
|
|
|
| 574 |
return VideoAnalysisResponse.model_validate(payload)
|
| 575 |
|
| 576 |
try:
|
| 577 |
+
agg = analyze_video(path, num_frames=settings.VIDEO_SAMPLE_FRAMES)
|
| 578 |
stages.append("frame_extraction")
|
| 579 |
stages.append("frame_classification")
|
| 580 |
stages.append("aggregation")
|
|
|
|
| 599 |
# Phase 17.3 — combined verdict formula
|
| 600 |
score, label, severity = compute_video_authenticity_score(
|
| 601 |
mean_suspicious_prob=agg.mean_suspicious_prob,
|
| 602 |
+
max_suspicious_prob=agg.max_suspicious_prob,
|
| 603 |
+
suspicious_ratio=agg.suspicious_ratio,
|
| 604 |
insufficient_faces=agg.insufficient_faces,
|
| 605 |
temporal_score=agg.temporal.temporal_score if agg.temporal else None,
|
| 606 |
audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
|
|
|
|
| 1140 |
local_db = SessionLocal()
|
| 1141 |
try:
|
| 1142 |
progress("frame_extraction", 15)
|
| 1143 |
+
agg = analyze_video(path, num_frames=settings.VIDEO_SAMPLE_FRAMES)
|
| 1144 |
progress("aggregation", 60)
|
| 1145 |
|
| 1146 |
audio_result = None
|
|
|
|
| 1152 |
|
| 1153 |
score_val, label_val, sev = compute_video_authenticity_score(
|
| 1154 |
mean_suspicious_prob=agg.mean_suspicious_prob,
|
| 1155 |
+
max_suspicious_prob=agg.max_suspicious_prob,
|
| 1156 |
+
suspicious_ratio=agg.suspicious_ratio,
|
| 1157 |
insufficient_faces=agg.insufficient_faces,
|
| 1158 |
temporal_score=agg.temporal.temporal_score if agg.temporal else None,
|
| 1159 |
audio_authenticity_score=audio_result.audio_authenticity_score if audio_result else None,
|
config.py
CHANGED
|
@@ -273,7 +273,7 @@ class Settings(BaseSettings):
|
|
| 273 |
# face forgery frames, so it is the dominant signal for video analysis.
|
| 274 |
VIDEO_FFPP_WEIGHT: float = 0.70
|
| 275 |
VIDEO_EFFNET_WEIGHT: float = 0.30
|
| 276 |
-
VIDEO_SAMPLE_FRAMES: int =
|
| 277 |
EXIFTOOL_PATH: str = "" # full path to ExifTool binary; empty = metadata write disabled
|
| 278 |
|
| 279 |
# Auth
|
|
|
|
| 273 |
# face forgery frames, so it is the dominant signal for video analysis.
|
| 274 |
VIDEO_FFPP_WEIGHT: float = 0.70
|
| 275 |
VIDEO_EFFNET_WEIGHT: float = 0.30
|
| 276 |
+
VIDEO_SAMPLE_FRAMES: int = 32 # frames to sample per video for inference
|
| 277 |
EXIFTOOL_PATH: str = "" # full path to ExifTool binary; empty = metadata write disabled
|
| 278 |
|
| 279 |
# Auth
|
services/audio_service.py
CHANGED
|
@@ -50,7 +50,11 @@ def _extract_audio_wav(video_path: str, out_path: str) -> bool:
|
|
| 50 |
capture_output=True,
|
| 51 |
timeout=60,
|
| 52 |
)
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as exc:
|
| 55 |
logger.warning(f"ffmpeg audio extraction failed: {exc}")
|
| 56 |
return False
|
|
|
|
| 50 |
capture_output=True,
|
| 51 |
timeout=60,
|
| 52 |
)
|
| 53 |
+
if result.returncode != 0 or not os.path.exists(out_path) or os.path.getsize(out_path) == 0:
|
| 54 |
+
stderr_tail = result.stderr.decode(errors="replace")[-400:].strip()
|
| 55 |
+
logger.warning(f"ffmpeg exited {result.returncode} — {stderr_tail or '(no stderr)'}")
|
| 56 |
+
return False
|
| 57 |
+
return True
|
| 58 |
except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as exc:
|
| 59 |
logger.warning(f"ffmpeg audio extraction failed: {exc}")
|
| 60 |
return False
|
services/video_service.py
CHANGED
|
@@ -188,7 +188,7 @@ def _analyze_with_efficientnet(
|
|
| 188 |
label=label,
|
| 189 |
confidence=fake_prob,
|
| 190 |
suspicious_prob=fake_prob,
|
| 191 |
-
is_suspicious=(fake_prob >= 0.
|
| 192 |
has_face=has_face,
|
| 193 |
scored=bool(has_face and faces),
|
| 194 |
)
|
|
@@ -212,7 +212,7 @@ def _analyze_with_vit(
|
|
| 212 |
label=vit_label,
|
| 213 |
confidence=vit_fake_prob,
|
| 214 |
suspicious_prob=vit_fake_prob,
|
| 215 |
-
is_suspicious=(vit_fake_prob >= 0.
|
| 216 |
has_face=face,
|
| 217 |
scored=face,
|
| 218 |
)
|
|
|
|
| 188 |
label=label,
|
| 189 |
confidence=fake_prob,
|
| 190 |
suspicious_prob=fake_prob,
|
| 191 |
+
is_suspicious=(fake_prob >= 0.40) and has_face,
|
| 192 |
has_face=has_face,
|
| 193 |
scored=bool(has_face and faces),
|
| 194 |
)
|
|
|
|
| 212 |
label=vit_label,
|
| 213 |
confidence=vit_fake_prob,
|
| 214 |
suspicious_prob=vit_fake_prob,
|
| 215 |
+
is_suspicious=(vit_fake_prob >= 0.40) and face,
|
| 216 |
has_face=face,
|
| 217 |
scored=face,
|
| 218 |
)
|
utils/scoring.py
CHANGED
|
@@ -70,6 +70,8 @@ def apply_unverified_news_gate(
|
|
| 70 |
def compute_video_authenticity_score(
|
| 71 |
*,
|
| 72 |
mean_suspicious_prob: float,
|
|
|
|
|
|
|
| 73 |
insufficient_faces: bool,
|
| 74 |
temporal_score: float | None = None,
|
| 75 |
audio_authenticity_score: float | None = None,
|
|
@@ -80,6 +82,14 @@ def compute_video_authenticity_score(
|
|
| 80 |
Face-model evidence is authoritative only when enough face frames were
|
| 81 |
scored. If face content is insufficient, use temporal/audio evidence when
|
| 82 |
available instead of forcing a neutral result.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
"""
|
| 84 |
if insufficient_faces:
|
| 85 |
evidence: list[tuple[float, float]] = []
|
|
@@ -97,7 +107,12 @@ def compute_video_authenticity_score(
|
|
| 97 |
label, severity = get_verdict_label(score)
|
| 98 |
return score, label, severity
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
temporal_sc = float(temporal_score) if temporal_score is not None else visual_score
|
| 102 |
if has_audio and audio_authenticity_score is not None:
|
| 103 |
_validate_weight_total([0.50, 0.30, 0.20], "video audio+temporal fusion")
|
|
@@ -106,6 +121,16 @@ def compute_video_authenticity_score(
|
|
| 106 |
_validate_weight_total([0.70, 0.30], "video visual+temporal fusion")
|
| 107 |
combined = 0.70 * visual_score + 0.30 * temporal_sc
|
| 108 |
score = int(round(max(0.0, min(100.0, combined))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
label, severity = get_verdict_label(score)
|
| 110 |
return score, label, severity
|
| 111 |
|
|
|
|
| 70 |
def compute_video_authenticity_score(
|
| 71 |
*,
|
| 72 |
mean_suspicious_prob: float,
|
| 73 |
+
max_suspicious_prob: float = 0.0,
|
| 74 |
+
suspicious_ratio: float = 0.0,
|
| 75 |
insufficient_faces: bool,
|
| 76 |
temporal_score: float | None = None,
|
| 77 |
audio_authenticity_score: float | None = None,
|
|
|
|
| 82 |
Face-model evidence is authoritative only when enough face frames were
|
| 83 |
scored. If face content is insufficient, use temporal/audio evidence when
|
| 84 |
available instead of forcing a neutral result.
|
| 85 |
+
|
| 86 |
+
The effective visual fake probability blends the per-frame mean with the
|
| 87 |
+
per-frame maximum (65/35 split). This prevents a deepfake from hiding
|
| 88 |
+
behind many clean frames: even a cluster of highly-suspicious frames
|
| 89 |
+
raises the combined score meaningfully.
|
| 90 |
+
|
| 91 |
+
A suspicious_ratio cap prevents a misleadingly high authenticity score when
|
| 92 |
+
a significant fraction of frames are flagged regardless of the mean.
|
| 93 |
"""
|
| 94 |
if insufficient_faces:
|
| 95 |
evidence: list[tuple[float, float]] = []
|
|
|
|
| 107 |
label, severity = get_verdict_label(score)
|
| 108 |
return score, label, severity
|
| 109 |
|
| 110 |
+
# Blend mean and max: mean alone is easily diluted by clean frames.
|
| 111 |
+
# 65% mean keeps the overall distribution; 35% max ensures a cluster of
|
| 112 |
+
# highly-suspicious frames cannot be hidden by majority-clean frames.
|
| 113 |
+
effective_prob = 0.65 * float(mean_suspicious_prob) + 0.35 * float(max_suspicious_prob)
|
| 114 |
+
visual_score = (1.0 - effective_prob) * 100.0
|
| 115 |
+
|
| 116 |
temporal_sc = float(temporal_score) if temporal_score is not None else visual_score
|
| 117 |
if has_audio and audio_authenticity_score is not None:
|
| 118 |
_validate_weight_total([0.50, 0.30, 0.20], "video audio+temporal fusion")
|
|
|
|
| 121 |
_validate_weight_total([0.70, 0.30], "video visual+temporal fusion")
|
| 122 |
combined = 0.70 * visual_score + 0.30 * temporal_sc
|
| 123 |
score = int(round(max(0.0, min(100.0, combined))))
|
| 124 |
+
|
| 125 |
+
# Suspicious-ratio caps: when a meaningful fraction of frames are flagged,
|
| 126 |
+
# prevent the score from landing in a confident "Likely Real" band.
|
| 127 |
+
# ≥40% suspicious → cap at 35 (Likely Fake zone).
|
| 128 |
+
# ≥20% suspicious → cap at 50 (Uncertain/Suspicious zone).
|
| 129 |
+
if suspicious_ratio >= 0.40:
|
| 130 |
+
score = min(score, 35)
|
| 131 |
+
elif suspicious_ratio >= 0.20:
|
| 132 |
+
score = min(score, 50)
|
| 133 |
+
|
| 134 |
label, severity = get_verdict_label(score)
|
| 135 |
return score, label, severity
|
| 136 |
|