# vnitx-video / src / api / video_server.py
# jaivsh
# add video detection pipeline
# 6903fe1
import asyncio
import uuid
from pathlib import Path

from fastapi import FastAPI, File, Form, UploadFile

from src.video.video_processor import VideoAnalyzer
# FastAPI application object; the route decorators below register handlers on it.
app = FastAPI(title="Video Prompt Detection API")
# Process-wide analyzer instance; stays None until load_analyzer() assigns it.
_ANALYZER: VideoAnalyzer | None = None
@app.on_event("startup")
def load_analyzer() -> None:
    """Instantiate the shared ``VideoAnalyzer`` the first time this is called.

    Registered as a FastAPI startup hook. Calling it again once the module
    global ``_ANALYZER`` is populated is a no-op.
    """
    global _ANALYZER
    if _ANALYZER is not None:
        # An analyzer already exists; keep it.
        return
    _ANALYZER = VideoAnalyzer()
@app.get("/")
def health_check() -> dict:
    """Return a fixed status payload for ``GET /``."""
    payload = {"status": "ok", "engine": "video"}
    return payload
# Serialises analyzer runs. The analysis itself executes in a worker thread so
# the event loop (and therefore the health check) stays responsive; the lock
# keeps runs strictly one-at-a-time, exactly as they were when the blocking
# call ran directly on the event loop.
_ANALYSIS_LOCK = asyncio.Lock()

# Score thresholds used to derive an action; checked from highest to lowest.
_BLOCK_THRESHOLD = 0.7
_FLAG_THRESHOLD = 0.5

# Number of highest-scoring frames reported under "top_risky_frames".
_TOP_RISKY_COUNT = 5


def _get_analyzer() -> VideoAnalyzer:
    """Return the shared analyzer, creating it if the startup hook has not run."""
    if _ANALYZER is None:
        load_analyzer()
    return _ANALYZER


def _new_log_path(prefix: str) -> str:
    """Build a per-request JSONL frame-log path under ``/tmp``.

    The name keeps the ``<prefix>_frame_log_<loop-time-ms>`` stem and appends
    a random suffix, because the loop clock alone does not guarantee distinct
    names for requests that overlap or for separate server runs.

    Must be called while an event loop is running (i.e. from a handler).
    """
    stamp_ms = int(asyncio.get_running_loop().time() * 1000)
    return f"/tmp/{prefix}_frame_log_{stamp_ms}_{uuid.uuid4().hex[:8]}.jsonl"


def _action_from_score(score: float) -> str:
    """Map a score to ``"BLOCK"`` (>= 0.7), ``"FLAG"`` (>= 0.5) or ``"ALLOW"``."""
    if score >= _BLOCK_THRESHOLD:
        return "BLOCK"
    if score >= _FLAG_THRESHOLD:
        return "FLAG"
    return "ALLOW"


def _flatten_frame(frame) -> dict:
    """Project one frame result onto a flat dict of scalar fields.

    Nested per-detector dicts on the frame (``injection``, ``cross_modal``,
    ``ocr_vs_image``, ``caption_alignment``) are reduced to selected keys,
    with ``0.0`` / ``""`` substituted when a key is absent.
    """
    return {
        "frame_index": frame.frame_index,
        "timestamp_sec": frame.timestamp_sec,
        "final_score": frame.final_score,
        "action": _action_from_score(frame.final_score),
        "deepfake_score": frame.deepfake_score,
        "deepfake_label": frame.deepfake_label,
        "deepfake_is_fake": frame.deepfake_is_fake,
        "injection_risk": frame.injection.get("risk_score", 0.0),
        "injection_reason": frame.injection.get("reason", ""),
        "cross_modal_score": frame.cross_modal.get("consistency_score", 0.0),
        "ocr_vs_image_score": frame.ocr_vs_image.get("consistency_score", 0.0),
        "caption_alignment_score": frame.caption_alignment.get("alignment_score", 0.0),
        "caption": frame.caption_alignment.get("caption", ""),
        "ocr_text": frame.ocr_text,
    }


def _build_response(frames, summary: dict, log_path: str | None) -> dict:
    """Assemble the JSON payload shared by both analysis endpoints.

    Args:
        frames: Per-frame results returned by the analyzer (iterated several
            times, so it must be a re-iterable sequence).
        summary: Aggregate results returned by the analyzer.
        log_path: Frame-log path reported to the client, or ``None``.

    Returns:
        Dict with the raw and flattened timelines, the five highest-scoring
        frames, the overall action derived from ``summary["max_final_score"]``
        (``0.0`` when absent), explanation strings and the log path.
    """
    # sorted() is stable, so frames with equal scores keep their timeline order.
    top_risky = sorted(frames, key=lambda f: f.final_score, reverse=True)[:_TOP_RISKY_COUNT]
    explanations = [
        f"avg_deepfake={summary.get('avg_deepfake_score', 0.0)}",
        f"avsync={summary.get('avsync_score', 0.0)}",
        f"max_final={summary.get('max_final_score', 0.0)}",
    ]
    return {
        "summary": summary,
        "timeline": [vars(f) for f in frames],
        "timeline_flat": [_flatten_frame(f) for f in frames],
        "top_risky_frames": [vars(f) for f in top_risky],
        "top_risky_frames_flat": [_flatten_frame(f) for f in top_risky],
        "action": _action_from_score(summary.get("max_final_score", 0.0)),
        "explanations": explanations,
        "log_path": log_path,
    }


@app.post("/analyze_video")
async def analyze_video(
    video: UploadFile = File(...),
    audio_transcript: str = Form(""),
    target_fps: float = Form(5.0),
    max_frames: int | None = Form(None),
    run_injection: bool = Form(True),
    run_cross_modal: bool = Form(True),
    run_caption: bool = Form(True),
    run_vision_deepfake: bool = Form(True),
    run_avsync: bool = Form(True),
    log_frames: bool = Form(True),
) -> dict:
    """Analyze an uploaded video and return per-frame and summary results.

    The upload is read fully into memory and handed to
    ``VideoAnalyzer.analyze_video_bytes``. All form fields other than
    ``video`` and ``log_frames`` are forwarded to the analyzer unchanged.

    Args:
        video: Uploaded video file.
        audio_transcript: Forwarded to the analyzer.
        target_fps: Forwarded to the analyzer.
        max_frames: Forwarded to the analyzer; ``None`` when not supplied.
        run_injection, run_cross_modal, run_caption, run_vision_deepfake,
            run_avsync: Flags forwarded to the analyzer.
        log_frames: When true, a frame-log path is generated, passed to the
            analyzer and echoed in the response as ``log_path``.

    Returns:
        The payload built by ``_build_response``.
    """
    analyzer = _get_analyzer()
    video_bytes = await video.read()
    log_path = _new_log_path("video") if log_frames else None

    # The analyzer call is synchronous; running it inline would stall the
    # event loop for the whole analysis, so it is pushed to a worker thread.
    async with _ANALYSIS_LOCK:
        frames, summary = await asyncio.to_thread(
            analyzer.analyze_video_bytes,
            video_bytes,
            audio_transcript=audio_transcript,
            target_fps=target_fps,
            max_frames=max_frames,
            run_injection=run_injection,
            run_cross_modal=run_cross_modal,
            run_caption=run_caption,
            run_vision_deepfake=run_vision_deepfake,
            run_avsync=run_avsync,
            log_path=Path(log_path) if log_path else None,
        )

    return _build_response(frames, summary, log_path)


@app.post("/analyze_webcam")
async def analyze_webcam(
    camera_index: int = Form(0),
    duration_sec: float = Form(10.0),
    target_fps: float = Form(5.0),
    run_injection: bool = Form(True),
    run_cross_modal: bool = Form(True),
    run_caption: bool = Form(True),
    run_vision_deepfake: bool = Form(True),
    run_avsync: bool = Form(True),
    log_frames: bool = Form(True),
) -> dict:
    """Run ``VideoAnalyzer.analyze_webcam`` and return its results.

    All form fields other than ``log_frames`` are forwarded to the analyzer
    unchanged.

    Args:
        camera_index: Forwarded to the analyzer.
        duration_sec: Forwarded to the analyzer.
        target_fps: Forwarded to the analyzer.
        run_injection, run_cross_modal, run_caption, run_vision_deepfake,
            run_avsync: Flags forwarded to the analyzer.
        log_frames: When true, a frame-log path is generated, passed to the
            analyzer and echoed in the response as ``log_path``.

    Returns:
        The payload built by ``_build_response``.
    """
    analyzer = _get_analyzer()
    log_path = _new_log_path("webcam") if log_frames else None

    # Synchronous analyzer call: run it off the event loop so other requests
    # are still served while it is in progress.
    async with _ANALYSIS_LOCK:
        frames, summary = await asyncio.to_thread(
            analyzer.analyze_webcam,
            camera_index=camera_index,
            duration_sec=duration_sec,
            target_fps=target_fps,
            run_injection=run_injection,
            run_cross_modal=run_cross_modal,
            run_caption=run_caption,
            run_vision_deepfake=run_vision_deepfake,
            run_avsync=run_avsync,
            log_path=Path(log_path) if log_path else None,
        )

    return _build_response(frames, summary, log_path)