"""Assemble a final guide: match a frame to each step, then caption it.
Takes the LLM's :class:`~src.llm.GuideDraft` (pure text) plus the pool of
extracted :class:`~src.frames.FrameRecord` s and produces a :class:`Guide` where
each step carries the best-matching image and a caption. If a step has a
timestamp but no nearby frame, a fresh frame is pulled from the video so every
step can be illustrated.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable
from . import config, video, vision
from .frames import FrameRecord
from .llm import GuideDraft
@dataclass
class GuideStep:
    """A single guide step: its heading and body text plus an optional figure."""

    heading: str
    text: str
    # Time associated with the step; ``None`` when no timestamp is available.
    timestamp: float | None = None
    # Path of the image chosen to illustrate the step, if any.
    image_path: str | None = None
    # Caption displayed with the image, if any.
    caption: str | None = None
@dataclass
class Guide:
    """The finished guide: title, intro, prerequisites and the ordered steps."""

    title: str = "Step-by-Step Guide"
    intro: str = ""
    # Each instance gets its own fresh list (never a shared default).
    prerequisites: list[str] = field(default_factory=list)
    steps: list[GuideStep] = field(default_factory=list)
# Relevance weights. Timestamp proximity (transcript/LLM time) is by far the most
# reliable signal for tutorials and is weighted heavily; the BLIP-caption match and
# sharpness only break ties.
_W_PROX, _W_SEM, _W_SHARP = 0.70, 0.18, 0.12
# How far (seconds) a frame may sit from a step's LLM timestamp to be reused.
# Manual frames get a wider window (the user captured them on purpose); scene/auto
# frames must be close, otherwise we extract a fresh frame at the exact step time.
_MANUAL_WINDOW = 20.0
_AUTO_WINDOW = 12.0
# Words that ``_keywords`` drops, so that filler words do not count as overlap
# between a frame caption and a step's text.
_STOPWORDS = {
    "the", "a", "an", "to", "of", "and", "or", "in", "on", "at", "for", "with",
    "your", "you", "is", "are", "be", "this", "that", "it", "from", "by", "as",
    "into", "then", "will", "can", "should", "have", "has", "its", "their", "our",
    "up", "out", "off", "over", "we", "i",
}
# Memo of ``vision.frame_score`` results keyed by frame path; filled lazily by
# ``_sharpness`` so each frame is scored at most once.
_SHARPNESS_CACHE: dict[str, float] = {}
def _sharpness(path: str) -> float:
    """Return ``vision.frame_score(path)``, computing it at most once per path.

    Results are memoised in the module-level ``_SHARPNESS_CACHE``.
    """
    try:
        return _SHARPNESS_CACHE[path]
    except KeyError:
        # First request for this path: score it and remember the result.
        _SHARPNESS_CACHE[path] = vision.frame_score(path)
        return _SHARPNESS_CACHE[path]
def _keywords(text: str | None) -> set[str]:
return {
w
for w in re.findall(r"[a-z0-9]+", (text or "").lower())
if len(w) > 2 and w not in _STOPWORDS
}
def _text_relevance(caption: str | None, step_text: str) -> float:
    """Return the share (0..1) of the step's keywords found in the caption.

    This is the BLIP *suggestion* signal: it nudges selection toward a frame
    whose description overlaps the step, without letting BLIP decide alone.
    The result is ``0.0`` when either side has no keywords.
    """
    wanted = _keywords(step_text)
    if not wanted:
        return 0.0
    mentioned = _keywords(caption)
    if not mentioned:
        return 0.0
    overlap = wanted.intersection(mentioned)
    return min(len(overlap) / len(wanted), 1.0)
def _pick_frame(
candidates: list[FrameRecord],
timestamp: float | None,
step_text: str,
used: set[str],
spoken_range: tuple[float, float] | None = None,
) -> FrameRecord | None:
"""Pick the best existing frame for a step, anchored to its LLM timestamp.
Returns ``None`` when no pool frame sits close enough in time — the caller then
extracts a fresh frame at the exact step timestamp, which keeps every step's
image aligned to the narration rather than to an unrelated visual scene change.
"""
avail = [f for f in candidates if f.path not in used] or candidates
if not avail:
return None
# No LLM timestamp for this step: fall back to caption relevance, then sharpness.
if timestamp is None:
return max(avail, key=lambda f: (_text_relevance(f.caption, step_text), _sharpness(f.path)))
# 1) A manual frame captured near this step wins — it's deliberate user intent.
manual_near = [
f for f in avail
if f.source == "manual" and abs(f.timestamp - timestamp) <= _MANUAL_WINDOW
]
if manual_near:
return min(manual_near, key=lambda f: abs(f.timestamp - timestamp))
# 2) Scene/auto frames tightly around the step's LLM time, inside the spoken range.
near = [f for f in avail if abs(f.timestamp - timestamp) <= _AUTO_WINDOW]
if spoken_range:
lo, hi = spoken_range
in_speech = [f for f in near if lo - 2.0 <= f.timestamp <= hi + 2.0]
near = in_speech or near
if not near:
return None # -> caller extracts a fresh frame at the exact step time
sharps = [_sharpness(f.path) for f in near]
smin, smax = min(sharps), max(sharps)
def norm_sharp(value: float) -> float:
return (value - smin) / (smax - smin) if smax > smin else 1.0
def score(frame: FrameRecord) -> float:
prox = 1.0 - min(abs(frame.timestamp - timestamp) / _AUTO_WINDOW, 1.0)
sem = _text_relevance(frame.caption, step_text)
return _W_PROX * prox + _W_SEM * sem + _W_SHARP * norm_sharp(_sharpness(frame.path))
return max(near, key=score)
def assemble_guide(
    draft: GuideDraft,
    frames: list[FrameRecord],
    *,
    video_path: str | Path | None = None,
    session_dir: str | Path | None = None,
    do_caption: bool = True,
    token: str | None = None,
    spoken_range: tuple[float, float] | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> Guide:
    """Combine a guide draft with frames into a fully illustrated :class:`Guide`.

    When captioning is on, every frame in the pool that has no caption yet is
    captioned once up front, so BLIP can both *suggest* the most relevant frame
    per step and supply the figure captions. For each draft step the best pool
    frame is picked with ``_pick_frame``; if none qualifies and the step has a
    timestamp, a fresh frame is extracted from the video at that timestamp.

    Args:
        draft: Text-only guide draft (title, intro, prerequisites, steps).
        frames: Pool of extracted frames. Their ``caption`` attribute is
            filled in place when captioning is enabled.
        video_path: Source video, needed to extract fresh frames.
        session_dir: Session directory; fresh frames are written under its
            ``frames`` sub-directory.
        do_caption: Whether to caption frames that have no caption yet.
        token: Passed through to ``vision.caption_image``.
        spoken_range: First/last narration time; keeps selection inside the
            narrated portion of the video.
        progress: Optional callback receiving ``(fraction, message)``.

    Returns:
        The assembled :class:`Guide`. A step whose image could not be found or
        extracted has ``image_path`` and ``caption`` set to ``None``.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    frames_sorted = sorted(frames, key=lambda f: f.timestamp)

    # Caption the pool up front (once per frame, context-free to keep the
    # relevance signal unbiased) so captions feed both selection and figures.
    if do_caption and frames_sorted:
        for i, rec in enumerate(frames_sorted):
            if rec.caption is None:
                if progress:
                    progress(
                        0.05 + 0.45 * (i / len(frames_sorted)),
                        f"Captioning frame {i + 1}/{len(frames_sorted)}…",
                    )
                rec.caption = vision.caption_image(rec.path, token=token) or ""

    used: set[str] = set()
    steps: list[GuideStep] = []
    total = max(len(draft.steps), 1)  # avoid dividing by zero for an empty draft

    for i, sd in enumerate(draft.steps):
        if progress:
            progress(0.5 + 0.5 * (i / total), f"Matching image to step {i + 1}/{total}…")

        step_text = f"{sd.heading} {sd.text}".strip()
        chosen = _pick_frame(frames_sorted, sd.approx_timestamp, step_text, used, spoken_range)

        # No suitable frame nearby — extract one at the step timestamp.
        if (
            chosen is None
            and video_path
            and session_dir
            and sd.approx_timestamp is not None
        ):
            out = (
                Path(session_dir)
                / "frames"
                / f"step_{i:03d}_{int(sd.approx_timestamp * 1000):08d}.png"
            )
            try:
                video.extract_frame(video_path, sd.approx_timestamp, out)
                chosen = FrameRecord(path=str(out), timestamp=sd.approx_timestamp, source="auto")
            except Exception:
                # Best-effort: the step simply goes without an image, but the
                # failure is recorded instead of vanishing silently.
                log.warning(
                    "Could not extract a frame at %s for step %d; leaving it without an image",
                    sd.approx_timestamp,
                    i + 1,
                    exc_info=True,
                )
                chosen = None

        image_path = None
        caption = None
        if chosen is not None:
            used.add(chosen.path)
            image_path = chosen.path
            if chosen.caption is None and do_caption:  # freshly extracted frame
                chosen.caption = (
                    vision.caption_image(chosen.path, token=token, context=step_text) or ""
                )
            # An empty caption string is reported as "no caption".
            caption = chosen.caption or None

        # Prefer the draft's own timestamp; fall back to the chosen frame's.
        if sd.approx_timestamp is not None:
            step_timestamp = sd.approx_timestamp
        else:
            step_timestamp = chosen.timestamp if chosen else None

        steps.append(
            GuideStep(
                heading=sd.heading,
                text=sd.text,
                timestamp=step_timestamp,
                image_path=image_path,
                caption=caption,
            )
        )

    if progress:
        progress(1.0, "Guide assembled.")

    return Guide(
        title=draft.title,
        intro=draft.intro,
        prerequisites=draft.prerequisites,
        steps=steps,
    )
|