DocuMaker / src /guide.py
vivekchakraverty's picture
Make auto-frame extraction narration-accurate
6e3fc02
Raw
History Blame Contribute Delete
8.16 kB
"""Assemble a final guide: match a frame to each step, then caption it.
Takes the LLM's :class:`~src.llm.GuideDraft` (pure text) plus the pool of
extracted :class:`~src.frames.FrameRecord` s and produces a :class:`Guide` where
each step carries the best-matching image and a caption. If a step has a
timestamp but no pool frame close enough to it, a fresh frame is pulled from the
video (when the video path and session directory are supplied) so that the step
can still be illustrated.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable
from . import config, video, vision
from .frames import FrameRecord
from .llm import GuideDraft
@dataclass
class GuideStep:
    """A single step of the assembled guide, optionally paired with an image."""

    # Required text content of the step.
    heading: str
    text: str

    # Time associated with the step, or None when unknown.
    timestamp: float | None = None
    # Path of the image chosen to illustrate the step, or None when there is none.
    image_path: str | None = None
    # Caption shown with the image, or None when no caption is available.
    caption: str | None = None
@dataclass
class Guide:
    """The finished guide: front matter plus the ordered list of steps."""

    # Front matter.
    title: str = "Step-by-Step Guide"
    intro: str = ""

    # List fields use default_factory so instances never share one list object.
    prerequisites: list[str] = field(default_factory=list)
    steps: list[GuideStep] = field(default_factory=list)
# Scoring weights used when ranking candidate frames for a step. Closeness to the
# step's transcript/LLM timestamp is by far the most reliable signal for
# tutorials, so it dominates; the BLIP-caption overlap and the sharpness score
# only act as tie-breakers.
_W_PROX = 0.70
_W_SEM = 0.18
_W_SHARP = 0.12

# Maximum distance (seconds) between a frame and a step's LLM timestamp for the
# frame to be reused. Manual frames get the wider window because the user
# captured them on purpose; scene/auto frames must be close, otherwise a fresh
# frame is extracted at the exact step time.
_MANUAL_WINDOW = 20.0
_AUTO_WINDOW = 12.0

# Common filler words dropped by the keyword extraction so they do not count
# as overlap between a step's text and a frame caption.
_STOPWORDS = {
    "the", "a", "an",
    "to", "of", "and", "or", "in", "on", "at", "for", "with",
    "your", "you",
    "is", "are", "be",
    "this", "that", "it",
    "from", "by", "as", "into", "then",
    "will", "can", "should", "have", "has",
    "its", "their", "our",
    "up", "out", "off", "over",
    "we", "i",
}

# Memo of per-frame scores keyed by frame path, so each image is scored once.
_SHARPNESS_CACHE: dict[str, float] = {}
def _sharpness(path: str) -> float:
    """Return the ``vision.frame_score`` of the image at *path*, memoised by path.

    The score is computed on first request and stored in ``_SHARPNESS_CACHE``;
    later calls for the same path return the stored value without re-scoring.
    """
    if path in _SHARPNESS_CACHE:
        return _SHARPNESS_CACHE[path]
    score = vision.frame_score(path)
    _SHARPNESS_CACHE[path] = score
    return score
def _keywords(text: str | None) -> set[str]:
    """Return the distinct content words of *text*.

    The text is lower-cased and split into runs of ASCII letters/digits. Tokens
    of one or two characters and tokens listed in ``_STOPWORDS`` are discarded.
    ``None`` or an empty string yields an empty set.
    """
    tokens = re.findall(r"[a-z0-9]+", (text or "").lower())
    kept: set[str] = set()
    for token in tokens:
        if len(token) <= 2 or token in _STOPWORDS:
            continue
        kept.add(token)
    return kept
def _text_relevance(caption: str | None, step_text: str) -> float:
    """Score how much of a step's vocabulary appears in a frame caption (0..1).

    The result is the share of the step's keywords that also occur among the
    caption's keywords. It is 0.0 when either side has no keywords. This is
    the BLIP *suggestion* signal: it nudges selection toward a frame whose
    description overlaps the step, without letting BLIP decide alone.
    """
    wanted = _keywords(step_text)
    mentioned = _keywords(caption)
    if wanted and mentioned:
        shared = wanted.intersection(mentioned)
        return min(len(shared) / len(wanted), 1.0)
    return 0.0
def _pick_frame(
    candidates: list[FrameRecord],
    timestamp: float | None,
    step_text: str,
    used: set[str],
    spoken_range: tuple[float, float] | None = None,
) -> FrameRecord | None:
    """Choose the pool frame that best illustrates a step, or ``None``.

    Selection order:

    * Frames whose path is in *used* are skipped, unless that leaves nothing,
      in which case the whole pool is eligible again.
    * Without a step *timestamp*, the frame with the highest caption relevance
      wins, with sharpness as the tie-breaker.
    * Otherwise the closest ``"manual"`` frame within ``_MANUAL_WINDOW`` wins.
    * Otherwise frames within ``_AUTO_WINDOW`` (narrowed to *spoken_range*
      plus/minus 2 s when that leaves any) are ranked by a weighted mix of
      time proximity, caption relevance and normalised sharpness.

    ``None`` is returned when the pool is empty or no frame sits close enough in
    time; the caller can then extract a fresh frame at the exact step time,
    keeping the image aligned to the narration rather than to an unrelated
    scene change.
    """
    pool = [frame for frame in candidates if frame.path not in used]
    if not pool:
        # Everything is already used (or nothing was supplied): allow reuse.
        pool = candidates
    if not pool:
        return None

    if timestamp is None:
        # No time anchor: rank on caption relevance first, sharpness second.
        def untimed_rank(frame: FrameRecord) -> tuple[float, float]:
            return (_text_relevance(frame.caption, step_text), _sharpness(frame.path))

        return max(pool, key=untimed_rank)

    def gap(frame: FrameRecord) -> float:
        # Absolute distance between the frame and the step time.
        return abs(frame.timestamp - timestamp)

    # 1) A deliberately captured (manual) frame near the step takes priority.
    manual_hits = [
        frame
        for frame in pool
        if frame.source == "manual" and gap(frame) <= _MANUAL_WINDOW
    ]
    if manual_hits:
        return min(manual_hits, key=gap)

    # 2) Frames tightly around the step time, preferring the narrated span.
    nearby = [frame for frame in pool if gap(frame) <= _AUTO_WINDOW]
    if spoken_range:
        start, end = spoken_range
        narrated = [
            frame for frame in nearby if start - 2.0 <= frame.timestamp <= end + 2.0
        ]
        if narrated:
            nearby = narrated
    if not nearby:
        return None  # -> caller extracts a fresh frame at the exact step time

    sharp_values = [_sharpness(frame.path) for frame in nearby]
    lowest = min(sharp_values)
    highest = max(sharp_values)

    def scaled_sharpness(value: float) -> float:
        # Min-max scale within the shortlist; a flat shortlist scores 1.0.
        if highest > lowest:
            return (value - lowest) / (highest - lowest)
        return 1.0

    def weighted(frame: FrameRecord) -> float:
        closeness = 1.0 - min(gap(frame) / _AUTO_WINDOW, 1.0)
        relevance = _text_relevance(frame.caption, step_text)
        sharp = scaled_sharpness(_sharpness(frame.path))
        return _W_PROX * closeness + _W_SEM * relevance + _W_SHARP * sharp

    return max(nearby, key=weighted)
def assemble_guide(
    draft: GuideDraft,
    frames: list[FrameRecord],
    *,
    video_path: str | Path | None = None,
    session_dir: str | Path | None = None,
    do_caption: bool = True,
    token: str | None = None,
    spoken_range: tuple[float, float] | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> Guide:
    """Combine a guide draft with frames into a fully illustrated :class:`Guide`.

    When captioning is on, every pool frame that has no caption yet is captioned
    once up front so BLIP can both *suggest* the most relevant frame per step
    and supply the figure captions.

    Args:
        draft: Text-only draft. Its ``title``, ``intro`` and ``prerequisites``
            are copied to the result; each entry of ``steps`` supplies
            ``heading``, ``text`` and ``approx_timestamp``.
        frames: Pool of candidate frames. The list itself is not reordered, but
            records without a caption have ``caption`` filled in place when
            *do_caption* is true.
        video_path: Source video used to extract a fresh frame for a step that
            has a timestamp but no suitable pool frame.
        session_dir: Directory under which fresh frames are written (in its
            ``frames`` sub-directory). Fresh extraction only happens when both
            *video_path* and *session_dir* are given.
        do_caption: Whether to caption frames via ``vision.caption_image``.
        token: Passed through to ``vision.caption_image``.
        spoken_range: First/last narration time; keeps selection inside the
            narrated portion of the video.
        progress: Optional ``callback(fraction, message)``. Captioning reports
            fractions from 0.05 up to 0.5, step matching from 0.5 up to 1.0.

    Returns:
        A :class:`Guide` with one :class:`GuideStep` per draft step. A step's
        ``image_path`` and ``caption`` are ``None`` when no frame could be
        found or extracted for it.
    """
    # Function-scope import so a failed extraction can be reported, not hidden.
    import logging

    frames_sorted = sorted(frames, key=lambda f: f.timestamp)

    # Caption the pool up front (once per frame, context-free to keep the
    # relevance signal unbiased) so captions feed both selection and figures.
    if do_caption and frames_sorted:
        for i, rec in enumerate(frames_sorted):
            if rec.caption is None:
                if progress:
                    progress(
                        0.05 + 0.45 * (i / len(frames_sorted)),
                        f"Captioning frame {i + 1}/{len(frames_sorted)}…",
                    )
                # Store "" rather than None so the frame is not captioned again.
                rec.caption = vision.caption_image(rec.path, token=token) or ""

    used: set[str] = set()
    steps: list[GuideStep] = []
    total = max(len(draft.steps), 1)  # avoid dividing by zero for an empty draft

    for i, sd in enumerate(draft.steps):
        if progress:
            progress(0.5 + 0.5 * (i / total), f"Matching image to step {i + 1}/{total}…")

        step_text = f"{sd.heading} {sd.text}".strip()
        chosen = _pick_frame(frames_sorted, sd.approx_timestamp, step_text, used, spoken_range)

        # No suitable frame nearby — extract one at the step timestamp.
        if (
            chosen is None
            and video_path
            and session_dir
            and sd.approx_timestamp is not None
        ):
            out = Path(session_dir) / "frames" / f"step_{i:03d}_{int(sd.approx_timestamp * 1000):08d}.png"
            try:
                video.extract_frame(video_path, sd.approx_timestamp, out)
                chosen = FrameRecord(path=str(out), timestamp=sd.approx_timestamp, source="auto")
            except Exception:
                # Extraction stays best-effort (the step simply gets no image),
                # but the failure is logged so it can be diagnosed.
                logging.getLogger(__name__).warning(
                    "Could not extract a frame at %ss for step %d; "
                    "the step will have no image.",
                    sd.approx_timestamp,
                    i + 1,
                    exc_info=True,
                )
                chosen = None

        image_path = None
        caption = None
        if chosen is not None:
            used.add(chosen.path)
            image_path = chosen.path
            if chosen.caption is None and do_caption:  # freshly extracted frame
                chosen.caption = (
                    vision.caption_image(chosen.path, token=token, context=step_text) or ""
                )
            # An empty caption is reported as "no caption".
            caption = chosen.caption or None

        # Prefer the draft's timestamp; fall back to the chosen frame's time.
        if sd.approx_timestamp is not None:
            step_timestamp = sd.approx_timestamp
        else:
            step_timestamp = chosen.timestamp if chosen else None

        steps.append(
            GuideStep(
                heading=sd.heading,
                text=sd.text,
                timestamp=step_timestamp,
                image_path=image_path,
                caption=caption,
            )
        )

    if progress:
        progress(1.0, "Guide assembled.")

    return Guide(
        title=draft.title,
        intro=draft.intro,
        prerequisites=draft.prerequisites,
        steps=steps,
    )