Spaces:
Running
Running
| """Assemble a final guide: match a frame to each step, then caption it. | |
| Takes the LLM's :class:`~src.llm.GuideDraft` (pure text) plus the pool of | |
| extracted :class:`~src.frames.FrameRecord` s and produces a :class:`Guide` where | |
| each step carries the best-matching image and a caption. If a step has a | |
| timestamp but no nearby frame, a fresh frame is pulled from the video so every | |
| step can be illustrated. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Callable | |
| from . import config, video, vision | |
| from .frames import FrameRecord | |
| from .llm import GuideDraft | |
@dataclass
class GuideStep:
    """One step of the assembled guide: its text plus an optional illustration.

    Instances are built with keyword arguments, so the class must be a
    dataclass for the annotated fields below to become constructor parameters.
    """

    heading: str  # step title, copied from the draft step
    text: str  # step body, copied from the draft step
    timestamp: float | None = None  # position in the video (seconds), when known
    image_path: str | None = None  # path of the frame chosen to illustrate the step
    caption: str | None = None  # caption of that frame; None when there is none
@dataclass
class Guide:
    """The final guide: title, intro, prerequisites and the ordered steps.

    The list fields use ``field(default_factory=list)`` so every instance gets
    its own fresh list; that only takes effect when the class is a dataclass.
    """

    title: str = "Step-by-Step Guide"
    intro: str = ""
    prerequisites: list[str] = field(default_factory=list)
    steps: list[GuideStep] = field(default_factory=list)
# Relevance weights used when scoring candidate frames for a step. Timestamp
# proximity (transcript/LLM time) is by far the most reliable signal for
# tutorials, so it dominates; the BLIP-caption match and sharpness only break ties.
_W_PROX = 0.70
_W_SEM = 0.18
_W_SHARP = 0.12

# Maximum distance (seconds) between a frame and a step's LLM timestamp for the
# frame to be reused. Manual frames get the wider window because the user
# captured them on purpose; scene/auto frames must be close, otherwise a fresh
# frame is extracted at the exact step time instead.
_MANUAL_WINDOW = 20.0
_AUTO_WINDOW = 12.0

# Filler words dropped before comparing a step's text with a frame caption.
_STOPWORDS = set(
    "the a an to of and or in on at for with "
    "your you is are be this that it from by as "
    "into then will can should have has its their our "
    "up out off over we i".split()
)
# Memo of sharpness scores keyed by frame path, so each image is scored at most once.
_SHARPNESS_CACHE: dict[str, float] = {}


def _sharpness(path: str) -> float:
    """Return the score ``vision.frame_score`` gives *path*, computing it only once.

    The first request for a path stores the score in ``_SHARPNESS_CACHE``;
    later requests for the same path are answered from that cache.
    """
    try:
        return _SHARPNESS_CACHE[path]
    except KeyError:
        score = vision.frame_score(path)
        _SHARPNESS_CACHE[path] = score
        return score
def _keywords(text: str | None) -> set[str]:
    """Return the distinct meaningful words of *text*, lower-cased.

    Words are runs of ASCII letters/digits. Words of one or two characters and
    anything listed in ``_STOPWORDS`` are dropped. ``None`` or empty text
    yields an empty set.
    """
    lowered = (text or "").lower()
    found: set[str] = set()
    for token in re.findall(r"[a-z0-9]+", lowered):
        # Skip very short tokens and filler words; they carry no signal.
        if len(token) <= 2 or token in _STOPWORDS:
            continue
        found.add(token)
    return found
def _text_relevance(caption: str | None, step_text: str) -> float:
    """Return the share of the step's keywords found in the caption (0..1).

    This is the BLIP *suggestion* signal: a frame whose description overlaps
    the step gets a nudge, but the caption never decides the match alone.
    The result is ``0.0`` when either side has no keywords.
    """
    wanted = _keywords(step_text)
    mentioned = _keywords(caption)
    if not (wanted and mentioned):
        return 0.0
    shared = wanted.intersection(mentioned)
    ratio = len(shared) / len(wanted)
    return min(ratio, 1.0)
def _pick_frame(
    candidates: list[FrameRecord],
    timestamp: float | None,
    step_text: str,
    used: set[str],
    spoken_range: tuple[float, float] | None = None,
) -> FrameRecord | None:
    """Choose the best pool frame for a step, anchored to the step's LLM timestamp.

    Frames whose path is already in *used* are skipped unless every candidate
    has been used, in which case reuse is allowed. Selection order:

    * no timestamp: highest caption relevance, ties broken by sharpness;
    * a manual frame within ``_MANUAL_WINDOW`` seconds: the closest one wins;
    * otherwise frames within ``_AUTO_WINDOW`` seconds (preferring those inside
      *spoken_range*, padded by 2 s) are ranked by a weighted mix of
      proximity, caption relevance and normalised sharpness.

    Returns ``None`` when the pool is empty or no frame is close enough in
    time; the caller then extracts a fresh frame at the exact step timestamp,
    keeping the image aligned with the narration.
    """
    unused = [frame for frame in candidates if frame.path not in used]
    pool = unused if unused else candidates
    if not pool:
        return None

    # Step without an LLM timestamp: caption relevance first, sharpness second.
    if timestamp is None:
        def untimed_rank(frame: FrameRecord) -> tuple[float, float]:
            return (_text_relevance(frame.caption, step_text), _sharpness(frame.path))

        return max(pool, key=untimed_rank)

    def gap(frame: FrameRecord) -> float:
        """Absolute distance (seconds) between the frame and the step time."""
        return abs(frame.timestamp - timestamp)

    # 1) A deliberately captured manual frame near the step beats everything else.
    manual_hits = [
        frame for frame in pool
        if frame.source == "manual" and gap(frame) <= _MANUAL_WINDOW
    ]
    if manual_hits:
        return min(manual_hits, key=gap)

    # 2) Any frame tightly around the step time, preferably inside the narration.
    nearby = [frame for frame in pool if gap(frame) <= _AUTO_WINDOW]
    if spoken_range:
        speech_start, speech_end = spoken_range
        narrated = [
            frame for frame in nearby
            if speech_start - 2.0 <= frame.timestamp <= speech_end + 2.0
        ]
        if narrated:
            nearby = narrated
    if not nearby:
        return None  # caller extracts a fresh frame at the exact step time

    # Sharpness is normalised over the shortlisted frames only.
    sharp_values = [_sharpness(frame.path) for frame in nearby]
    lowest = min(sharp_values)
    highest = max(sharp_values)

    def total_score(frame: FrameRecord) -> float:
        closeness = 1.0 - min(gap(frame) / _AUTO_WINDOW, 1.0)
        relevance = _text_relevance(frame.caption, step_text)
        if highest > lowest:
            crispness = (_sharpness(frame.path) - lowest) / (highest - lowest)
        else:
            # All shortlisted frames are equally sharp: give full credit.
            crispness = 1.0
        return _W_PROX * closeness + _W_SEM * relevance + _W_SHARP * crispness

    return max(nearby, key=total_score)
def _caption_frame_pool(
    frames: list[FrameRecord],
    token: str | None,
    progress: Callable[[float, str], None] | None,
) -> None:
    """Caption, in place, every pool frame that has no caption yet.

    Captions are requested without step context so the relevance signal stays
    unbiased. A failed/empty caption is stored as ``""`` so the frame is not
    captioned again. Progress is reported in the 0.05-0.50 range.
    """
    count = len(frames)
    for index, record in enumerate(frames):
        if record.caption is not None:
            continue
        if progress:
            progress(
                0.05 + 0.45 * (index / count),
                f"Captioning frame {index + 1}/{count}…",
            )
        record.caption = vision.caption_image(record.path, token=token) or ""


def _extract_frame_for_step(
    video_path: str | Path,
    session_dir: str | Path,
    index: int,
    timestamp: float,
) -> FrameRecord | None:
    """Pull a fresh frame from the video at *timestamp* for step *index*.

    The image is written to ``<session_dir>/frames/step_<index>_<ms>.png``.
    Extraction is best-effort: any failure yields ``None`` so the step is
    simply left without an image.
    """
    out = Path(session_dir) / "frames" / f"step_{index:03d}_{int(timestamp * 1000):08d}.png"
    try:
        # Make sure the target directory exists; without this a missing
        # "frames" folder would fail silently inside the broad except below.
        out.parent.mkdir(parents=True, exist_ok=True)
        video.extract_frame(video_path, timestamp, out)
        return FrameRecord(path=str(out), timestamp=timestamp, source="auto")
    except Exception:
        return None


def assemble_guide(
    draft: GuideDraft,
    frames: list[FrameRecord],
    *,
    video_path: str | Path | None = None,
    session_dir: str | Path | None = None,
    do_caption: bool = True,
    token: str | None = None,
    spoken_range: tuple[float, float] | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> Guide:
    """Combine a guide draft with frames into a fully illustrated :class:`Guide`.

    When captioning is on, the whole frame pool is captioned once up front so
    captions can both *suggest* the most relevant frame per step and serve as
    figure captions. For each draft step the best pool frame is picked; if none
    is close enough and the step has a timestamp, a fresh frame is extracted
    from the video (requires both *video_path* and *session_dir*).

    Parameters:
        draft: text-only guide (title, intro, prerequisites, steps).
        frames: pool of extracted frames; their ``caption`` may be filled in place.
        video_path: source video used for fallback frame extraction.
        session_dir: directory under which fallback frames are written.
        do_caption: whether to caption frames at all.
        token: passed through to ``vision.caption_image``.
        spoken_range: first/last narration time; keeps selection inside the
            narrated portion of the video.
        progress: optional ``(fraction, message)`` callback.

    Returns:
        The assembled :class:`Guide`; steps without a usable frame have
        ``image_path`` and ``caption`` set to ``None``.
    """
    frames_sorted = sorted(frames, key=lambda f: f.timestamp)

    if do_caption and frames_sorted:
        _caption_frame_pool(frames_sorted, token, progress)

    can_extract = bool(video_path and session_dir)
    used: set[str] = set()
    steps: list[GuideStep] = []
    total = max(len(draft.steps), 1)

    for i, sd in enumerate(draft.steps):
        if progress:
            progress(0.5 + 0.5 * (i / total), f"Matching image to step {i + 1}/{total}…")

        step_text = f"{sd.heading} {sd.text}".strip()
        chosen = _pick_frame(frames_sorted, sd.approx_timestamp, step_text, used, spoken_range)

        # No suitable pool frame nearby: extract one at the step timestamp.
        if chosen is None and can_extract and sd.approx_timestamp is not None:
            chosen = _extract_frame_for_step(video_path, session_dir, i, sd.approx_timestamp)

        image_path = None
        caption = None
        if chosen is not None:
            used.add(chosen.path)
            image_path = chosen.path
            # Only a freshly extracted frame can still be uncaptioned here; it
            # is captioned with the step text as context.
            if chosen.caption is None and do_caption:
                chosen.caption = (
                    vision.caption_image(chosen.path, token=token, context=step_text) or ""
                )
            caption = chosen.caption or None

        # Prefer the draft's timestamp; fall back to the chosen frame's time.
        timestamp = sd.approx_timestamp
        if timestamp is None and chosen is not None:
            timestamp = chosen.timestamp

        steps.append(
            GuideStep(
                heading=sd.heading,
                text=sd.text,
                timestamp=timestamp,
                image_path=image_path,
                caption=caption,
            )
        )

    if progress:
        progress(1.0, "Guide assembled.")

    return Guide(
        title=draft.title,
        intro=draft.intro,
        prerequisites=draft.prerequisites,
        steps=steps,
    )