Spaces:

vivekchakraverty
/

DocuMaker

Running

File size: 8,160 Bytes

"""Assemble a final guide: match a frame to each step, then caption it.

Takes the LLM's :class:`~src.llm.GuideDraft` (pure text) plus the pool of
extracted :class:`~src.frames.FrameRecord` s and produces a :class:`Guide` where
each step carries the best-matching image and a caption. If a step has a
timestamp but no nearby frame, a fresh frame is pulled from the video (when the
video path and session directory are supplied) so the step can still be
illustrated.
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable

from . import config, video, vision
from .frames import FrameRecord
from .llm import GuideDraft


@dataclass
class GuideStep:
    """One step of the finished guide: its text plus an optional illustration."""

    # Step title and body text, copied from the corresponding draft step.
    heading: str
    text: str
    # Position of the step in the video, in seconds. ``assemble_guide`` uses the
    # draft's approximate timestamp and falls back to the chosen frame's
    # timestamp; ``None`` when neither is available.
    timestamp: float | None = None
    # Path of the frame picked to illustrate the step; ``None`` if no frame
    # could be matched or extracted.
    image_path: str | None = None
    # Caption of the chosen frame; ``None`` when there is no frame or its
    # caption is missing/empty.
    caption: str | None = None


@dataclass
class Guide:
    """The assembled guide: title, intro, prerequisites and illustrated steps."""

    # Title and introduction; ``assemble_guide`` copies both from the draft.
    title: str = "Step-by-Step Guide"
    intro: str = ""
    # Prerequisite items, taken from the draft.
    prerequisites: list[str] = field(default_factory=list)
    # Steps in guide order, each optionally carrying an image and a caption.
    steps: list[GuideStep] = field(default_factory=list)


# Scoring weights used when several frames sit near a step. How close a frame is
# to the step's (transcript/LLM) timestamp is the most trustworthy signal for
# tutorials, so it dominates; caption overlap and sharpness merely break ties.
_W_PROX = 0.70
_W_SEM = 0.18
_W_SHARP = 0.12

# Maximum distance, in seconds, between a frame and a step's LLM timestamp for
# the frame to be reused. Manual captures were taken deliberately by the user and
# so get the wider window; scene/auto frames have to be close, otherwise a fresh
# frame is extracted at the exact step time instead.
_MANUAL_WINDOW = 20.0
_AUTO_WINDOW = 12.0

# Filler words ignored when comparing a step's text with a frame caption.
# ``_keywords`` already discards tokens of one or two characters, so the short
# entries below are redundant there but harmless.
_STOPWORDS = set(
    "the a an to of and or in on at for with "
    "your you is are be this that it from by as "
    "into then will can should have has its their our "
    "up out off over we i".split()
)

# Memo of sharpness scores keyed by frame path, filled lazily by ``_sharpness``.
_SHARPNESS_CACHE: dict[str, float] = {}


def _sharpness(path: str) -> float:
    """Return the sharpness score of the frame at ``path``.

    The score comes from ``vision.frame_score`` and is remembered per path, so
    each frame is scored at most once for the lifetime of the module.
    """
    if path in _SHARPNESS_CACHE:
        return _SHARPNESS_CACHE[path]
    value = vision.frame_score(path)
    _SHARPNESS_CACHE[path] = value
    return value


def _keywords(text: str | None) -> set[str]:
    """Return the distinct content words of ``text``.

    The text is lower-cased and split into runs of ASCII letters and digits;
    tokens of one or two characters and stopwords are dropped. ``None`` and the
    empty string both yield an empty set.
    """
    lowered = text.lower() if text else ""
    found: set[str] = set()
    for token in re.findall(r"[a-z0-9]+", lowered):
        if len(token) <= 2:
            continue
        if token in _STOPWORDS:
            continue
        found.add(token)
    return found


def _text_relevance(caption: str | None, step_text: str) -> float:
    """Score how much of a step's vocabulary a BLIP caption covers (0..1).

    The result is the share of the step's keywords that also occur in the
    caption. It acts as BLIP's *suggestion*: a nudge toward frames whose
    description overlaps the step, never the sole deciding factor. When either
    side has no keywords the score is ``0.0``.
    """
    wanted = _keywords(step_text)
    seen = _keywords(caption)
    if not (wanted and seen):
        return 0.0
    coverage = len(wanted & seen) / len(wanted)
    return min(coverage, 1.0)


def _pick_frame(
    candidates: list[FrameRecord],
    timestamp: float | None,
    step_text: str,
    used: set[str],
    spoken_range: tuple[float, float] | None = None,
) -> FrameRecord | None:
    """Choose the pool frame that best illustrates one step.

    Selection rules, in order:

    * Frames whose path is in ``used`` are skipped — unless that leaves nothing,
      in which case the whole pool becomes eligible again.
    * Without a ``timestamp``, the frame with the highest caption relevance
      wins, sharpness breaking ties.
    * With a ``timestamp``, the closest ``"manual"`` frame within
      ``_MANUAL_WINDOW`` seconds wins outright (deliberate user intent).
    * Otherwise frames within ``_AUTO_WINDOW`` seconds compete on a weighted
      blend of proximity, caption relevance and normalised sharpness. When
      ``spoken_range`` is given and some of those frames fall inside it (with
      2 s of slack on either side), only those compete.

    Returns ``None`` for an empty pool or when no frame lies close enough in
    time; the caller can then extract a fresh frame at the exact step time so
    the image follows the narration rather than an unrelated scene change.
    """
    pool = [rec for rec in candidates if rec.path not in used]
    if not pool:
        # Every frame is already taken: permit reuse rather than giving up.
        pool = candidates
    if not pool:
        return None

    if timestamp is None:
        # Nothing to anchor on in time: rank by caption overlap, then sharpness.
        def untimed_rank(rec: FrameRecord) -> tuple[float, float]:
            return _text_relevance(rec.caption, step_text), _sharpness(rec.path)

        return max(pool, key=untimed_rank)

    def gap(rec: FrameRecord) -> float:
        return abs(rec.timestamp - timestamp)

    # Rule 1: a manual capture near the step beats everything else.
    manual_hits = [
        rec for rec in pool
        if rec.source == "manual" and gap(rec) <= _MANUAL_WINDOW
    ]
    if manual_hits:
        return min(manual_hits, key=gap)

    # Rule 2: frames tightly around the step time, preferring narrated ones.
    nearby = [rec for rec in pool if gap(rec) <= _AUTO_WINDOW]
    if spoken_range:
        speech_start, speech_end = spoken_range
        narrated = [
            rec for rec in nearby
            if speech_start - 2.0 <= rec.timestamp <= speech_end + 2.0
        ]
        if narrated:
            nearby = narrated
    if not nearby:
        return None  # the caller extracts a frame at the exact step time

    sharpness_values = [_sharpness(rec.path) for rec in nearby]
    lowest, highest = min(sharpness_values), max(sharpness_values)

    def relative_sharpness(value: float) -> float:
        # All contenders equally sharp -> nobody is penalised.
        if highest > lowest:
            return (value - lowest) / (highest - lowest)
        return 1.0

    def blended(rec: FrameRecord) -> float:
        closeness = 1.0 - min(gap(rec) / _AUTO_WINDOW, 1.0)
        overlap = _text_relevance(rec.caption, step_text)
        crispness = relative_sharpness(_sharpness(rec.path))
        return _W_PROX * closeness + _W_SEM * overlap + _W_SHARP * crispness

    return max(nearby, key=blended)


def _caption_pool(
    pool: list[FrameRecord],
    token: str | None,
    progress: Callable[[float, str], None] | None,
) -> None:
    """Give every still-uncaptioned frame in ``pool`` a caption, in place."""
    count = len(pool)
    for index, rec in enumerate(pool):
        if rec.caption is not None:
            continue
        if progress:
            progress(
                0.05 + 0.45 * (index / count),
                f"Captioning frame {index + 1}/{count}…",
            )
        # No step context is passed here, so the caption stays an unbiased
        # relevance signal. A falsy result is stored as "" so the frame is not
        # captioned again.
        rec.caption = vision.caption_image(rec.path, token=token) or ""


def _extract_step_frame(
    video_path: str | Path,
    session_dir: str | Path,
    index: int,
    timestamp: float,
) -> FrameRecord | None:
    """Extract a frame at ``timestamp`` for step ``index``; ``None`` on failure."""
    import logging

    out = Path(session_dir) / "frames" / f"step_{index:03d}_{int(timestamp * 1000):08d}.png"
    try:
        # Ensure the target directory exists. NOTE(review): video.extract_frame
        # may already do this; the call is a no-op in that case.
        out.parent.mkdir(parents=True, exist_ok=True)
        video.extract_frame(video_path, timestamp, out)
        return FrameRecord(path=str(out), timestamp=timestamp, source="auto")
    except Exception:
        # Best effort: a step without an image is acceptable, but the failure
        # must be visible in the logs instead of vanishing silently.
        logging.getLogger(__name__).warning(
            "Could not extract a frame at %ss for step %d; leaving it without an image.",
            timestamp,
            index + 1,
            exc_info=True,
        )
        return None


def assemble_guide(
    draft: GuideDraft,
    frames: list[FrameRecord],
    *,
    video_path: str | Path | None = None,
    session_dir: str | Path | None = None,
    do_caption: bool = True,
    token: str | None = None,
    spoken_range: tuple[float, float] | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> Guide:
    """Combine a guide draft with frames into a fully illustrated :class:`Guide`.

    Args:
        draft: The LLM's text-only guide (title, intro, prerequisites, steps).
        frames: Pool of extracted frames. Records without a caption are
            captioned in place when ``do_caption`` is true.
        video_path: Source video. Together with ``session_dir`` it enables
            extracting a fresh frame for a timestamped step that has no
            suitable pool frame.
        session_dir: Directory under which extracted frames are written, in a
            ``frames`` sub-directory.
        do_caption: Caption the pool up front (once per frame) so captions feed
            both frame selection and the figure captions; freshly extracted
            frames are captioned with the step text as context.
        token: Passed through to ``vision.caption_image``.
        spoken_range: First/last narration time; keeps selection inside the
            narrated portion of the video.
        progress: Optional callback receiving a fraction in ``0..1`` and a
            status message.

    Returns:
        A :class:`Guide` with one :class:`GuideStep` per draft step. A step
        that could not be illustrated has ``image_path`` and ``caption`` set to
        ``None``. The guide's ``prerequisites`` is a copy, so editing the guide
        never changes the draft.
    """
    frames_sorted = sorted(frames, key=lambda f: f.timestamp)

    if do_caption and frames_sorted:
        _caption_pool(frames_sorted, token, progress)

    used: set[str] = set()
    steps: list[GuideStep] = []
    total = max(len(draft.steps), 1)  # avoids dividing by zero for an empty draft

    for i, sd in enumerate(draft.steps):
        if progress:
            progress(0.5 + 0.5 * (i / total), f"Matching image to step {i + 1}/{total}…")

        step_text = f"{sd.heading} {sd.text}".strip()
        chosen = _pick_frame(frames_sorted, sd.approx_timestamp, step_text, used, spoken_range)

        # No suitable frame nearby — extract one at the step timestamp.
        if (
            chosen is None
            and video_path
            and session_dir
            and sd.approx_timestamp is not None
        ):
            chosen = _extract_step_frame(video_path, session_dir, i, sd.approx_timestamp)

        image_path = None
        caption = None
        if chosen is not None:
            used.add(chosen.path)
            image_path = chosen.path
            if chosen.caption is None and do_caption:  # freshly extracted frame
                chosen.caption = (
                    vision.caption_image(chosen.path, token=token, context=step_text) or ""
                )
            # An empty caption is reported as "no caption".
            caption = chosen.caption or None

        # Prefer the draft's own timestamp; otherwise borrow the frame's.
        timestamp = sd.approx_timestamp
        if timestamp is None and chosen is not None:
            timestamp = chosen.timestamp

        steps.append(
            GuideStep(
                heading=sd.heading,
                text=sd.text,
                timestamp=timestamp,
                image_path=image_path,
                caption=caption,
            )
        )

    if progress:
        progress(1.0, "Guide assembled.")

    return Guide(
        title=draft.title,
        intro=draft.intro,
        # Copy so the guide and the draft do not share one mutable list.
        prerequisites=list(draft.prerequisites or []),
        steps=steps,
    )