"""Multimodal pass: caption frames and score them for "informativeness".

Captioning prefers a vision LLM on the HuggingFace Inference API and falls back
to a local BLIP model (only if torch/transformers are installed). Frame scoring
uses a cheap sharpness heuristic (variance of the Laplacian) so the guide
builder can prefer crisp, content-rich frames over blurry scene-transition
frames.
"""

from __future__ import annotations

import base64
import io
import logging
from pathlib import Path

from . import config

_logger = logging.getLogger(__name__)

# Lazily-initialised local captioner state (populated by _load_local_captioner).
_LOCAL_PROC = None
_LOCAL_MODEL = None
_LOCAL_DEVICE = "cpu"
# Set once loading the local model fails, so it is not re-attempted per frame.
_LOCAL_FAILED = False

# Many free HF accounts have no provider that serves a vision-chat model. Once
# the API VLM fails, stop retrying it for the session and use local BLIP.
_API_VLM_DISABLED = False

_CAPTION_PROMPT = (
    "In one concise sentence, describe what this screenshot from a tutorial shows, "
    "focusing on the on-screen UI element or the action being performed. "
    "Do not begin with phrases like 'The image shows'."
)


def _data_uri(image_path: str | Path, max_side: int = 1024) -> str:
    """Downscale + JPEG-encode an image into a data URI (saves API bandwidth).

    Args:
        image_path: Image file to encode.
        max_side: Bounding-box size in pixels passed to ``Image.thumbnail``
            for both width and height.

    Returns:
        A ``data:image/jpeg;base64,...`` string.

    Raises:
        Whatever Pillow raises when the file is missing or not a readable
        image (and ``ImportError`` if Pillow is not installed).
    """
    from PIL import Image

    with Image.open(image_path) as src:
        rgb = src.convert("RGB")
        rgb.thumbnail((max_side, max_side))
        buf = io.BytesIO()
        rgb.save(buf, format="JPEG", quality=85)
    encoded = base64.b64encode(buf.getvalue()).decode()
    return "data:image/jpeg;base64," + encoded


def _get_vlm_client(token: str | None):
    """Build an ``InferenceClient`` for ``config.VLM_MODEL``.

    ``token`` and ``config.VLM_PROVIDER`` are only forwarded when truthy, so
    the client's own defaults apply otherwise.
    """
    from huggingface_hub import InferenceClient

    kwargs = {"model": config.VLM_MODEL}
    if token:
        kwargs["token"] = token
    if config.VLM_PROVIDER:
        kwargs["provider"] = config.VLM_PROVIDER
    return InferenceClient(**kwargs)


def _caption_via_api(
    image_path: str | Path,
    prompt: str,
    token: str | None,
    *,
    image_url: str | None = None,
) -> str:
    """Caption an image with the remote vision-chat model.

    Args:
        image_path: Image file to caption.
        prompt: Instruction text sent alongside the image.
        token: HuggingFace API token (may be ``None``).
        image_url: Optional pre-encoded data URI for ``image_path``. When
            given, the file is not read again; this lets the caller tell
            "the image is unreadable" apart from "the API call failed".

    Returns:
        The stripped caption text, or ``''`` if the model returned no content.

    Raises:
        Any exception raised by the client or by image encoding.
    """
    if image_url is None:
        image_url = _data_uri(image_path)
    client = _get_vlm_client(token)
    resp = client.chat_completion(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        max_tokens=120,
        temperature=0.2,
    )
    return (resp.choices[0].message.content or "").strip()


def _load_local_captioner() -> None:
    """Load the BLIP captioner directly (the image-to-text pipeline task was
    removed in transformers 5). Uses the GPU if a CUDA build of torch is present.

    On success the processor, model and device are stored in the module-level
    ``_LOCAL_PROC`` / ``_LOCAL_MODEL`` / ``_LOCAL_DEVICE``. Import or download
    errors propagate to the caller.
    """
    global _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE
    from transformers import AutoProcessor

    try:
        from transformers import AutoModelForImageTextToText as _AutoCaptionModel
    except Exception:  # older transformers
        from transformers import AutoModelForVision2Seq as _AutoCaptionModel

    proc = AutoProcessor.from_pretrained(config.LOCAL_CAPTION_MODEL)
    model = _AutoCaptionModel.from_pretrained(config.LOCAL_CAPTION_MODEL)

    device = "cpu"
    try:
        import torch

        if torch.cuda.is_available():
            model = model.to("cuda")
            # Record the device only after the move succeeded; otherwise the
            # inputs would later be sent to a device the model is not on.
            device = "cuda"
    except Exception as exc:
        _logger.debug("Could not move local captioner to CUDA, using CPU: %s", exc)

    _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE = proc, model, device


def _caption_via_local(image_path: str | Path) -> str:
    """Local BLIP captioner. Returns '' if torch/transformers are unavailable.

    The model is loaded on first use. If loading fails, the failure is
    remembered in ``_LOCAL_FAILED`` and every later call returns ``''``
    immediately. A failure on an individual image also yields ``''`` but does
    not disable the captioner.
    """
    global _LOCAL_FAILED
    if _LOCAL_FAILED:
        return ""

    if _LOCAL_MODEL is None:
        try:
            _load_local_captioner()
        except Exception as exc:
            _LOCAL_FAILED = True
            _logger.warning("Local captioner unavailable, captions disabled: %s", exc)
            return ""

    try:
        import torch
        from PIL import Image

        with Image.open(image_path) as im:
            img = im.convert("RGB")
        inputs = _LOCAL_PROC(images=img, return_tensors="pt")
        if _LOCAL_DEVICE != "cpu":
            inputs = {k: v.to(_LOCAL_DEVICE) for k, v in inputs.items()}
        with torch.no_grad():
            out = _LOCAL_MODEL.generate(**inputs, max_new_tokens=40)
        return _LOCAL_PROC.decode(out[0], skip_special_tokens=True).strip()
    except Exception as exc:
        _logger.debug("Local captioning failed for %s: %s", image_path, exc)
        return ""


def caption_image(
    image_path: str | Path, *, token: str | None = None, context: str = ""
) -> str | None:
    """Return a one-line caption for a frame, or None if captioning is off/failed.

    With a ``token`` it tries an API vision-chat model first (if any provider
    serves one), then falls back to local BLIP. After the API VLM fails once it
    is skipped for the rest of the session to avoid repeated dead calls. A
    frame that cannot be read or encoded does not count as an API failure.
    Local BLIP needs no token.

    Args:
        image_path: Frame image to caption.
        token: HuggingFace API token; when falsy only local BLIP is used.
        context: Optional description of the current step; its first 200
            characters are appended to the prompt sent to the API model.

    Returns:
        The caption, or ``None`` when ``config.ENABLE_VISION`` is falsy or no
        captioner produced any text.
    """
    global _API_VLM_DISABLED
    if not config.ENABLE_VISION:
        return None

    prompt = _CAPTION_PROMPT
    if context:
        prompt += f" For context, this step is about: {context[:200]}"

    if token and not _API_VLM_DISABLED:
        # Encode separately from the API call so that a single bad frame does
        # not permanently disable the remote model.
        try:
            image_url = _data_uri(image_path)
        except Exception as exc:
            _logger.debug("Could not encode %s for the vision API: %s", image_path, exc)
            image_url = None

        if image_url is not None:
            try:
                caption = _caption_via_api(
                    image_path, prompt, token, image_url=image_url
                )
            except Exception as exc:
                _API_VLM_DISABLED = True  # no usable provider — switch to local BLIP
                _logger.warning(
                    "Vision API captioning failed, using local captioner "
                    "for the rest of the session: %s",
                    exc,
                )
            else:
                if caption:
                    return caption

    caption = _caption_via_local(image_path)
    return caption or None


def frame_score(image_path: str | Path) -> float:
    """Sharpness score (variance of Laplacian). Higher = crisper/more detailed.

    Returns ``0.0`` when OpenCV is not installed, the file cannot be read as an
    image, or any other error occurs.
    """
    try:
        import cv2

        gray = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if gray is None:
            return 0.0
        return float(cv2.Laplacian(gray, cv2.CV_64F).var())
    except Exception:
        return 0.0