Spaces:

vivekchakraverty
/

DocuMaker

Sleeping

File size: 5,497 Bytes

85b485a

"""Multimodal pass: caption frames and score them for "informativeness".

Captioning prefers a vision LLM on the HuggingFace Inference API and falls back
to a local BLIP model (only if torch/transformers are installed). Frame scoring
uses a cheap sharpness heuristic (variance of the Laplacian) so the guide builder
can prefer crisp, content-rich frames over blurry scene-transition frames.
"""
from __future__ import annotations

import base64
import io
from pathlib import Path

from . import config

# Lazily-populated state for the local captioner. These three are assigned
# together by _load_local_captioner() the first time a local caption is needed.
_LOCAL_PROC = None
_LOCAL_MODEL = None
_LOCAL_DEVICE = "cpu"
# Set to True by _caption_via_local() when loading the local captioner raises,
# so later calls return immediately instead of retrying the load.
_LOCAL_FAILED = False
# Many free HF accounts have no provider that serves a vision-chat model. Once
# the API VLM fails, stop retrying it for the session and use local BLIP.
_API_VLM_DISABLED = False

# Base instruction sent to the API vision-chat model. caption_image() appends
# optional step context to it before the request is made.
_CAPTION_PROMPT = (
    "In one concise sentence, describe what this screenshot from a tutorial shows, "
    "focusing on the on-screen UI element or the action being performed. "
    "Do not begin with phrases like 'The image shows'."
)


def _data_uri(image_path: str | Path, max_side: int = 1024) -> str:
    """Return a base64 ``data:image/jpeg`` URI for a downscaled copy of an image.

    The image is converted to RGB, thumbnailed to fit within a
    ``max_side`` x ``max_side`` box and re-encoded as JPEG (quality 85), which
    keeps the payload sent to the API small.
    """
    from PIL import Image

    jpeg_bytes = io.BytesIO()
    with Image.open(image_path) as source:
        rgb = source.convert("RGB")
        rgb.thumbnail((max_side, max_side))
        rgb.save(jpeg_bytes, format="JPEG", quality=85)
    encoded = base64.b64encode(jpeg_bytes.getvalue()).decode()
    return f"data:image/jpeg;base64,{encoded}"


def _get_vlm_client(token: str | None):
    """Build a HuggingFace ``InferenceClient`` for ``config.VLM_MODEL``.

    ``token`` and ``config.VLM_PROVIDER`` are forwarded only when truthy;
    otherwise the corresponding keyword is left out of the constructor call.
    """
    from huggingface_hub import InferenceClient

    client_args = {"model": config.VLM_MODEL}
    optional = {"token": token, "provider": config.VLM_PROVIDER}
    client_args.update({key: value for key, value in optional.items() if value})
    return InferenceClient(**client_args)


def _caption_via_api(image_path: str | Path, prompt: str, token: str | None) -> str:
    """Request a caption for ``image_path`` from the API vision-chat model.

    Sends a single user message holding the text prompt followed by the image
    as a data URI, and returns the first choice's text with surrounding
    whitespace stripped ('' when the message content is empty). Exceptions
    raised while encoding the image or calling the client are not handled here.
    """
    client = _get_vlm_client(token)
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": _data_uri(image_path)}}
    user_message = {"role": "user", "content": [text_part, image_part]}
    response = client.chat_completion(
        messages=[user_message],
        max_tokens=120,
        temperature=0.2,
    )
    content = response.choices[0].message.content
    if not content:
        return ""
    return content.strip()


def _load_local_captioner() -> None:
    """Load the local captioning model and processor into module state.

    The BLIP captioner is loaded directly (the image-to-text pipeline task was
    removed in transformers 5) from ``config.LOCAL_CAPTION_MODEL``. The model is
    moved to the GPU when a CUDA build of torch reports an available device.

    On success ``_LOCAL_PROC``, ``_LOCAL_MODEL`` and ``_LOCAL_DEVICE`` are set
    together. ``_LOCAL_DEVICE`` is only ever "cuda" when the move to the GPU
    actually succeeded, so callers can rely on it to place their inputs.

    Raises:
        Exception: whatever transformers/torch raise while importing or
            loading the model; the caller treats any failure as "no local
            captioner available".
    """
    global _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE
    from transformers import AutoProcessor

    try:
        from transformers import AutoModelForImageTextToText as _AutoCaptionModel
    except Exception:  # older transformers
        from transformers import AutoModelForVision2Seq as _AutoCaptionModel

    proc = AutoProcessor.from_pretrained(config.LOCAL_CAPTION_MODEL)
    model = _AutoCaptionModel.from_pretrained(config.LOCAL_CAPTION_MODEL)

    # Probing for CUDA is best-effort: any problem here just means "use the CPU".
    try:
        import torch

        use_cuda = bool(torch.cuda.is_available())
    except Exception:
        use_cuda = False

    device = "cpu"
    if use_cuda:
        try:
            model = model.to("cuda")
            # Record the device only after the move succeeded; otherwise later
            # calls would send inputs to a device the model is not on.
            device = "cuda"
        except Exception:
            # Module.to() works in place, so a failed move (e.g. out of GPU
            # memory) can leave the weights split across devices. Put them
            # back on the CPU; if that fails too, the error propagates and the
            # caller marks the local captioner as unavailable.
            model = model.to("cpu")

    _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE = proc, model, device


def _caption_via_local(image_path: str | Path) -> str:
    """Caption ``image_path`` with the local model, returning '' on any failure.

    The model is loaded on first use. If loading raises, the failure is
    remembered in ``_LOCAL_FAILED`` and every later call returns '' without
    retrying. Errors during inference on a single image also yield '' but do
    not set that flag.
    """
    global _LOCAL_FAILED
    needs_load = _LOCAL_MODEL is None
    if needs_load and not _LOCAL_FAILED:
        try:
            _load_local_captioner()
        except Exception:
            _LOCAL_FAILED = True
    if _LOCAL_FAILED:
        return ""

    try:
        import torch
        from PIL import Image

        with Image.open(image_path) as handle:
            frame = handle.convert("RGB")
        batch = _LOCAL_PROC(images=frame, return_tensors="pt")
        if _LOCAL_DEVICE != "cpu":
            # Inputs must live on the same device as the model.
            batch = {name: tensor.to(_LOCAL_DEVICE) for name, tensor in batch.items()}
        with torch.no_grad():
            token_ids = _LOCAL_MODEL.generate(**batch, max_new_tokens=40)
        decoded = _LOCAL_PROC.decode(token_ids[0], skip_special_tokens=True)
        return decoded.strip()
    except Exception:
        return ""


def caption_image(
    image_path: str | Path, *, token: str | None = None, context: str = ""
) -> str | None:
    """Return a one-line caption for a frame, or None if captioning is off/failed.

    With a ``token`` it tries an API vision-chat model first (if any provider
    serves one), then falls back to local BLIP. After the API VLM fails once it
    is skipped for the rest of the session to avoid repeated dead calls. Local
    BLIP needs no token.

    Args:
        image_path: Path of the frame image to caption.
        token: HuggingFace API token; when falsy the API model is not tried.
        context: Optional description of the current step. Its first 200
            characters are appended to the prompt sent to the API model.

    Returns:
        The caption text, or None when ``config.ENABLE_VISION`` is off, the
        image file does not exist, or no backend produced a caption.
    """
    global _API_VLM_DISABLED
    if not config.ENABLE_VISION:
        return None
    # A missing frame can never be captioned. Bail out before the API call so
    # a local file problem is not mistaken for "no usable provider" and does
    # not switch the API model off for every later frame.
    if not Path(image_path).is_file():
        return None

    prompt = _CAPTION_PROMPT
    if context:
        prompt += f" For context, this step is about: {context[:200]}"

    if token and not _API_VLM_DISABLED:
        try:
            caption = _caption_via_api(image_path, prompt, token)
            if caption:
                return caption
        except Exception as exc:
            _API_VLM_DISABLED = True  # no usable provider — switch to local BLIP
            import logging

            # Make the one-time downgrade visible instead of silent.
            logging.getLogger(__name__).warning(
                "API vision model failed (%s); using the local captioner for "
                "the rest of this session.",
                exc,
            )

    caption = _caption_via_local(image_path)
    return caption or None


def frame_score(image_path: str | Path) -> float:
    """Score a frame's sharpness as the variance of its Laplacian.

    The image is read as grayscale with OpenCV; larger values mean a
    crisper, more detailed frame. Returns 0.0 when the image cannot be read
    or anything else goes wrong (including OpenCV not being installed).
    """
    sharpness = 0.0
    try:
        import cv2

        gray = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if gray is not None:
            laplacian = cv2.Laplacian(gray, cv2.CV_64F)
            sharpness = float(laplacian.var())
    except Exception:
        sharpness = 0.0
    return sharpness