DocuMaker / src /vision.py
vivekchakraverty's picture
DocuMaker: video to step-by-step DOCX guide (Whisper + HF LLM + BLIP)
85b485a
Raw
History Blame Contribute Delete
5.5 kB
"""Multimodal pass: caption frames and score them for "informativeness".
Captioning prefers a vision LLM on the HuggingFace Inference API and falls back
to a local BLIP model (only if torch/transformers are installed). Frame scoring
uses a cheap sharpness heuristic (variance of the Laplacian) so the guide builder
can prefer crisp, content-rich frames over blurry scene-transition frames.
"""
from __future__ import annotations
import base64
import io
from pathlib import Path
from . import config
# Lazily-populated cache for the local captioner. _load_local_captioner() fills
# these in on first use so importing this module does not load any model.
_LOCAL_PROC = None  # processor used to build model inputs and decode outputs
_LOCAL_MODEL = None  # captioning model; None means "not loaded yet"
_LOCAL_DEVICE = "cpu"  # device string the loaded model was placed on
# Set by _caption_via_local() when loading fails, so the load is attempted at
# most once per session instead of once per frame.
_LOCAL_FAILED = False
# Many free HF accounts have no provider that serves a vision-chat model. Once
# the API VLM fails, stop retrying it for the session and use local BLIP.
_API_VLM_DISABLED = False
# Base instruction sent to the vision-chat model; caption_image() may append
# step context to it before the request is made.
_CAPTION_PROMPT = (
"In one concise sentence, describe what this screenshot from a tutorial shows, "
"focusing on the on-screen UI element or the action being performed. "
"Do not begin with phrases like 'The image shows'."
)
def _data_uri(image_path: str | Path, max_side: int = 1024) -> str:
    """Return the image at ``image_path`` as a base64 JPEG ``data:`` URI.

    The image is converted to RGB and shrunk in place so that neither side
    exceeds ``max_side`` pixels before encoding, which keeps the API payload
    small.

    Args:
        image_path: Path of the image file to encode.
        max_side: Upper bound, in pixels, for the width and the height.

    Returns:
        A string of the form ``data:image/jpeg;base64,<payload>``.
    """
    from PIL import Image

    with Image.open(image_path) as source:
        rgb = source.convert("RGB")
        # thumbnail() preserves aspect ratio and modifies the image in place.
        rgb.thumbnail((max_side, max_side))
        jpeg_bytes = io.BytesIO()
        rgb.save(jpeg_bytes, format="JPEG", quality=85)
    payload = base64.b64encode(jpeg_bytes.getvalue()).decode()
    return "data:image/jpeg;base64," + payload
def _get_vlm_client(token: str | None):
    """Build a HuggingFace ``InferenceClient`` for the configured vision model.

    Args:
        token: API token to authenticate with; omitted from the client
            arguments when falsy.

    Returns:
        An ``InferenceClient`` bound to ``config.VLM_MODEL``, with the provider
        set only when ``config.VLM_PROVIDER`` is truthy.
    """
    from huggingface_hub import InferenceClient

    options = {"model": config.VLM_MODEL}
    # Only pass the optional settings that actually have a value, so the
    # client's own defaults apply otherwise.
    optional = {"token": token, "provider": config.VLM_PROVIDER}
    options.update({key: value for key, value in optional.items() if value})
    return InferenceClient(**options)
def _caption_via_api(image_path: str | Path, prompt: str, token: str | None) -> str:
    """Ask the API vision-chat model to caption one image.

    Args:
        image_path: Path of the frame to describe.
        prompt: Instruction text sent alongside the image.
        token: API token forwarded to the client factory.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the reply has no content. Errors from the client or from image encoding
        are not caught here.
    """
    client = _get_vlm_client(token)

    # One user turn carrying the instruction text plus the inlined image.
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": _data_uri(image_path)}}
    user_turn = {"role": "user", "content": [text_part, image_part]}

    response = client.chat_completion(
        messages=[user_turn],
        max_tokens=120,
        temperature=0.2,
    )
    reply = response.choices[0].message.content
    return reply.strip() if reply else ""
def _load_local_captioner() -> None:
    """Load the local BLIP captioner into the module-level cache.

    The processor and model are loaded directly (the image-to-text pipeline
    task was removed in transformers 5). The model is moved to the GPU when a
    CUDA build of torch is present; otherwise it stays on the CPU.

    Side effects:
        Sets ``_LOCAL_PROC``, ``_LOCAL_MODEL`` and ``_LOCAL_DEVICE``.

    Raises:
        Exception: Anything raised while importing transformers or loading the
            processor/model propagates to the caller, as does a failure to
            return the model to the CPU after an unsuccessful GPU move.
    """
    global _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE
    from transformers import AutoProcessor

    try:
        from transformers import AutoModelForImageTextToText as _AutoCaptionModel
    except Exception:  # older transformers
        from transformers import AutoModelForVision2Seq as _AutoCaptionModel

    proc = AutoProcessor.from_pretrained(config.LOCAL_CAPTION_MODEL)
    model = _AutoCaptionModel.from_pretrained(config.LOCAL_CAPTION_MODEL)

    device = "cpu"
    try:
        import torch

        if torch.cuda.is_available():
            model = model.to("cuda")
            # Record the device only after the move succeeded. Otherwise a
            # failed move would leave the recorded device out of sync with the
            # model and every later generate() call would fail.
            device = "cuda"
    except Exception:
        # GPU placement is best-effort: fall back to the CPU. Module.to() moves
        # parameters in place, so pull back anything that was already moved.
        device = "cpu"
        model = model.to("cpu")

    _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE = proc, model, device
def _caption_via_local(image_path: str | Path) -> str:
    """Caption one image with the local BLIP model.

    The model is loaded on first use. A failed load is remembered so it is not
    retried on later calls.

    Args:
        image_path: Path of the frame to describe.

    Returns:
        The decoded caption with surrounding whitespace removed, or ``""`` when
        the model could not be loaded or captioning this image raised.
    """
    global _LOCAL_FAILED

    if not _LOCAL_FAILED and _LOCAL_MODEL is None:
        try:
            _load_local_captioner()
        except Exception:
            _LOCAL_FAILED = True
    if _LOCAL_FAILED:
        return ""

    try:
        import torch
        from PIL import Image

        with Image.open(image_path) as raw:
            frame = raw.convert("RGB")
        batch = _LOCAL_PROC(images=frame, return_tensors="pt")
        if _LOCAL_DEVICE != "cpu":
            # Inputs must live on the same device as the model.
            batch = {name: tensor.to(_LOCAL_DEVICE) for name, tensor in batch.items()}
        with torch.no_grad():
            token_ids = _LOCAL_MODEL.generate(**batch, max_new_tokens=40)
        text = _LOCAL_PROC.decode(token_ids[0], skip_special_tokens=True)
        return text.strip()
    except Exception:
        # Captioning is best-effort; a bad frame must not abort the run.
        return ""
def caption_image(
    image_path: str | Path, *, token: str | None = None, context: str = ""
) -> str | None:
    """Return a one-line caption for a frame, or None if captioning is off/failed.

    With a ``token`` it tries an API vision-chat model first (if any provider
    serves one), then falls back to local BLIP. After the API VLM fails once it
    is skipped for the rest of the session to avoid repeated dead calls. Local
    BLIP needs no token.

    Args:
        image_path: Path of the frame to caption.
        token: HuggingFace API token; when falsy the API model is not tried.
        context: Optional description of the current step. Its first 200
            characters are appended to the prompt.

    Returns:
        The caption text, or ``None`` when vision is disabled in ``config``,
        the frame file does not exist, or no captioner produced any text.
    """
    global _API_VLM_DISABLED
    if not config.ENABLE_VISION:
        return None

    # A missing frame is a local problem, not a provider failure. Without this
    # guard the error would be caught below and switch the API model off for
    # the rest of the session. No captioner could read the file anyway, so the
    # result is None either way.
    if not Path(image_path).is_file():
        return None

    prompt = _CAPTION_PROMPT
    if context:
        prompt += f" For context, this step is about: {context[:200]}"

    if token and not _API_VLM_DISABLED:
        try:
            caption = _caption_via_api(image_path, prompt, token)
        except Exception:
            _API_VLM_DISABLED = True  # no usable provider — switch to local BLIP
        else:
            if caption:
                return caption

    # Either the API was skipped, failed, or returned nothing: try local BLIP.
    return _caption_via_local(image_path) or None
def frame_score(image_path: str | Path) -> float:
    """Score a frame's sharpness as the variance of its Laplacian.

    Higher values mean a crisper, more detailed frame.

    Args:
        image_path: Path of the frame to score.

    Returns:
        The variance as a float, or ``0.0`` when the image cannot be read or
        anything raises (including OpenCV being unavailable).
    """
    score = 0.0
    try:
        import cv2

        gray = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        # imread signals an unreadable file by returning None, not by raising.
        if gray is not None:
            edges = cv2.Laplacian(gray, cv2.CV_64F)
            score = float(edges.var())
    except Exception:
        score = 0.0
    return score