Spaces:

vivekchakraverty
/

DocuMaker

Sleeping

App Files Files Community

DocuMaker / src /vision.py

vivekchakraverty

DocuMaker: video to step-by-step DOCX guide (Whisper + HF LLM + BLIP)

85b485a 13 days ago

Raw

History Blame Contribute Delete

5.5 kB

	"""Multimodal pass: caption frames and score them for "informativeness".

	Captioning prefers a vision LLM on the HuggingFace Inference API and falls back
	to a local BLIP model (only if torch/transformers are installed). Frame scoring
	uses a cheap sharpness heuristic (variance of the Laplacian) so the guide builder
	can prefer crisp, content-rich frames over blurry scene-transition frames.
	"""
	from __future__ import annotations

	import base64
	import io
	from pathlib import Path

	from . import config

	# Lazily-initialised state for the local captioner. `_load_local_captioner`
	# assigns the processor, the model and the device string in one step.
	_LOCAL_PROC = None
	_LOCAL_MODEL = None
	_LOCAL_DEVICE = "cpu"
	# Set to True by `_caption_via_local` when loading the local model raised, so
	# later calls return '' immediately instead of retrying the load.
	_LOCAL_FAILED = False
	# Many free HF accounts have no provider that serves a vision-chat model. Once
	# the API VLM fails, stop retrying it for the session and use local BLIP.
	_API_VLM_DISABLED = False

	# Base instruction sent to the API vision-chat model; `caption_image` appends
	# an optional context sentence to it.
	_CAPTION_PROMPT = (
	"In one concise sentence, describe what this screenshot from a tutorial shows, "
	"focusing on the on-screen UI element or the action being performed. "
	"Do not begin with phrases like 'The image shows'."
	)


	def _data_uri(image_path: str | Path, max_side: int = 1024) -> str:
	    """Return the image as a base64 JPEG ``data:`` URI.

	    The image is converted to RGB and shrunk in place so that neither side
	    exceeds ``max_side`` before it is JPEG-encoded (quality 85), which keeps
	    the payload sent to the API small.
	    """
	    from PIL import Image

	    jpeg_buffer = io.BytesIO()
	    with Image.open(image_path) as source:
	        rgb = source.convert("RGB")
	        # thumbnail() only ever shrinks and preserves the aspect ratio.
	        rgb.thumbnail((max_side, max_side))
	        rgb.save(jpeg_buffer, format="JPEG", quality=85)
	    encoded = base64.b64encode(jpeg_buffer.getvalue()).decode()
	    return "data:image/jpeg;base64," + encoded


	def _get_vlm_client(token: str | None):
	    """Build an ``InferenceClient`` for the configured vision-chat model.

	    ``token`` and ``config.VLM_PROVIDER`` are forwarded only when they are
	    truthy; otherwise the client's own defaults apply.
	    """
	    from huggingface_hub import InferenceClient

	    client_kwargs = {"model": config.VLM_MODEL}
	    optional = (("token", token), ("provider", config.VLM_PROVIDER))
	    client_kwargs.update({key: value for key, value in optional if value})
	    return InferenceClient(**client_kwargs)


	def _caption_via_api(image_path: str | Path, prompt: str, token: str | None) -> str:
	    """Caption one frame through the chat-completion API.

	    Sends a single user message holding the text prompt and the frame as an
	    inline data URI. Returns the first choice's text, stripped, or '' when the
	    reply has no content. Errors are not caught here; they propagate to the
	    caller.
	    """
	    client = _get_vlm_client(token)

	    text_part = {"type": "text", "text": prompt}
	    image_part = {"type": "image_url", "image_url": {"url": _data_uri(image_path)}}
	    user_turn = {"role": "user", "content": [text_part, image_part]}

	    response = client.chat_completion(
	        messages=[user_turn],
	        max_tokens=120,
	        temperature=0.2,
	    )
	    reply = response.choices[0].message.content
	    return reply.strip() if reply else ""


	def _load_local_captioner() -> None:
	    """Load the BLIP captioner directly (the image-to-text pipeline task was
	    removed in transformers 5). Uses the GPU if a CUDA build of torch is present.

	    On success the module globals ``_LOCAL_PROC``, ``_LOCAL_MODEL`` and
	    ``_LOCAL_DEVICE`` are set together. ``_LOCAL_DEVICE`` is only ever
	    ``"cuda"`` when the model was actually moved there.

	    Raises:
	        Exception: anything raised while importing transformers or loading
	            the processor/model; the caller treats that as "local captioner
	            unavailable".
	    """
	    global _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE
	    from transformers import AutoProcessor

	    try:
	        from transformers import AutoModelForImageTextToText as _AutoCaptionModel
	    except ImportError:  # older transformers
	        from transformers import AutoModelForVision2Seq as _AutoCaptionModel

	    proc = AutoProcessor.from_pretrained(config.LOCAL_CAPTION_MODEL)
	    model = _AutoCaptionModel.from_pretrained(config.LOCAL_CAPTION_MODEL)

	    device = "cpu"
	    try:
	        import torch

	        if torch.cuda.is_available():
	            model = model.to("cuda")
	            # Record the device only after the move succeeded, so inputs are
	            # never sent to a device the model is not on.
	            device = "cuda"
	    except Exception:
	        # GPU placement is best-effort. A failed move can leave weights split
	        # across devices, so pull everything back to the CPU.
	        model = model.to("cpu")
	        device = "cpu"

	    _LOCAL_PROC, _LOCAL_MODEL, _LOCAL_DEVICE = proc, model, device


	def _caption_via_local(image_path: str | Path) -> str:
	    """Caption a frame with the local model; '' means "no caption".

	    The model is loaded on first use. If that load raises, the failure is
	    remembered in ``_LOCAL_FAILED`` and every later call returns '' straight
	    away. Any error during inference also yields ''.
	    """
	    global _LOCAL_FAILED

	    needs_load = not _LOCAL_FAILED and _LOCAL_MODEL is None
	    if needs_load:
	        try:
	            _load_local_captioner()
	        except Exception:
	            _LOCAL_FAILED = True
	    if _LOCAL_FAILED:
	        return ""

	    try:
	        import torch
	        from PIL import Image

	        with Image.open(image_path) as frame:
	            rgb = frame.convert("RGB")
	        batch = _LOCAL_PROC(images=rgb, return_tensors="pt")
	        if _LOCAL_DEVICE != "cpu":
	            # Inputs must live on the same device as the model.
	            batch = {name: tensor.to(_LOCAL_DEVICE) for name, tensor in batch.items()}
	        with torch.no_grad():
	            generated = _LOCAL_MODEL.generate(**batch, max_new_tokens=40)
	        text = _LOCAL_PROC.decode(generated[0], skip_special_tokens=True)
	        return text.strip()
	    except Exception:
	        return ""


	def caption_image(
	    image_path: str | Path, *, token: str | None = None, context: str = ""
	) -> str | None:
	    """Return a one-line caption for a frame, or None if captioning is off/failed.

	    With a ``token`` it tries an API vision-chat model first (if any provider
	    serves one), then falls back to local BLIP. After the API VLM fails once it
	    is skipped for the rest of the session to avoid repeated dead calls. Local
	    BLIP needs no token.

	    Args:
	        image_path: Path of the frame to caption.
	        token: HuggingFace token; when falsy the API model is not tried.
	        context: Optional text about the step; its first 200 characters are
	            appended to the prompt sent to the API model.

	    Returns:
	        The caption, or None when vision is disabled, the frame file does not
	        exist, or neither captioner produced any text.
	    """
	    global _API_VLM_DISABLED
	    if not config.ENABLE_VISION:
	        return None
	    # A missing frame cannot be captioned by either backend. Bail out before
	    # the API attempt so one bad path does not disable the API VLM for every
	    # later frame.
	    if not Path(image_path).is_file():
	        return None

	    prompt = _CAPTION_PROMPT
	    if context:
	        prompt += f" For context, this step is about: {context[:200]}"

	    if token and not _API_VLM_DISABLED:
	        try:
	            caption = _caption_via_api(image_path, prompt, token)
	        except Exception:
	            _API_VLM_DISABLED = True  # no usable provider — switch to local BLIP
	        else:
	            if caption:
	                return caption

	    # Either no token, API disabled/failed, or the API returned an empty caption.
	    return _caption_via_local(image_path) or None


	def frame_score(image_path: str | Path) -> float:
	    """Sharpness score (variance of Laplacian). Higher = crisper/more detailed.

	    The file is read with NumPy and decoded with ``cv2.imdecode`` rather than
	    ``cv2.imread``, because ``imread`` cannot open paths containing non-ASCII
	    characters on Windows and would score such frames as 0.0.

	    Returns:
	        The variance of the Laplacian of the grayscale frame, or 0.0 when
	        OpenCV is unavailable or the file cannot be read or decoded.
	    """
	    try:
	        import cv2
	        import numpy as np

	        raw = np.fromfile(str(image_path), dtype=np.uint8)
	        img = cv2.imdecode(raw, cv2.IMREAD_GRAYSCALE)
	        if img is None:
	            return 0.0
	        return float(cv2.Laplacian(img, cv2.CV_64F).var())
	    except Exception:
	        # Scoring is a best-effort heuristic; never let it break the pipeline.
	        return 0.0