import os
import inspect
import dataclasses
from typing import Optional, Dict, Any, Tuple
import torch
import torch.nn.functional as F
import gradio as gr
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    AutoModelForCausalLM,
)
from peft import PeftModel, PeftConfig
# -----------------------------
# CPU-only enforcement
# -----------------------------
FORCE_CPU = True
DEVICE = torch.device("cpu")
# -----------------------------
# Resolution gate
# -----------------------------
RESOLUTION_MAP = {0: 384, 1: 768, 2: 1024}
def load_and_resize_image(img: Image.Image, max_size: Optional[int] = None) -> Image.Image:
    img = img.convert("RGB")
    if max_size is None:
        return img
    w, h = img.size
    if max(w, h) <= max_size:
        return img
    s = max_size / max(w, h)
    return img.resize((round(w * s), round(h * s)), Image.BICUBIC)
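# Example: a 3000×2000 input with max_size=1024 is scaled by 1024/3000 ≈ 0.341,
# i.e. resized to 1024×683; images already at or below max_size pass through unchanged.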
def token_id_for_digit(tokenizer, digit: str) -> int:
    """Return the token id of a single digit ("0", "1", "2") used as a class label."""
    ids = tokenizer.encode(digit, add_special_tokens=False)
    if not ids:
        raise ValueError(f"Could not encode digit {digit!r}")
    # Some tokenizers emit a prefix token (e.g. a leading space) before the digit;
    # the last id is the one corresponding to the digit itself.
    return ids[-1]
class GraniteDoclingGateHF:
    def __init__(self, adapter_repo: str, token: Optional[str] = None):
        self.device = DEVICE
        peft_cfg = PeftConfig.from_pretrained(adapter_repo, token=token)
        base_model_name = peft_cfg.base_model_name_or_path
        self.processor = AutoProcessor.from_pretrained(adapter_repo, token=token)
        # CPU: use float32 for safety (bfloat16/float16 often slower or problematic on CPU)
        torch_dtype = torch.float32
        base_model = AutoModelForVision2Seq.from_pretrained(
            base_model_name, torch_dtype=torch_dtype
        )
        self.model = PeftModel.from_pretrained(base_model, adapter_repo, token=token)
        self.model.to(self.device).eval()
        tok = self.processor.tokenizer
        self.class_token_ids = [
            token_id_for_digit(tok, "0"),
            token_id_for_digit(tok, "1"),
            token_id_for_digit(tok, "2"),
        ]
    @torch.no_grad()
    def predict_probs(self, image: Image.Image, question: str):
        messages = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=[prompt], images=[image], return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items() if hasattr(v, "to")}
        outputs = self.model(**inputs)
        next_token_logits = outputs.logits[:, -1, :]
        class_logits = next_token_logits[:, self.class_token_ids]
        probs = F.softmax(class_logits, dim=-1)[0].detach().float().cpu().tolist()
        return probs
    def predict_expected(self, image: Image.Image, question: str) -> float:
        probs = self.predict_probs(image, question)
        return float(sum(RESOLUTION_MAP[i] * probs[i] for i in range(3)))
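# Illustrative standalone use of the gate (a sketch; assumes the LoRA adapter repo
# configured below via GATE_ADAPTER_REPO is reachable and "page.png" is a local file):
#
#   gate = GraniteDoclingGateHF("Kimhi/granite-docling-res-gate-lora")
#   img = load_and_resize_image(Image.open("page.png"), 256)            # gate sees a small copy
#   probs = gate.predict_probs(img, "What is the document title?")      # [p_384, p_768, p_1024]
#   max_side = gate.predict_expected(img, "What is the document title?")  # expected max side, px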
# -----------------------------
# CPU-friendly downstream HF VLM inference
# -----------------------------
# IMPORTANT: Choose models that can run on CPU.
# Many VLMs will be too slow/heavy on CPU; start small.
DOWNSTREAM_MODELS = {
    "ibm-granite/granite-vision-3.3-2b (recommended CPU)": "ibm-granite/granite-vision-3.3-2b",
    "HuggingFaceTB/SmolVLM-256M-Instruct (tiny CPU)": "HuggingFaceTB/SmolVLM-256M-Instruct",
    "google/paligemma-3b-mix-224 (CPU)": "google/paligemma-3b-mix-224",
    # Your list (kept available but not recommended on CPU)
    "Qwen/Qwen2.5-VL-3B-Instruct (slow CPU)": "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct (very slow CPU)": "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct (not for CPU)": "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen3-VL-8B-Instruct (very slow CPU)": "Qwen/Qwen3-VL-8B-Instruct",
    "OpenGVLab/InternVL3_5-8B (very slow CPU)": "OpenGVLab/InternVL3_5-8B",
    "OpenGVLab/InternVL3_5-38B (not for CPU)": "OpenGVLab/InternVL3_5-38B",
    "OpenGVLab/InternVL3_5-241B-A28B (not for CPU)": "OpenGVLab/InternVL3_5-241B-A28B",
    "None (gate only)": None,
}
_model_cache: Dict[str, Tuple[Any, Any]] = {}
torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "4")))
def get_vlm(model_id: str):
    if model_id in _model_cache:
        return _model_cache[model_id]
    proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    common_kwargs = dict(
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )
    # low_cpu_mem_usage exists on many HF models; use it if supported
    try:
        sig = inspect.signature(AutoModelForVision2Seq.from_pretrained)
        if "low_cpu_mem_usage" in sig.parameters:
            common_kwargs["low_cpu_mem_usage"] = True
    except Exception:
        pass
    model = None
    err = None
    # Try Vision2Seq first
    try:
        model = AutoModelForVision2Seq.from_pretrained(model_id, **common_kwargs)
    except Exception as e:
        err = e
        # Fallback: CausalLM
        model = AutoModelForCausalLM.from_pretrained(model_id, **common_kwargs)
    model.to(DEVICE).eval()
    _model_cache[model_id] = (proc, model)
    return proc, model
@torch.no_grad()
def vlm_answer(model_id: str, image: Image.Image, question: str, max_new_tokens: int = 96) -> str:
    proc, model = get_vlm(model_id)
    # ---- Path A: model.chat (InternVL-style, some others) ----
    if hasattr(model, "chat") and callable(getattr(model, "chat")):
        try:
            # Different repos have different signatures; this is the most common pattern.
            # If it fails, we fall back to processor + generate.
            return str(model.chat(proc, image, question)).strip()
        except Exception:
            pass
    # ---- Path B: processor + generate ----
    conversation = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": question}],
    }]
    inputs = None
    if hasattr(proc, "apply_chat_template"):
        # Try the Granite-style "tokenize=True" path
        try:
            inputs = proc.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                images=image,  # supported by some processors
            )
        except Exception:
            # Fallback: build a prompt string, then call the processor
            try:
                prompt = proc.apply_chat_template(conversation, add_generation_prompt=True)
                inputs = proc(text=[prompt], images=[image], return_tensors="pt")
            except Exception:
                inputs = None
    if inputs is None:
        # Final fallback: no chat template
        inputs = proc(text=[question], images=[image], return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items() if hasattr(v, "to")}
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    text = proc.batch_decode(out, skip_special_tokens=True)[0].strip()
    # Prompt-echo cleanup (best effort)
    if question in text and len(text) > 2 * len(question):
        text = text.split(question, 1)[-1].strip()
    return text
def cpu_model_allowed(model_id: str) -> Tuple[bool, str]:
    mid = (model_id or "").lower()
    blocked = ["72b", "38b", "241b", "a28b"]
    if any(b in mid for b in blocked):
        return False, "Too large for CPU Space (will OOM)."
    return True, ""
# -----------------------------
# Resolution selection strategy
# -----------------------------
def choose_resolution(expected: float, probs: list, strategy: str) -> int:
    if strategy == "expected":
        return int(round(expected))
    if strategy == "argmax":
        k = int(max(range(len(probs)), key=lambda i: probs[i]))
        return int(RESOLUTION_MAP[k])
    # conservative: choose the highest bucket if it has meaningful mass, else the next, else the lowest
    if probs[2] >= 0.34:
        return int(RESOLUTION_MAP[2])
    if probs[1] >= 0.34:
        return int(RESOLUTION_MAP[1])
    return int(RESOLUTION_MAP[0])
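# Worked example (illustrative probabilities, not model output):
#   probs = [0.2, 0.5, 0.3] -> expected = 0.2*384 + 0.5*768 + 0.3*1024 = 768.0
#   "expected"     -> 768  (rounded expectation)
#   "argmax"       -> 768  (bucket 1 carries the most mass)
#   "conservative" -> 768  (probs[2] = 0.3 < 0.34, probs[1] = 0.5 >= 0.34)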
# -----------------------------
# Gradio app
# -----------------------------
GATE_ADAPTER_REPO = os.getenv("GATE_ADAPTER_REPO", "Kimhi/granite-docling-res-gate-lora")
HF_TOKEN = os.getenv("HF_TOKEN", None)
GATE_INPUT_MAX_SIDE = int(os.getenv("GATE_INPUT_MAX_SIDE", "256"))
gate = None
def run(image: Image.Image, question: str, vlm_choice: str, strategy: str):
    global gate
    if gate is None:
        gate = GraniteDoclingGateHF(adapter_repo=GATE_ADAPTER_REPO, token=HF_TOKEN)
    if image is None or not question:
        return "Upload an image and enter a question.", None, None
    native_w, native_h = image.size
    # The gate runs on a small copy of the image
    gate_img = load_and_resize_image(image, GATE_INPUT_MAX_SIDE)
    probs = gate.predict_probs(gate_img, question)
    expected = float(sum(RESOLUTION_MAP[i] * probs[i] for i in range(3)))
    pred = choose_resolution(expected, probs, strategy)
    # Never upscale above the native max side
    native_max = max(native_w, native_h)
    used_max = min(pred, native_max)
    resized = load_and_resize_image(image, used_max)
    resized_w, resized_h = resized.size
    model_id = DOWNSTREAM_MODELS.get(vlm_choice)
    if model_id is None:
        answer = "(gate only) No VLM selected."
    else:
        ok, reason = cpu_model_allowed(model_id)
        if not ok:
            answer = f"Blocked on CPU: {reason}"
        else:
            try:
                answer = vlm_answer(model_id, resized, question)
            except Exception as e:
                answer = f"VLM error: {type(e).__name__}: {e}"
if strategy != "expected":
info = (
f"Native: {native_w}×{native_h}\n"
#f"Gate probs [384,768,1024]: {['%.3f'%p for p in probs]}\n"
f"Sufficient max-side: {expected:.1f}\n"
f"Strategy: {strategy}\n"
f"Predicted sufficient max-side: {pred}\n"
f"Used max-side (clamped to native): {used_max}\n"
f"Resized sent to VLM: {resized_w}×{resized_h}\n"
f"VLM: {vlm_choice}\n"
)
else:
info = (
f"Native: {native_w}×{native_h}\n"
#f"Gate probs [384,768,1024]: {['%.3f'%p for p in probs]}\n"
f"Sufficient max-side: {expected:.1f}\n"
#f"Strategy: {strategy}\n"
#f"Predicted sufficient max-side: {pred}\n"
#f"Used max-side (clamped to native): {used_max}\n"
#f"Resized sent to VLM: {resized_w}×{resized_h}\n"
f"VLM: {vlm_choice}\n")
return info, resized, answer
with gr.Blocks() as demo:
    gr.Markdown("# CARES – Sufficient Resolution Selection for VLMs")
    with gr.Row():
        inp_img = gr.Image(type="pil", label="Upload image")
        with gr.Column():
            inp_q = gr.Textbox(label="Question", placeholder="Ask something about the image…")
            vlm = gr.Dropdown(
                choices=list(DOWNSTREAM_MODELS.keys()),
                value=list(DOWNSTREAM_MODELS.keys())[0],
                label="VLM",
            )
            strategy = gr.Dropdown(
                choices=["expected", "argmax", "conservative"],
                value="expected",
                label="Resolution selection strategy",
            )
            btn = gr.Button("Run")
    out_info = gr.Textbox(label="Info", lines=10)
    out_img = gr.Image(type="pil", label="Image used for inference (sufficient resolution)")
    out_ans = gr.Textbox(label="Answer", lines=6)
    btn.click(run, inputs=[inp_img, inp_q, vlm, strategy], outputs=[out_info, out_img, out_ans])
demo.launch(ssr_mode=False)