FastVLM_Caption_Generator

Sleeping

App Files Files Community

FastVLM_Caption_Generator / app.py

adityaardak

Update app.py

3bd8cbe verified 5 months ago

raw

history blame

2.91 kB

	diff --git a/app.py b/app.py
	index 0000000..1111111 100644
	--- a/app.py
	+++ b/app.py
	@@ -1,16 +1,28 @@
	import gradio as gr
	import torch
	from PIL import Image
	from transformers import AutoTokenizer, AutoModelForCausalLM
	-import spaces

	# Model configuration
	MID = "apple/FastVLM-0.5B"
	IMAGE_TOKEN_INDEX = -200

	-# Load model and tokenizer (will be loaded on first GPU allocation)
	+# Load model and tokenizer lazily (populated by the first load_model() call)
	tok = None
	model = None
	def load_model():
	global tok, model
	if tok is None or model is None:
	print("Loading model...")
	tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
	- model = AutoModelForCausalLM.from_pretrained(
	- MID,
	- torch_dtype=torch.float16,
	- device_map="cuda",
	- trust_remote_code=True,
	- )
	+ # ---- Prefer CUDA when it is available; otherwise fall back to CPU ----
	+ use_cuda = torch.cuda.is_available()
	+ device_map = "cuda" if use_cuda else "cpu"
	+ # float16 works well on GPU, but half precision can be slow or unsupported for some CPU ops; use float32 on CPU
	+ dtype = torch.float16 if use_cuda else torch.float32
	+
	+ model = AutoModelForCausalLM.from_pretrained(
	+ MID,
	+ torch_dtype=dtype,
	+ device_map=device_map,
	+ trust_remote_code=True,
	+ )
	print("Model loaded successfully!")
	return tok, model
	-
	-@spaces.GPU(duration=60)
	+
	+# Removed GPU decorator so CPU Spaces don't request a GPU
	def caption_image(image, custom_prompt=None):
	@@ -66,16 +78,23 @@ def caption_image(image, custom_prompt=None):
	# Insert IMAGE token id at placeholder position
	- img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
	- input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
	- attention_mask = torch.ones_like(input_ids, device=model.device)
	+ # Derive device/dtype from model parameters (robust on CPU or GPU)
	+ model_device = next(model.parameters()).device
	+ model_dtype = next(model.parameters()).dtype
	+
	+ img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
	+ input_ids = torch.cat([pre_ids.to(model_device), img_tok, post_ids.to(model_device)], dim=1)
	+ attention_mask = torch.ones_like(input_ids, device=model_device)

	# Preprocess image using model's vision tower
	px = model.get_vision_tower().image_processor(
	images=image, return_tensors="pt"
	)["pixel_values"]
	- px = px.to(model.device, dtype=model.dtype)
	+ px = px.to(model_device, dtype=model_dtype)

	# Generate caption
	with torch.no_grad():
	out = model.generate(
	inputs=input_ids,
	attention_mask=attention_mask,
	images=px,
	max_new_tokens=128,
	do_sample=False, # Deterministic generation
	- temperature=1.0,
	+ # temperature is ignored when do_sample=False
	)