diff --git a/app.py b/app.py
index 0000000..1111111 100644
--- a/app.py
+++ b/app.py
@@ -1,30 +1,35 @@
 import gradio as gr
 import torch
 from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import spaces
 
 # Model configuration
 MID = "apple/FastVLM-0.5B"
 IMAGE_TOKEN_INDEX = -200
 
 # Load model and tokenizer (will be loaded on first GPU allocation)
 tok = None
 model = None
 
 def load_model():
     global tok, model
     if tok is None or model is None:
         print("Loading model...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MID,
-            torch_dtype=torch.float16,
-            device_map="cuda",
-            trust_remote_code=True,
-        )
+        # ---- Prefer CUDA when it is available; otherwise run on CPU ----
+        use_cuda = torch.cuda.is_available()
+        device_map = "cuda" if use_cuda else "cpu"
+        # float16 is great on GPU, but unsafe on CPU; use float32 on CPU
+        dtype = torch.float16 if use_cuda else torch.float32
+
+        model = AutoModelForCausalLM.from_pretrained(
+            MID,
+            torch_dtype=dtype,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
         print("Model loaded successfully!")
     return tok, model
-
-@spaces.GPU(duration=60)
+
+# Removed GPU decorator so CPU Spaces don't request a GPU
 def caption_image(image, custom_prompt=None):
@@ -66,21 +71,25 @@ def caption_image(image, custom_prompt=None):
     # Insert IMAGE token id at placeholder position
-    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
-    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
-    attention_mask = torch.ones_like(input_ids, device=model.device)
+    # Derive device/dtype from model parameters (robust on CPU or GPU)
+    model_device = next(model.parameters()).device
+    model_dtype = next(model.parameters()).dtype
+
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
+    input_ids = torch.cat([pre_ids.to(model_device), img_tok, post_ids.to(model_device)], dim=1)
+    attention_mask = torch.ones_like(input_ids, device=model_device)
 
     # Preprocess image using model's vision tower
     px = model.get_vision_tower().image_processor(
         images=image, return_tensors="pt"
     )["pixel_values"]
-    px = px.to(model.device, dtype=model.dtype)
+    px = px.to(model_device, dtype=model_dtype)
 
     # Generate caption
     with torch.no_grad():
         out = model.generate(
             inputs=input_ids,
             attention_mask=attention_mask,
             images=px,
             max_new_tokens=128,
             do_sample=False,  # Deterministic generation
-            temperature=1.0,
+            # temperature is ignored when do_sample=False
         )