diff --git a/app.py b/app.py
index 0000000..1111111 100644
--- a/app.py
+++ b/app.py
@@ -1,16 +1,28 @@
 import gradio as gr
 import torch
 from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import spaces

 # Model configuration
 MID = "apple/FastVLM-0.5B"
 IMAGE_TOKEN_INDEX = -200

 # Load model and tokenizer (will be loaded on first GPU allocation)
 tok = None
 model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
         print("Loading model...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MID,
-            torch_dtype=torch.float16,
-            device_map="cuda",
-            trust_remote_code=True,
-        )
+        # ---- Use CUDA when it is available; otherwise fall back to CPU ----
+        use_cuda = torch.cuda.is_available()
+        device_map = "cuda" if use_cuda else "cpu"
+        # float16 on CUDA, float32 on CPU (NOTE(review): presumably because CPU half-precision support is limited - confirm)
+        dtype = torch.float16 if use_cuda else torch.float32
+
+        model = AutoModelForCausalLM.from_pretrained(
+            MID,
+            torch_dtype=dtype,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
         print("Model loaded successfully!")
     return tok, model
-
-@spaces.GPU(duration=60)
+
+# Removed GPU decorator so CPU Spaces don't request a GPU
 def caption_image(image, custom_prompt=None):
@@ -66,16 +78,23 @@ def caption_image(image, custom_prompt=None):
     # Insert IMAGE token id at placeholder position
-    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
-    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
-    attention_mask = torch.ones_like(input_ids, device=model.device)
+    # Derive device/dtype from model parameters (robust on CPU or GPU)
+    model_device = next(model.parameters()).device
+    model_dtype = next(model.parameters()).dtype
+
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
+    input_ids = torch.cat([pre_ids.to(model_device), img_tok, post_ids.to(model_device)], dim=1)
+    attention_mask = torch.ones_like(input_ids, device=model_device)

     # Preprocess image using model's vision tower
     px = model.get_vision_tower().image_processor(
         images=image, return_tensors="pt"
     )["pixel_values"]
-    px = px.to(model.device, dtype=model.dtype)
+    px = px.to(model_device, dtype=model_dtype)

     # Generate caption
     with torch.no_grad():
         out = model.generate(
             inputs=input_ids,
             attention_mask=attention_mask,
             images=px,
             max_new_tokens=128,
             do_sample=False,  # Deterministic generation
-            temperature=1.0,
+            # temperature is ignored when do_sample=False
         )