Update app.py

app.py CHANGED
@@ -15,15 +15,30 @@ def load_model():
     if tok is None or model is None:
         print("Loading model...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-
-
-
-
-
-
-        if
-        model =
-
+
+        # Determine device and dtype
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+        # Load model without device_map for CPU, or with proper device_map for CUDA
+        if torch.cuda.is_available():
+            model = AutoModelForCausalLM.from_pretrained(
+                MID,
+                torch_dtype=dtype,
+                device_map="auto",
+                trust_remote_code=True,
+            )
+        else:
+            # For CPU: load directly to CPU without device_map
+            model = AutoModelForCausalLM.from_pretrained(
+                MID,
+                torch_dtype=dtype,
+                trust_remote_code=True,
+            )
+            model = model.to(device)
+
+        model.eval()  # Set to evaluation mode
+        print(f"Model loaded successfully on {device}!")
     return tok, model

 @spaces.GPU(duration=60)

@@ -34,6 +49,7 @@ def caption_image(image, custom_prompt=None):
     try:
         # Load model if not already loaded
         tok, model = load_model()
+
         # Convert image to RGB if needed
         if image.mode != "RGB":
             image = image.convert("RGB")

@@ -58,16 +74,20 @@ def caption_image(image, custom_prompt=None):
         pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
         post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids

+        # Get model device and dtype
+        device = next(model.parameters()).device
+        dtype = next(model.parameters()).dtype
+
         # Insert IMAGE token id at placeholder position
         img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
-        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(
-        attention_mask = torch.ones_like(input_ids, device=
+        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(device)
+        attention_mask = torch.ones_like(input_ids, device=device)

         # Preprocess image using model's vision tower
         px = model.get_vision_tower().image_processor(
             images=image, return_tensors="pt"
         )["pixel_values"]
-        px = px.to(
+        px = px.to(device, dtype=dtype)

         # Generate caption
         with torch.no_grad():

@@ -92,7 +112,9 @@ def caption_image(image, custom_prompt=None):
         return response

     except Exception as e:
-
+        import traceback
+        error_detail = traceback.format_exc()
+        return f"Error generating caption: {str(e)}\n\nDetails:\n{error_detail}"

 # Create Gradio interface
 with gr.Blocks(title="Fal-2 Image Captioning") as demo:
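Taken together, the change keys model placement off torch.cuda.is_available() once at load time, then derives device and dtype back from the loaded weights at inference time, so input_ids, attention_mask, and the pixel values always match the model. A minimal standalone sketch of that pattern (the tiny model id below is an illustrative stand-in, not this Space's MID):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MID = "sshleifer/tiny-gpt2"  # illustrative stand-in model id, not the Space's model

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    tok = AutoTokenizer.from_pretrained(MID)
    if torch.cuda.is_available():
        # device_map="auto" lets accelerate place the weights on the GPU(s)
        model = AutoModelForCausalLM.from_pretrained(MID, torch_dtype=dtype, device_map="auto")
    else:
        # On CPU: load normally, then move explicitly
        model = AutoModelForCausalLM.from_pretrained(MID, torch_dtype=dtype).to(device)
    model.eval()

    # Inference side: read placement back from the weights so inputs always match
    ids = tok("hello", return_tensors="pt").input_ids.to(next(model.parameters()).device)
    with torch.no_grad():
        out = model.generate(ids, max_new_tokens=5)
    print(tok.decode(out[0]))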
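A quick local smoke test of the patched caption_image might look like this (run outside the Gradio UI; it assumes the spaces decorator degrades gracefully off-Spaces, and the gray image is just a placeholder input):

    from PIL import Image
    from app import caption_image  # the patched app.py above

    img = Image.new("RGB", (224, 224), color="gray")  # synthetic test image
    result = caption_image(img)
    # On success this is a caption; on failure, the traceback-bearing error string
    print(result)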