Spaces:

Rishabh2234
/

image-captionator

Sleeping

Rishabh2234 committed on Mar 2, 2025

Commit

f90ca7e

1 Parent(s): 87cea5c

files for inference generation

Files changed (2) hide show

app.py CHANGED Viewed

@@ -4,14 +4,12 @@ from PIL import Image
 import torchvision.transforms as transforms
 import torch
 from model import load_model
-import os
-# Initialize FastAPI
 app = FastAPI()
-# Set device and checkpoint path (use a relative path so it stays within your Space's storage)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-checkpoint_path = "checkpoint.pth"
 # Load the model and tokenizer
 model, tokenizer = load_model(checkpoint_path, device)
@@ -30,18 +28,11 @@ def read_root():
 @app.post("/generate_caption/")
 async def generate_caption(file: UploadFile = File(...)):
     try:
-        # Read image file from the request
         contents = await file.read()
         image = Image.open(io.BytesIO(contents)).convert("RGB")
-        # Preprocess the image
         image_tensor = transform(image).unsqueeze(0).to(device)
-        # Generate caption using your model's generate method
         output_ids = model.generate(pixel_values=image_tensor, max_length=30, num_beams=4)
         caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         return {"caption": caption}
     except Exception as e:
         return {"error": str(e)}

 import torchvision.transforms as transforms
 import torch
 from model import load_model
 app = FastAPI()
+# Set device and use a writable checkpoint path (e.g., /tmp)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+checkpoint_path = "/tmp/checkpoint.pth"  # Updated path
 # Load the model and tokenizer
 model, tokenizer = load_model(checkpoint_path, device)
 @app.post("/generate_caption/")
 async def generate_caption(file: UploadFile = File(...)):
     """Generate a caption for an uploaded image.

     Reads the uploaded file's bytes, opens them as an RGB image, applies the
     ``transform`` defined elsewhere in this file, and passes the result to
     ``model.generate`` with ``max_length=30`` and ``num_beams=4``. The first
     returned sequence is decoded with ``tokenizer`` (special tokens skipped).

     Args:
         file: The uploaded image file from the multipart request.

     Returns:
         ``{"caption": <str>}`` on success, or ``{"error": <message>}`` if
         any step raises an exception.
     """
     try:
         contents = await file.read()
         image = Image.open(io.BytesIO(contents)).convert("RGB")
         # unsqueeze(0) adds a leading dimension (presumably the batch axis
         # expected by the model -- confirm against model.py).
         image_tensor = transform(image).unsqueeze(0).to(device)
         # Inference only: disable autograd so no gradient graph is recorded
         # while generating, which avoids needless memory use per request.
         with torch.no_grad():
             output_ids = model.generate(pixel_values=image_tensor, max_length=30, num_beams=4)
         caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         return {"caption": caption}
     except Exception as e:
         # Failures are reported in the response body instead of being raised,
         # so callers must check for the "error" key.
         return {"error": str(e)}

model.py CHANGED Viewed

@@ -55,7 +55,6 @@ class ViTT5(nn.Module):
             temperature=0.9,         # More diverse outputs
             **kwargs
         )
 def download_checkpoint(checkpoint_path):
     """
     Downloads the checkpoint from Hugging Face Model Hub if not found locally.
@@ -72,6 +71,7 @@ def download_checkpoint(checkpoint_path):
     else:
         raise RuntimeError(f"Error downloading model, status code: {response.status_code}")
 def load_model(checkpoint_path, device):
     """
     Loads the ViTT5 model along with the T5 tokenizer.

             temperature=0.9,         # More diverse outputs
             **kwargs
         )
 def download_checkpoint(checkpoint_path):
     """
     Downloads the checkpoint from Hugging Face Model Hub if not found locally.
     else:
         raise RuntimeError(f"Error downloading model, status code: {response.status_code}")
 def load_model(checkpoint_path, device):
     """
     Loads the ViTT5 model along with the T5 tokenizer.