"""VocalEyes API: describe an uploaded image with the BLIP captioning model.

The module loads the BLIP processor and model once at import time and exposes
a small FastAPI application with a health-check route (``GET /``) and a
captioning route (``POST /predict``).
"""

import io
import logging

import torch
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

logger = logging.getLogger(__name__)

# ── Load BLIP Model ────────────────────────────────────────────────────────
MODEL_NAME = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME)

# Content types accepted by /predict.
ALLOWED_CONTENT_TYPES = ("image/jpeg", "image/png", "image/webp", "image/bmp")


def image_to_speech(image: Image.Image) -> str:
    """Return a text caption for *image* generated by the BLIP model.

    Despite its name, this function only produces text; no audio is
    synthesised here.

    Args:
        image: The PIL image to describe.

    Returns:
        The decoded caption with special tokens stripped.
    """
    # The processor handles the conversion from PIL Image to PyTorch tensors.
    inputs = processor(image, return_tensors="pt")
    out = model.generate(
        **inputs,
        max_length=90,           # Allow longer, more detailed sentences
        num_beams=8,             # Use beam search to improve quality
        repetition_penalty=1.2,  # Prevent repeating words
        length_penalty=1.0,      # Balanced caption length
        early_stopping=True,
    )
    return processor.decode(out[0], skip_special_tokens=True)


# ── FastAPI app ────────────────────────────────────────────────────────────
app = FastAPI(
    title="VocalEyes API",
    description="Converts an uploaded image into a short scene description via BLIP.",
    version="1.0.0",
)


@app.get("/")
def root():
    """Health check: report that the service is up and how to use it."""
    return {
        "status": "ok",
        "message": "VocalEyes API is running. POST an image to /predict",
    }


@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image.

    Args:
        file: The uploaded image; its declared content type must be one of
            ``ALLOWED_CONTENT_TYPES``.

    Returns:
        A JSON response of the form ``{"description": <caption>}``.

    Raises:
        HTTPException: 415 if the content type is not accepted, 400 if the
            upload cannot be read or decoded as an image, 500 if caption
            generation fails.
    """
    # ── Validate content type ──────────────────────────────────────────────
    if file.content_type not in ALLOWED_CONTENT_TYPES:
        raise HTTPException(
            status_code=415,
            detail=(
                f"Unsupported file type '{file.content_type}'. "
                "Send JPEG, PNG, WEBP, or BMP."
            ),
        )

    # ── Read & preprocess ──────────────────────────────────────────────────
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(
            status_code=400, detail=f"Could not read image: {e}"
        ) from e

    # ── Run pipeline ───────────────────────────────────────────────────────
    try:
        # Caption generation is synchronous and compute-heavy; calling it
        # directly inside this coroutine would stall the event loop (and
        # therefore every other request) until it finishes, so it is run in
        # a worker thread instead.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        logger.exception("Caption generation failed")
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description
    return JSONResponse({"description": caption})


# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Pass the app object itself rather than the "app:app" import string.
    # With the string form uvicorn imports this file a second time (as module
    # "app"), which loads the BLIP model twice and only works when the file
    # is named app.py.
    uvicorn.run(app, host="0.0.0.0", port=7860)