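# NOTE(review): the next three lines look like page-header residue picked up
# when this file was copied; kept verbatim as comments so the module parses.
#   ABDRauf's picture
#   fixed errors
#   67434ff verified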
from transformers import BlipProcessor, BlipForConditionalGeneration
import io
from PIL import Image
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import uvicorn
import torch
# ── Load BLIP Model ────────────────────────────────────────────────────────
# The processor and the captioning model come from the same checkpoint and
# are created once, when this module is imported.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
def image_to_speech(image: Image.Image) -> str:
    """Return a BLIP-generated text caption for *image*.

    Despite the name, no audio is produced here: the image is preprocessed
    with the module-level ``processor``, a caption is generated with the
    module-level ``model`` using beam search, and the first returned
    sequence is decoded to plain text.

    Args:
        image: The PIL image to caption.

    Returns:
        The decoded caption with special tokens removed.
    """
    # Decoding settings, gathered in one place.
    generation_options = {
        "max_length": 90,           # allow longer, more detailed sentences
        "num_beams": 8,             # beam search to improve quality
        "repetition_penalty": 1.2,  # discourage repeated words
        "length_penalty": 1.0,      # balanced caption length
        "early_stopping": True,
    }

    # The processor turns the PIL image into PyTorch tensors.
    model_inputs = processor(image, return_tensors="pt")
    generated = model.generate(**model_inputs, **generation_options)
    return processor.decode(generated[0], skip_special_tokens=True)
# ── FastAPI app ────────────────────────────────────────────────────────────
# Metadata handed to the FastAPI constructor.
_API_INFO = {
    "title": "VocalEyes API",
    "description": "Converts an uploaded image into a short scene description via BLIP.",
    "version": "1.0.0",
}
app = FastAPI(**_API_INFO)
@app.get("/")
def root():
    """Report that the service is up and point clients at ``/predict``."""
    status_payload = {
        "status": "ok",
        "message": "VocalEyes API is running. POST an image to /predict",
    }
    return status_payload
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image and return the text as JSON.

    Args:
        file: The uploaded image. Its declared content type must be JPEG,
            PNG, WEBP, or BMP.

    Returns:
        A ``JSONResponse`` of the form ``{"description": <caption>}``.

    Raises:
        HTTPException: 415 when the declared content type is not accepted,
            400 when the upload cannot be read or opened as an image, and
            500 when captioning fails.
    """
    # Imported here so the handler stays self-contained and does not rely on
    # a change to the file-level import block.
    from fastapi.concurrency import run_in_threadpool

    # ── Validate content type ──────────────────────────────────────────────
    # Only the type declared by the client is checked at this point; the
    # bytes themselves are validated when the image is opened below.
    if file.content_type not in ("image/jpeg", "image/png", "image/webp", "image/bmp"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Send JPEG, PNG, WEBP, or BMP.",
        )

    # ── Read & preprocess ──────────────────────────────────────────────────
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=400, detail=f"Could not read image: {e}") from e

    # ── Run pipeline ───────────────────────────────────────────────────────
    try:
        # Caption generation is blocking, CPU-bound work. Calling it directly
        # inside this coroutine would stall the event loop (and every other
        # request) until it finishes, so it runs in a worker thread instead.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description
    return JSONResponse({
        "description": caption
    })
# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Hand uvicorn the app object itself rather than the "app:app" import
    # string. The string form makes uvicorn import this file a second time as
    # module "app", which re-runs the module-level model loading and only
    # works when the file is named app.py. (An import string is only required
    # for uvicorn's reload/workers options, which are not used here.)
    uvicorn.run(app, host="0.0.0.0", port=7860)