from transformers import BlipProcessor, BlipForConditionalGeneration
import io
from PIL import Image
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import uvicorn
import torch
# -- Load BLIP model -----------------------------------------------------------
# Processor and model are built once, at import time, from the same checkpoint.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
def image_to_speech(image: Image.Image) -> str:
    """Return a text caption for *image* using the module-level BLIP objects.

    Despite the name, no audio is produced here; the result is the caption
    string only.

    Args:
        image: PIL image to describe.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    generation_options = {
        "max_length": 90,           # allow longer, more detailed sentences
        "num_beams": 8,             # beam search to improve quality
        "repetition_penalty": 1.2,  # discourage repeated words
        "length_penalty": 1.0,      # balanced caption length
        "early_stopping": True,
    }

    # The processor turns the PIL image into PyTorch tensors ("pt").
    model_inputs = processor(image, return_tensors="pt")
    token_ids = model.generate(**model_inputs, **generation_options)

    # Only the first returned sequence is decoded.
    return processor.decode(token_ids[0], skip_special_tokens=True)
# -- FastAPI app ---------------------------------------------------------------
# Metadata handed to the FastAPI constructor, kept together in one mapping.
_APP_METADATA = {
    "title": "VocalEyes API",
    "description": "Converts an uploaded image into a short scene description via BLIP.",
    "version": "1.0.0",
}
app = FastAPI(**_APP_METADATA)
@app.get("/")
def root():
    # Liveness endpoint: returns a fixed status payload that points callers
    # at the /predict route.
    payload = {
        "status": "ok",
        "message": "VocalEyes API is running. POST an image to /predict",
    }
    return payload
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image and return the text as JSON.

    Args:
        file: Uploaded image whose declared content type must be JPEG, PNG,
            WEBP, or BMP.

    Returns:
        A ``JSONResponse`` of the form ``{"description": <caption>}``.

    Raises:
        HTTPException: 415 if the declared content type is not accepted,
            400 if the upload cannot be read or decoded as an image,
            500 if caption generation fails.
    """
    # Function-scope import keeps this handler self-contained; fastapi is
    # already a dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # -- Validate content type -------------------------------------------------
    # This is the type declared by the client; the decode step below is what
    # actually proves the bytes are an image.
    if file.content_type not in ("image/jpeg", "image/png", "image/webp", "image/bmp"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Send JPEG, PNG, WEBP, or BMP.",
        )

    # -- Read & preprocess -----------------------------------------------------
    try:
        raw = await file.read()
        # The context manager closes the source image once the independent
        # RGB copy has been produced.
        with Image.open(io.BytesIO(raw)) as uploaded:
            image = uploaded.convert("RGB")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read image: {e}") from e

    # -- Run pipeline ----------------------------------------------------------
    try:
        # Caption generation is blocking, compute-heavy work. Calling it
        # directly inside this coroutine would stall the event loop (and every
        # other request) for the whole inference, so it runs in a worker thread.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description.
    return JSONResponse({"description": caption})
# -- Entry point ---------------------------------------------------------------
if __name__ == "__main__":
    # Pass the app object rather than the "app:app" import string. With the
    # string form uvicorn imports this file a second time under the module
    # name "app", which re-runs the top-level model loading (two copies of
    # BLIP in memory) and fails outright unless the file is named app.py.
    uvicorn.run(app, host="0.0.0.0", port=7860)