File size: 3,520 Bytes
67434ff
caf813e
 
 
 
67434ff
 
f82a827
67434ff
dc69312
 
f82a827
67434ff
dc69312
67434ff
dc69312
67434ff
dc69312
 
 
 
 
 
 
 
 
67434ff
dc69312
67434ff
b3d99de
 
67434ff
b3d99de
 
f82a827
b3d99de
 
 
f82a827
b3d99de
 
67434ff
b3d99de
 
 
 
 
67434ff
 
b3d99de
 
 
 
 
 
67434ff
b3d99de
67434ff
 
b3d99de
 
 
67434ff
b3d99de
67434ff
b3d99de
 
67434ff
b3d99de
67434ff
b3d99de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import io
import logging

import torch
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# ── Load BLIP Model ────────────────────────────────────────────────────────
# The processor and the captioning model are built from the same pretrained
# checkpoint identifier. Both are created once, at import time, and shared by
# every call that reads these module-level names.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

def image_to_speech(image: Image.Image) -> str:
    """Return a text caption for *image* produced by the module-level BLIP model.

    Despite the name, this function produces no audio: it returns the decoded
    caption string only.

    Args:
        image: The PIL image to describe.

    Returns:
        The generated caption with special tokens stripped.
    """
    # Decoding settings, kept together so they can be read at a glance.
    generation_options = {
        "max_length": 90,           # room for longer, more detailed sentences
        "num_beams": 8,             # beam search for better caption quality
        "repetition_penalty": 1.2,  # discourage repeated words
        "length_penalty": 1.0,      # neutral preference on caption length
        "early_stopping": True,
    }

    # The processor turns the PIL image into PyTorch tensors ("pt").
    encoded = processor(image, return_tensors="pt")
    token_ids = model.generate(**encoded, **generation_options)

    # Only the first generated sequence is decoded and returned.
    return processor.decode(token_ids[0], skip_special_tokens=True)

# ── FastAPI app ────────────────────────────────────────────────────────────
# Application object that the route decorators in this file register against.
# The keyword arguments are metadata handed to FastAPI as-is.
app = FastAPI(
    version="1.0.0",
    title="VocalEyes API",
    description="Converts an uploaded image into a short scene description via BLIP.",
)

@app.get("/")
def root():
    """Return a small status payload that points callers at ``/predict``."""
    payload = dict(
        status="ok",
        message="VocalEyes API is running. POST an image to /predict",
    )
    return payload

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image and return the description as JSON.

    Args:
        file: The uploaded file. Its declared content type must be JPEG, PNG,
            WEBP, or BMP.

    Returns:
        A ``JSONResponse`` whose body is ``{"description": <caption>}``.

    Raises:
        HTTPException: 415 when the declared content type is not accepted,
            400 when the upload cannot be read or decoded as an image, and
            500 when the captioning step raises.
    """
    # ── Validate content type ──────────────────────────────────────────────
    if file.content_type not in ("image/jpeg", "image/png", "image/webp", "image/bmp"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Send JPEG, PNG, WEBP, or BMP.",
        )

    # ── Read & preprocess ──────────────────────────────────────────────────
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        # Chain the cause so the original decoding error stays attached.
        raise HTTPException(status_code=400, detail=f"Could not read image: {e}") from e

    # ── Run pipeline ───────────────────────────────────────────────────────
    try:
        # Captioning is synchronous, CPU-heavy work. Calling it directly in
        # this coroutine would stall the event loop (and every other request)
        # until generation finishes, so it runs on a worker thread instead.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        # The exception is converted into an HTTP response below, so record
        # the traceback here to keep the failure visible in the server logs.
        logging.getLogger(__name__).exception("Captioning pipeline failed")
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description
    return JSONResponse({
        "description": caption
    })

# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Pass the already-built application object instead of the "app:app"
    # import string. The string form makes uvicorn import this file a second
    # time (repeating the module-level BLIP model load) and only resolves when
    # the file is saved as app.py. The import string is only required for
    # uvicorn's reload/workers options, which are not used here.
    uvicorn.run(app, host="0.0.0.0", port=7860)