import io

import torch
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# ── Load BLIP Model ────────────────────────────────────────────────────────
# Both objects are created once at import time from the same checkpoint and
# are then used as module-level globals by the captioning function.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

def image_to_speech(image: Image.Image) -> str:
    """Return a text caption for *image* using the module-level BLIP model.

    Despite the name, no audio is produced: the result is the decoded
    caption string only.

    Args:
        image: The PIL image to describe.

    Returns:
        The caption decoded from the first generated sequence, with special
        tokens stripped.
    """
    # Decoding settings: beam search over 8 beams, captions of up to 90
    # tokens, a mild penalty on repeated tokens and a neutral length penalty.
    generation_options = {
        "max_length": 90,
        "num_beams": 8,
        "repetition_penalty": 1.2,
        "length_penalty": 1.0,
        "early_stopping": True,
    }

    # The processor turns the PIL image into PyTorch tensors for the model.
    tensors = processor(image, return_tensors="pt")
    token_ids = model.generate(**tensors, **generation_options)
    return processor.decode(token_ids[0], skip_special_tokens=True)

# ── FastAPI app ────────────────────────────────────────────────────────────
# Application instance that the route decorators below register against.
app = FastAPI(
    version="1.0.0",
    title="VocalEyes API",
    description=(
        "Converts an uploaded image into a short scene description via BLIP."
    ),
)

@app.get("/")
def root():
    """Report that the service is up and point callers at /predict."""
    payload = {
        "status": "ok",
        "message": "VocalEyes API is running. POST an image to /predict",
    }
    return payload

def _decode_rgb_image(raw: bytes) -> Image.Image:
    """Decode raw upload bytes into an RGB PIL image."""
    return Image.open(io.BytesIO(raw)).convert("RGB")


@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image and return the description as JSON.

    Args:
        file: The uploaded image; its declared content type must be JPEG,
            PNG, WEBP, or BMP.

    Returns:
        A JSONResponse of the form ``{"description": <caption>}``.

    Raises:
        HTTPException: 415 if the declared content type is not accepted,
            400 if the upload cannot be read or decoded as an image,
            500 if caption generation fails.
    """
    # ── Validate content type ──────────────────────────────────────────────
    if file.content_type not in ("image/jpeg", "image/png", "image/webp", "image/bmp"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Send JPEG, PNG, WEBP, or BMP.",
        )

    # ── Read & preprocess ──────────────────────────────────────────────────
    try:
        raw = await file.read()
        # Decoding is CPU-bound, so it runs in the worker thread pool rather
        # than on the event loop.
        image = await run_in_threadpool(_decode_rgb_image, raw)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read image: {e}") from e

    # ── Run pipeline ───────────────────────────────────────────────────────
    try:
        # This is an ``async def`` endpoint, so calling the model directly
        # would block the event loop (and every other request, including the
        # "/" health check) for the whole beam search. Offload it to a worker
        # thread instead.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description
    return JSONResponse({"description": caption})

# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Pass the application object itself instead of the "app:app" import
    # string. The string form only works when this file is named app.py, and
    # it makes uvicorn import the module a second time, which would load the
    # BLIP weights twice. An import string is only required for uvicorn's
    # reload/workers options, which are not used here.
    uvicorn.run(app, host="0.0.0.0", port=7860)