Spaces:
Sleeping
Sleeping
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| import io | |
| from PIL import Image | |
| from fastapi import FastAPI, File, UploadFile, HTTPException | |
| from fastapi.responses import JSONResponse | |
| import uvicorn | |
| import torch | |
# -- Load BLIP model ----------------------------------------------------------
# Both objects are created once at import time and reused by image_to_speech().
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
def image_to_speech(image: Image.Image) -> str:
    """Return a text caption for *image* produced by the module-level BLIP model.

    Despite the name, no audio is produced here: the function returns the
    decoded caption string only.

    Args:
        image: A PIL image; the processor converts it to PyTorch tensors.

    Returns:
        The decoded caption with special tokens stripped.
    """
    generation_options = {
        "max_length": 90,           # allow longer, more detailed sentences
        "num_beams": 8,             # beam search to improve quality
        "repetition_penalty": 1.2,  # discourage repeated words
        "length_penalty": 1.0,      # balanced caption length
        "early_stopping": True,
    }

    encoded = processor(image, return_tensors="pt")
    token_ids = model.generate(**encoded, **generation_options)

    # Only the first returned sequence is decoded.
    return processor.decode(token_ids[0], skip_special_tokens=True)
# -- FastAPI app --------------------------------------------------------------
# Metadata passed to the FastAPI constructor (title, description, version).
_API_INFO = {
    "title": "VocalEyes API",
    "description": "Converts an uploaded image into a short scene description via BLIP.",
    "version": "1.0.0",
}
app = FastAPI(**_API_INFO)
@app.get("/")
def root():
    """Health-check endpoint.

    Registered on GET "/" so the function is actually reachable; without a
    route decorator it is defined but never served.

    Returns:
        A dict with a fixed "status" of "ok" and a message pointing clients
        at the /predict endpoint.
    """
    return {"status": "ok", "message": "VocalEyes API is running. POST an image to /predict"}
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image and return the description as JSON.

    Registered on POST "/predict", the path the root endpoint's message tells
    clients to use.

    Args:
        file: The uploaded image. Its declared content type must be JPEG,
            PNG, WEBP, or BMP.

    Returns:
        A JSONResponse of the form {"description": <caption>}.

    Raises:
        HTTPException: 415 if the declared content type is not supported,
            400 if the bytes cannot be opened as an image, 500 if the
            captioning pipeline fails.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added; fastapi is already a dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # -- Validate content type ------------------------------------------------
    if file.content_type not in ("image/jpeg", "image/png", "image/webp", "image/bmp"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Send JPEG, PNG, WEBP, or BMP.",
        )

    # -- Read & preprocess ----------------------------------------------------
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status_code=400, detail=f"Could not read image: {e}") from e

    # -- Run pipeline ---------------------------------------------------------
    try:
        # image_to_speech is synchronous and compute-heavy (beam search);
        # calling it directly here would block the event loop, so it is run
        # in the threadpool instead.
        caption = await run_in_threadpool(image_to_speech, image)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Pipeline error: {e}") from e

    # Return the clean description
    return JSONResponse({"description": caption})
# -- Entry point --------------------------------------------------------------
if __name__ == "__main__":
    # Pass the app object itself instead of the "app:app" import string.
    # The string form only works when this file is named app.py, and it makes
    # uvicorn import the module a second time, which would re-run the
    # module-level model loading. The import-string form is only needed for
    # uvicorn's reload/workers options, which are not used here.
    uvicorn.run(app, host="0.0.0.0", port=7860)