Spaces:

videopix
/

image2caption

Running

App Files Community

videopix committed on Dec 1, 2025

Commit

0ecd601

verified ·

1 Parent(s): 8775421

Update app_working_api.py

Browse files

Files changed (1) hide show

app_working_api.py +50 -239

app_working_api.py CHANGED Viewed

@@ -1,264 +1,75 @@
-import uvicorn
-import base64
 import io
-import numpy as np
-from fastapi import FastAPI
-from fastapi.responses import HTMLResponse
-from pydantic import BaseModel
-from PIL import Image, ImageOps, ImageEnhance
 import torch
-from transformers import BlipProcessor, BlipForConditionalGeneration
-import easyocr
-import os
-# ------------------------
-# HF Token
-# ------------------------
-HF_TOKEN = os.getenv("HF_TOKEN")
-# ------------------------
-# Load BLIP model
-# ------------------------
-device = torch.device("cpu")
-processor = BlipProcessor.from_pretrained(
-    "Salesforce/blip-image-captioning-large",
-    use_auth_token=HF_TOKEN
 )
-model = BlipForConditionalGeneration.from_pretrained(
-    "Salesforce/blip-image-captioning-large",
-    use_auth_token=HF_TOKEN
-).to(device)
-model.eval()
-# ------------------------
-# Load OCR Reader
-# ------------------------
-ocr_reader = easyocr.Reader(
-    ["en"],
-    gpu=False,
-    recog_network="english_g2"      # BEST for mixed fonts / stylized text
-)
-# ------------------------
-# FastAPI App
-# ------------------------
-app = FastAPI()
-class ImageRequest(BaseModel):
-    image_base64: str
-# ------------------------
-# Improve OCR by preprocessing image
-# ------------------------
-def preprocess_for_ocr(img: Image.Image) -> np.ndarray:
-    # Convert to grayscale
-    gray = ImageOps.grayscale(img)
-    # Increase contrast
-    enhancer = ImageEnhance.Contrast(gray)
-    gray = enhancer.enhance(2.0)
-    # Increase brightness slightly
-    enhancer = ImageEnhance.Brightness(gray)
-    gray = enhancer.enhance(1.1)
-    # Convert to numpy
-    return np.array(gray)
-# ------------------------
-# OCR Function (improved)
-# ------------------------
-def extract_text(img: Image.Image) -> str:
-    pre_img = preprocess_for_ocr(img)
-    result = ocr_reader.readtext(
-        pre_img,
-        detail=0,
-        paragraph=True
     )
-    return "\n".join(result) if result else "No text detected."
-# ------------------------
-# Caption Function (clean output)
-# ------------------------
-def create_caption(img: Image.Image) -> str:
-    inputs = processor(img, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out = model.generate(
-            **inputs,
-            max_length=150,
-            min_length=30,
-            num_beams=5,
-            repetition_penalty=1.1,
-            length_penalty=1.0,
-            temperature=0.7
-        )
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    # REMOVE prompt words if BLIP inserted them
-    caption = caption.replace("describe this image", "").strip()
-    caption = caption.replace("describe the image", "").strip()
-    return caption
-# ------------------------
-# API Endpoint: /img2caption
-# ------------------------
 @app.post("/img2caption")
-async def img2caption(payload: ImageRequest):
     try:
-        img_bytes = base64.b64decode(payload.image_base64)
-        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        caption = create_caption(img)
-        return {"caption": caption}
-    except Exception as e:
-        return {"error": str(e)}
-# ------------------------
-# API Endpoint: /ocr
-# ------------------------
-@app.post("/ocr")
-async def ocr_endpoint(payload: ImageRequest):
-    try:
-        img_bytes = base64.b64decode(payload.image_base64)
-        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        text = extract_text(img)
-        return {"ocr_text": text}
-    except Exception as e:
-        return {"error": str(e)}
-# ------------------------
-# API Endpoint: /ocr
-# ------------------------
-@app.post("/ocr")
-async def ocr_endpoint(payload: ImageRequest):
-    try:
-        img_bytes = base64.b64decode(payload.image_base64)
-        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        text = extract_text(img)
-        return {"ocr_text": text}
     except Exception as e:
-        return {"error": str(e)}
-# ------------------------
-# UI Endpoint: /
-# ------------------------
-@app.get("/", response_class=HTMLResponse)
-async def ui_page():
-    return """
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Image Caption + OCR</title>
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
-    <style>
-        body { background: #f5f7fa; }
-        .container { max-width: 650px; margin-top: 60px; }
-        #preview {
-            width: 100%; border-radius: 10px; margin-top: 20px; display: none;
-        }
-        #caption-box {
-            font-size: 18px; margin-top: 20px; padding: 15px;
-            border-radius: 8px; background: #e3f2fd; display: none;
-        }
-    </style>
-</head>
-<body>
-<div class="container">
-    <div class="card shadow-sm">
-        <div class="card-body">
-            <h3 class="text-center mb-3">Image Caption + OCR Extractor</h3>
-            <input type="file" class="form-control" id="imageInput" accept="image/*">
-            <img id="preview">
-            <div class="d-grid gap-2 mt-3">
-                <button class="btn btn-primary btn-lg" onclick="sendCaption()">
-                    Generate Detailed Caption
-                </button>
-                <button class="btn btn-success btn-lg" onclick="sendOCR()">
-                    Extract Text (OCR)
-                </button>
-            </div>
-            <div id="caption-box"></div>
-        </div>
-    </div>
-</div>
-<script>
-let base64Image = "";
-document.getElementById("imageInput").addEventListener("change", function(event){
-    const file = event.target.files[0];
-    const reader = new FileReader();
-    reader.onload = function(e){
-        base64Image = e.target.result.split(",")[1];
-        const preview = document.getElementById("preview");
-        preview.src = e.target.result;
-        preview.style.display = "block";
-    };
-    reader.readAsDataURL(file);
-});
-async function sendCaption() {
-    if (!base64Image) {
-        alert("Please upload an image first.");
-        return;
-    }
-    const box = document.getElementById("caption-box");
-    box.style.display = "block";
-    box.innerHTML = "Generating caption...";
-    const res = await fetch("/img2caption", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ image_base64: base64Image })
-    });
-    const data = await res.json();
-    box.innerHTML = data.caption
-        ? "<strong>Caption:</strong> " + data.caption
-        : "<strong>Error:</strong> " + data.error;
-}
-async function sendOCR() {
-    if (!base64Image) {
-        alert("Please upload an image first.");
-        return;
-    }
-    const box = document.getElementById("caption-box");
-    box.style.display = "block";
-    box.innerHTML = "Extracting text...";
-    const res = await fetch("/ocr", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ image_base64: base64Image })
-    });
-    const data = await res.json();
-    box.innerHTML = data.ocr_text
-        ? "<strong>OCR Result:</strong><br>" + data.ocr_text.replaceAll("\\n", "<br>")
-        : "<strong>Error:</strong> " + data.error;
-}
-</script>
-</body>
-</html>
-"""
-# -------------------------
-# Run App
-# -------------------------
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import io
+import asyncio
+import threading
+import time
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
+from PIL import Image
 import torch
+from transformers import AutoProcessor, AutoModelForCausalLM
+import requests
+app = FastAPI(title="Image Caption API")
+# Load model once at startup
+device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(
+    "microsoft/Florence-2-base",
+    trust_remote_code=True
 )
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-base",
+    trust_remote_code=True
+).to(device).eval()
+# Lock that serializes model inference: only one request runs caption_image at a time
+inference_lock = asyncio.Lock()
+def caption_image(image: Image.Image) -> str:
+    inputs = processor(
+        text="<MORE_DETAILED_CAPTION>",
+        images=image,
+        return_tensors="pt",
+    ).to(device)
+    output_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=256,
+        num_beams=3,
     )
+    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
+    parsed = processor.post_process_generation(
+        decoded,
+        task="<MORE_DETAILED_CAPTION>",
+        image_size=(image.width, image.height),
+    )
+    return parsed["<MORE_DETAILED_CAPTION>"]
 @app.post("/img2caption")
+async def img2caption(file: UploadFile = File(...)):
     try:
+        # Read image
+        data = await file.read()
+        image = Image.open(io.BytesIO(data)).convert("RGB")
+        # Protect inference in async server
+        async with inference_lock:
+            caption = caption_image(image)
+        return {"caption": caption}
     except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)
+@app.get("/health")
+async def health():
+    return {"status": "ok"}