Upload app_working_api.py
app_working_api.py  ADDED  (+264 -0)
@@ -0,0 +1,264 @@
import uvicorn
import base64
import io
import numpy as np
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from PIL import Image, ImageOps, ImageEnhance
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import easyocr
import os

# ------------------------
# HF Token
# ------------------------
HF_TOKEN = os.getenv("HF_TOKEN")

# ------------------------
# Load BLIP model
# ------------------------
device = torch.device("cpu")

processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large",
    use_auth_token=HF_TOKEN
)

model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large",
    use_auth_token=HF_TOKEN
).to(device)

model.eval()

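# NOTE: newer transformers releases deprecate use_auth_token in favor of token; if the
# installed version warns about it, the equivalent call would be (illustrative):
#   BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", token=HF_TOKEN)
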
# ------------------------
# Load OCR Reader
# ------------------------
ocr_reader = easyocr.Reader(
    ["en"],
    gpu=False,
    recog_network="english_g2"  # BEST for mixed fonts / stylized text
)

# ------------------------
# FastAPI App
# ------------------------
app = FastAPI()


class ImageRequest(BaseModel):
    image_base64: str


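# Both endpoints below expect a JSON body matching ImageRequest, e.g. (illustrative):
#   {"image_base64": "<base64-encoded image bytes, without any data: URL prefix>"}
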
# ------------------------
# Improve OCR by preprocessing image
# ------------------------
def preprocess_for_ocr(img: Image.Image) -> np.ndarray:
    # Convert to grayscale
    gray = ImageOps.grayscale(img)

    # Increase contrast
    enhancer = ImageEnhance.Contrast(gray)
    gray = enhancer.enhance(2.0)

    # Increase brightness slightly
    enhancer = ImageEnhance.Brightness(gray)
    gray = enhancer.enhance(1.1)

    # Convert to numpy
    return np.array(gray)


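# Quick local sanity check of the preprocessing output (illustrative; assumes a file
# named sample.jpg exists next to this script):
#   Image.fromarray(preprocess_for_ocr(Image.open("sample.jpg"))).save("ocr_debug.png")
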
# ------------------------
# OCR Function (improved)
# ------------------------
def extract_text(img: Image.Image) -> str:
    pre_img = preprocess_for_ocr(img)

    result = ocr_reader.readtext(
        pre_img,
        detail=0,
        paragraph=True
    )

    return "\n".join(result) if result else "No text detected."


# ------------------------
# Caption Function (clean output)
# ------------------------
def create_caption(img: Image.Image) -> str:
    inputs = processor(img, return_tensors="pt").to(device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_length=150,
            min_length=30,
            num_beams=5,
            repetition_penalty=1.1,
            length_penalty=1.0
        )

    caption = processor.decode(out[0], skip_special_tokens=True)

    # Remove prompt words if BLIP inserted them
    caption = caption.replace("describe this image", "").strip()
    caption = caption.replace("describe the image", "").strip()

    return caption


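# Standalone usage of the captioner, outside the API (illustrative):
#   img = Image.open("sample.jpg").convert("RGB")
#   print(create_caption(img))
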
# ------------------------
# API Endpoint: /img2caption
# ------------------------
@app.post("/img2caption")
async def img2caption(payload: ImageRequest):
    try:
        img_bytes = base64.b64decode(payload.image_base64)
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        caption = create_caption(img)
        return {"caption": caption}

    except Exception as e:
        return {"error": str(e)}


# ------------------------
# API Endpoint: /ocr
# ------------------------
@app.post("/ocr")
async def ocr_endpoint(payload: ImageRequest):
    try:
        img_bytes = base64.b64decode(payload.image_base64)
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        text = extract_text(img)
        return {"ocr_text": text}

    except Exception as e:
        return {"error": str(e)}


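# Example client calls against a running instance (illustrative; assumes the requests
# package is installed and a local file named sample.jpg exists):
#   import base64, requests
#   with open("sample.jpg", "rb") as f:
#       payload = {"image_base64": base64.b64encode(f.read()).decode()}
#   print(requests.post("http://localhost:7860/img2caption", json=payload).json())
#   print(requests.post("http://localhost:7860/ocr", json=payload).json())
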
# ------------------------
# UI Endpoint: /
# ------------------------
@app.get("/", response_class=HTMLResponse)
async def ui_page():
    return """
<!DOCTYPE html>
<html>
<head>
    <title>Image Caption + OCR</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body { background: #f5f7fa; }
        .container { max-width: 650px; margin-top: 60px; }
        #preview {
            width: 100%; border-radius: 10px; margin-top: 20px; display: none;
        }
        #caption-box {
            font-size: 18px; margin-top: 20px; padding: 15px;
            border-radius: 8px; background: #e3f2fd; display: none;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="card shadow-sm">
            <div class="card-body">
                <h3 class="text-center mb-3">Image Caption + OCR Extractor</h3>
                <input type="file" class="form-control" id="imageInput" accept="image/*">
                <img id="preview">
                <div class="d-grid gap-2 mt-3">
                    <button class="btn btn-primary btn-lg" onclick="sendCaption()">
                        Generate Detailed Caption
                    </button>
                    <button class="btn btn-success btn-lg" onclick="sendOCR()">
                        Extract Text (OCR)
                    </button>
                </div>
                <div id="caption-box"></div>
            </div>
        </div>
    </div>
    <script>
        let base64Image = "";
        document.getElementById("imageInput").addEventListener("change", function(event){
            const file = event.target.files[0];
            const reader = new FileReader();
            reader.onload = function(e){
                base64Image = e.target.result.split(",")[1];
                const preview = document.getElementById("preview");
                preview.src = e.target.result;
                preview.style.display = "block";
            };
            reader.readAsDataURL(file);
        });
        async function sendCaption() {
            if (!base64Image) {
                alert("Please upload an image first.");
                return;
            }
            const box = document.getElementById("caption-box");
            box.style.display = "block";
            box.innerHTML = "Generating caption...";
            const res = await fetch("/img2caption", {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ image_base64: base64Image })
            });
            const data = await res.json();
            box.innerHTML = data.caption
                ? "<strong>Caption:</strong> " + data.caption
                : "<strong>Error:</strong> " + data.error;
        }
        async function sendOCR() {
            if (!base64Image) {
                alert("Please upload an image first.");
                return;
            }
            const box = document.getElementById("caption-box");
            box.style.display = "block";
            box.innerHTML = "Extracting text...";
            const res = await fetch("/ocr", {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ image_base64: base64Image })
            });
            const data = await res.json();
            box.innerHTML = data.ocr_text
                ? "<strong>OCR Result:</strong><br>" + data.ocr_text.replaceAll("\\n", "<br>")
                : "<strong>Error:</strong> " + data.error;
        }
    </script>
</body>
</html>
"""


# -------------------------
# Run App
# -------------------------
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
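# To launch locally (illustrative):
#   python app_working_api.py
# or, equivalently, with the uvicorn CLI:
#   uvicorn app_working_api:app --host 0.0.0.0 --port 7860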