Spaces:

d3evil4
/

Image2Caption

Sleeping

App Files Community

khushalcodiste committed on Mar 9

Commit

afd6ed3

1 Parent(s): da2a069

fix: huh

Browse files

Files changed (2) hide show

src/model.py +3 -15
src/server.py +7 -67

src/model.py CHANGED Viewed

@@ -15,17 +15,7 @@ MAX_MAX_TOKENS = int(os.getenv("MAX_MAX_TOKENS", "256"))
 MAX_IMAGE_SIDE = int(os.getenv("MAX_IMAGE_SIDE", "896"))
 RESIZE_MULTIPLE = int(os.getenv("RESIZE_MULTIPLE", "32"))
 NUM_BEAMS = int(os.getenv("NUM_BEAMS", "3"))
-TASKS = {
-    "caption": "<CAPTION>",
-    "detailed_caption": "<DETAILED_CAPTION>",
-    "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
-    "ocr": "<OCR>",
-    "ocr_with_region": "<OCR_WITH_REGION>",
-    "object_detection": "<OD>",
-    "dense_region_caption": "<DENSE_REGION_CAPTION>",
-    "region_proposal": "<REGION_PROPOSAL>",
-}
 _model = None
 _processor = None
@@ -78,13 +68,11 @@ def load_model() -> tuple[Any, Any]:
 def generate_caption(
     image_bytes: bytes,
-    task: str = "caption",
     text_input: str | None = None,
     max_tokens: int = DEFAULT_MAX_TOKENS,
 ) -> dict[str, Any]:
     model, processor = load_model()
-    prompt_task = TASKS.get(task, TASKS["caption"])
-    prompt = f"{prompt_task} {text_input.strip()}" if text_input else prompt_task
     safe_max_tokens = min(max(int(max_tokens), 8), MAX_MAX_TOKENS)
     image = _prepare_image(image_bytes)
@@ -110,7 +98,7 @@ def generate_caption(
         try:
             parsed = post_process(
                 generated_text,
-                task=prompt_task,
                 image_size=(image.width, image.height),
             )
         except Exception:

 MAX_IMAGE_SIDE = int(os.getenv("MAX_IMAGE_SIDE", "896"))
 RESIZE_MULTIPLE = int(os.getenv("RESIZE_MULTIPLE", "32"))
 NUM_BEAMS = int(os.getenv("NUM_BEAMS", "3"))
+DEFAULT_PROMPT = os.getenv("DEFAULT_PROMPT", "<CAPTION>")
 _model = None
 _processor = None
 def generate_caption(
     image_bytes: bytes,
     text_input: str | None = None,
     max_tokens: int = DEFAULT_MAX_TOKENS,
 ) -> dict[str, Any]:
     model, processor = load_model()
+    prompt = f"{DEFAULT_PROMPT} {text_input.strip()}" if text_input else DEFAULT_PROMPT
     safe_max_tokens = min(max(int(max_tokens), 8), MAX_MAX_TOKENS)
     image = _prepare_image(image_bytes)
         try:
             parsed = post_process(
                 generated_text,
+                task=DEFAULT_PROMPT,
                 image_size=(image.width, image.height),
             )
         except Exception:

src/server.py CHANGED Viewed

@@ -5,13 +5,12 @@ from typing import Any
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import HTMLResponse
-from .model import MODEL_ID, TASKS, DEFAULT_MAX_TOKENS, generate_caption, load_model
 app = FastAPI(
     title="img3txt - Florence-2 API",
-    description="Generate captions, OCR, object detection and more from images using Florence-2.",
     version="1.0.0",
 )
@@ -29,84 +28,25 @@ def warmup_model() -> None:
     load_model()
-@app.get("/", response_class=HTMLResponse, include_in_schema=False)
-def root() -> str:
-    return """<!DOCTYPE html>
-<html lang=\"en\"><head><meta charset=\"utf-8\">
-<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">
-<title>img3txt - Florence-2 Image Captioning API</title>
-<style>
-*{margin:0;padding:0;box-sizing:border-box}
-body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
-.card{background:#1e293b;border-radius:16px;padding:2.5rem;max-width:520px;width:90%;text-align:center;box-shadow:0 25px 50px rgba(0,0,0,.4)}
-h1{font-size:1.8rem;margin-bottom:.5rem}
-.sub{color:#94a3b8;margin-bottom:1.5rem}
-.btn{display:inline-block;padding:.75rem 1.5rem;background:#3b82f6;color:#fff;border-radius:8px;text-decoration:none;font-weight:600;margin:.25rem}
-.btn:hover{background:#2563eb}
-.tasks{margin-top:1.5rem;text-align:left;background:#0f172a;border-radius:8px;padding:1rem}
-.tasks code{color:#38bdf8}
-</style></head><body>
-<div class=\"card\">
-<h1>img3txt</h1>
-<p class=\"sub\">Image captioning, OCR &amp; object detection powered by Florence-2</p>
-<a class=\"btn\" href=\"/docs\">Swagger UI</a>
-<a class=\"btn\" href=\"/health\">Health Check</a>
-<div class=\"tasks\">
-<p><strong>POST /caption</strong> with form fields:</p>
-<ul style=\"margin:.5rem 0 0 1.2rem;color:#94a3b8\">
-<li><code>file</code> - image (required)</li>
-<li><code>task</code> - caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
-<li><code>max_tokens</code> - default 64 (smaller = faster)</li>
-</ul>
-</div>
-</div></body></html>"""
 @app.get("/health")
 def health() -> dict[str, Any]:
-    return {"status": "ok", "model": MODEL_ID, "tasks": list(TASKS.keys())}
-@app.post("/caption")
-async def caption(
     file: UploadFile = File(...),
-    task: str = Form("caption"),
     text: str | None = Form(None),
     max_tokens: int = Form(DEFAULT_MAX_TOKENS),
 ) -> dict[str, Any]:
-    if task not in TASKS:
-        raise HTTPException(status_code=400, detail=f"Invalid task. Choose from: {', '.join(TASKS.keys())}")
     image_bytes = await file.read()
     if not image_bytes:
         raise HTTPException(status_code=400, detail="Empty file uploaded")
-    result = generate_caption(image_bytes, task, text, max_tokens)
-    return {"task": task, "result": result}
-@app.post("/caption/batch")
-async def caption_batch(
-    files: list[UploadFile] = File(...),
-    task: str = Form("caption"),
-    text: str | None = Form(None),
-    max_tokens: int = Form(DEFAULT_MAX_TOKENS),
-) -> dict[str, Any]:
-    if task not in TASKS:
-        raise HTTPException(status_code=400, detail=f"Invalid task. Choose from: {', '.join(TASKS.keys())}")
-    results: list[dict[str, Any]] = []
-    for upload in files:
-        image_bytes = await upload.read()
-        if not image_bytes:
-            continue
-        result = generate_caption(image_bytes, task, text, max_tokens)
-        results.append({"filename": upload.filename, "task": task, "result": result})
-    if not results:
-        raise HTTPException(status_code=400, detail="No files uploaded")
-    return {"results": results}
 if __name__ == "__main__":

 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
+from .model import MODEL_ID, DEFAULT_MAX_TOKENS, generate_caption, load_model
 app = FastAPI(
     title="img3txt - Florence-2 API",
+    description="Simple image-to-text endpoint powered by Florence-2-base.",
     version="1.0.0",
 )
     load_model()
 @app.get("/health")
 def health() -> dict[str, Any]:
+    return {"status": "ok", "model": MODEL_ID}
+@app.post("/predict")
+async def predict(
     file: UploadFile = File(...),
     text: str | None = Form(None),
     max_tokens: int = Form(DEFAULT_MAX_TOKENS),
 ) -> dict[str, Any]:
     image_bytes = await file.read()
     if not image_bytes:
         raise HTTPException(status_code=400, detail="Empty file uploaded")
+    result = generate_caption(image_bytes, text, max_tokens)
+    return {"result": result}
 if __name__ == "__main__":