Spaces:

d3evil4
/

Image2Caption

Sleeping

File size: 6,615 Bytes

import Fastify from "fastify";
import multipart from "@fastify/multipart";
import swagger from "@fastify/swagger";
import swaggerUi from "@fastify/swagger-ui";
import cors from "@fastify/cors";
import { generateCaption, loadModel, TASKS } from "./model.js";

// Per-file upload cap handed to the multipart plugin: 20 MiB.
const MAX_UPLOAD_BYTES = 20 * 1024 * 1024;

// Single Fastify instance that all plugins and routes below attach to;
// the built-in logger is switched on.
const app = Fastify({ logger: true });

// --- Plugins ---
const multipartOptions = { limits: { fileSize: MAX_UPLOAD_BYTES } };

await app.register(cors);
await app.register(multipart, multipartOptions);

// Top-level metadata of the generated OpenAPI document.
const openApiInfo = {
  title: "img3txt — Image Captioning API",
  description:
    "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
  version: "1.0.0",
};

// Tag groups referenced by the route schemas (`tags: [...]`).
const openApiTags = [
  { name: "caption", description: "Image captioning endpoints" },
  { name: "health", description: "Health check" },
];

// Register the OpenAPI generator; the server URL is relative ("/").
await app.register(swagger, {
  openapi: {
    info: openApiInfo,
    servers: [{ url: "/" }],
    tags: openApiTags,
  },
});

// Interactive API docs are mounted under /docs (linked from the landing page).
const swaggerUiOptions = {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
};

await app.register(swaggerUi, swaggerUiOptions);

// --- Schemas ---
// Names of the supported tasks (the keys of TASKS). Used in the /health
// payload, the /caption OpenAPI description and the "Invalid task" errors.
const taskEnum = Object.keys(TASKS);

// Fields of a successful single-image response:
//   task   - name of the task that was run
//   result - free-form object (any properties allowed)
const captionResponseProperties = {
  task: { type: "string", example: "caption" },
  result: { type: "object", additionalProperties: true },
};

/** JSON schema for the 200 response of POST /caption. */
const captionResponseSchema = {
  type: "object",
  properties: captionResponseProperties,
};

// One entry of the batch response: the uploaded file's name, the task that
// was run, and a free-form result object.
const batchResultItemSchema = {
  type: "object",
  properties: {
    filename: { type: "string" },
    task: { type: "string" },
    result: { type: "object", additionalProperties: true },
  },
};

/** JSON schema for the 200 response of POST /caption/batch. */
const batchResponseSchema = {
  type: "object",
  properties: {
    results: { type: "array", items: batchResultItemSchema },
  },
};

/**
 * JSON schema for error responses: an object carrying a single
 * human-readable `error` message. Used for the 400 responses of the
 * caption routes.
 */
const errorSchema = {
  type: "object",
  properties: { error: { type: "string" } },
};

// --- Routes ---

// Landing page — HF Spaces iframe shows this
// The markup is static (no interpolation), so it is kept in a module-level
// constant instead of being rebuilt inside the handler.
const LANDING_PAGE_HTML = `<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>img3txt — Florence-2 Image Captioning API</title>
<style>
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
.card{background:#1e293b;border-radius:16px;padding:2.5rem;max-width:520px;width:90%;text-align:center;box-shadow:0 25px 50px rgba(0,0,0,.4)}
h1{font-size:1.8rem;margin-bottom:.5rem}
.sub{color:#94a3b8;margin-bottom:1.5rem}
.btn{display:inline-block;padding:.75rem 1.5rem;background:#3b82f6;color:#fff;border-radius:8px;text-decoration:none;font-weight:600;margin:.25rem}
.btn:hover{background:#2563eb}
.tasks{margin-top:1.5rem;text-align:left;background:#0f172a;border-radius:8px;padding:1rem}
.tasks code{color:#38bdf8}
</style></head><body>
<div class="card">
<h1>img3txt</h1>
<p class="sub">Image captioning, OCR &amp; object detection powered by Florence-2 (ONNX)</p>
<a class="btn" href="/docs">Swagger UI</a>
<a class="btn" href="/health">Health Check</a>
<div class="tasks">
<p><strong>POST /caption</strong> with form fields:</p>
<ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
<li><code>file</code> — image (required)</li>
<li><code>task</code> — caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
<li><code>max_tokens</code> — default 100</li>
</ul>
</div>
</div></body></html>`;

/**
 * GET / — serves the static HTML landing page with links to /docs and
 * /health. Hidden from the generated OpenAPI document (`hide: true`).
 */
app.get(
  "/",
  { schema: { hide: true } },
  async (req, reply) => {
    // An async handler that answers through reply.send() should return the
    // reply object, so Fastify does not also treat the promise's `undefined`
    // resolution value as a payload.
    return reply.type("text/html").send(LANDING_PAGE_HTML);
  }
);

// Shape of the GET /health payload as advertised in the OpenAPI document.
const healthResponseSchema = {
  type: "object",
  properties: {
    status: { type: "string", example: "ok" },
    model: { type: "string" },
    tasks: { type: "array", items: { type: "string" } },
  },
};

/**
 * GET /health handler.
 *
 * @returns {Promise<{status: string, model: string, tasks: string[]}>}
 *   A fixed "ok" status, the model identifier and the supported task names.
 */
async function healthHandler() {
  return {
    status: "ok",
    model: "onnx-community/Florence-2-base",
    tasks: taskEnum,
  };
}

app.get(
  "/health",
  {
    schema: {
      tags: ["health"],
      summary: "Health check",
      response: { 200: healthResponseSchema },
    },
  },
  healthHandler
);

/**
 * POST /caption — run one task on a single uploaded image.
 *
 * Multipart form fields:
 *   file       - the image (required)
 *   task       - one of the TASKS keys; defaults to "caption"
 *   text       - optional text input forwarded to generateCaption (null if absent)
 *   max_tokens - positive integer; defaults to 100
 *
 * Responds 200 with `{ task, result }`, or 400 with `{ error }` when no file
 * was uploaded, the task is unknown, or max_tokens is not a positive integer.
 */
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    // NOTE(review): these fields are read before the file has been buffered;
    // form fields sent after the file part may not be visible yet — confirm
    // against the @fastify/multipart documentation.
    const task = data.fields.task?.value || "caption";
    const textInput = data.fields.text?.value || null;
    const maxTokens = Number.parseInt(
      data.fields.max_tokens?.value || "100",
      10
    );

    // Match against TASKS' own keys only. A truthiness lookup (TASKS[task])
    // would also accept inherited names such as "constructor" or "toString"
    // if TASKS is a plain object.
    if (!taskEnum.includes(task)) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    // A non-numeric value parses to NaN; reject it (and non-positive values)
    // instead of handing it to the model.
    if (!Number.isInteger(maxTokens) || maxTokens < 1) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    const buffer = await data.toBuffer();
    const result = await generateCaption(buffer, task, textInput, maxTokens);

    return { task, result };
  }
);

/**
 * POST /caption/batch — run the same task on every uploaded image.
 *
 * Multipart form parts:
 *   (file parts) - one or more images (at least one required)
 *   task         - one of the TASKS keys; defaults to "caption"
 *   text         - optional text input forwarded to generateCaption (null if absent)
 *   max_tokens   - positive integer; defaults to 100
 *
 * Empty field values fall back to the defaults, matching POST /caption.
 *
 * Responds 200 with `{ results: [{ filename, task, result }, ...] }` in
 * upload order, or 400 with `{ error }` when no files were uploaded, the task
 * is unknown, or max_tokens is not a positive integer.
 */
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. All images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const files = [];
    let task = "caption";
    let textInput = null;
    let maxTokens = 100;

    // Walk every multipart part: buffer the files, pick up the settings.
    for await (const part of req.parts()) {
      if (part.type === "file") {
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
      } else if (part.fieldname === "task") {
        task = part.value || "caption";
      } else if (part.fieldname === "text") {
        textInput = part.value || null;
      } else if (part.fieldname === "max_tokens") {
        maxTokens = Number.parseInt(part.value || "100", 10);
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }

    // Match against TASKS' own keys only. A truthiness lookup (TASKS[task])
    // would also accept inherited names such as "constructor" or "toString"
    // if TASKS is a plain object.
    if (!taskEnum.includes(task)) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    // A non-numeric value parses to NaN; reject it (and non-positive values)
    // instead of handing it to the model.
    if (!Number.isInteger(maxTokens) || maxTokens < 1) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    // Images are processed one after another, in upload order.
    const results = [];
    for (const { filename, buffer } of files) {
      const result = await generateCaption(buffer, task, textInput, maxTokens);
      results.push({ filename, task, result });
    }

    return { results };
  }
);

// --- Start ---
// Environment variables are strings; convert so listen() receives a numeric
// port. Falls back to 7860 when PORT is unset or empty.
const PORT = Number(process.env.PORT || 7860);

// Pre-load model then start server. A failure in either step is logged
// through the app logger and terminates the process with a non-zero exit code.
try {
  await loadModel();
  await app.listen({ host: "0.0.0.0", port: PORT });
} catch (err) {
  app.log.error(err);
  process.exit(1);
}