Image2Caption / src /server.js
khushalcodiste's picture
feat: added
a022cd7
raw
history blame
6.62 kB
import Fastify from "fastify";
import multipart from "@fastify/multipart";
import swagger from "@fastify/swagger";
import swaggerUi from "@fastify/swagger-ui";
import cors from "@fastify/cors";
import { generateCaption, loadModel, TASKS } from "./model.js";
// Shared Fastify instance, created with its built-in logger turned on.
const app = Fastify({ logger: true });

// --- Plugins ---

// Per-file upload ceiling handed to the multipart plugin (20 * 1024 * 1024 bytes).
const MAX_UPLOAD_BYTES = 20 * 1024 * 1024;

// OpenAPI document metadata passed to @fastify/swagger.
const openapiDocument = {
  info: {
    title: "img3txt — Image Captioning API",
    description:
      "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
    version: "1.0.0",
  },
  servers: [{ url: "/" }],
  tags: [
    { name: "caption", description: "Image captioning endpoints" },
    { name: "health", description: "Health check" },
  ],
};

// Options for the Swagger UI plugin, which is mounted under /docs.
const swaggerUiOptions = {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
};

// Plugins are registered one at a time, in this order: CORS, multipart parsing,
// the OpenAPI generator, then its UI.
await app.register(cors);
await app.register(multipart, { limits: { fileSize: MAX_UPLOAD_BYTES } });
await app.register(swagger, { openapi: openapiDocument });
await app.register(swaggerUi, swaggerUiOptions);
// --- Schemas ---
// Names of the supported tasks: the keys of the TASKS object imported from ./model.js.
// Used below for the /health payload, the /caption description and the
// "Invalid task" error messages.
const taskEnum = Object.keys(TASKS);
/** Returns a fresh schema for an object that may carry arbitrary properties. */
const anyObjectSchema = () => ({ type: "object", additionalProperties: true });

/** Wraps a property map in a JSON-schema object definition. */
const objectSchema = (properties) => ({ type: "object", properties });

// 200 response of POST /caption: the task name plus a free-form result object.
const captionResponseSchema = objectSchema({
  task: { type: "string", example: "caption" },
  result: anyObjectSchema(),
});

// 200 response of POST /caption/batch: one entry per uploaded file.
const batchResponseSchema = objectSchema({
  results: {
    type: "array",
    items: objectSchema({
      filename: { type: "string" },
      task: { type: "string" },
      result: anyObjectSchema(),
    }),
  },
});

// Error payload shared by the 400 responses.
const errorSchema = objectSchema({
  error: { type: "string" },
});
// --- Routes ---

// Landing page — HF Spaces iframe shows this.
// The markup is static, so it is rendered once at startup instead of per request.
// The task list is generated from taskEnum (the keys of TASKS) so the page cannot
// drift from the tasks the API actually validates against.
const landingPageHtml = `<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>img3txt — Florence-2 Image Captioning API</title>
<style>
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
.card{background:#1e293b;border-radius:16px;padding:2.5rem;max-width:520px;width:90%;text-align:center;box-shadow:0 25px 50px rgba(0,0,0,.4)}
h1{font-size:1.8rem;margin-bottom:.5rem}
.sub{color:#94a3b8;margin-bottom:1.5rem}
.btn{display:inline-block;padding:.75rem 1.5rem;background:#3b82f6;color:#fff;border-radius:8px;text-decoration:none;font-weight:600;margin:.25rem}
.btn:hover{background:#2563eb}
.tasks{margin-top:1.5rem;text-align:left;background:#0f172a;border-radius:8px;padding:1rem}
.tasks code{color:#38bdf8}
</style></head><body>
<div class="card">
<h1>img3txt</h1>
<p class="sub">Image captioning, OCR &amp; object detection powered by Florence-2 (ONNX)</p>
<a class="btn" href="/docs">Swagger UI</a>
<a class="btn" href="/health">Health Check</a>
<div class="tasks">
<p><strong>POST /caption</strong> with form fields:</p>
<ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
<li><code>file</code> — image (required)</li>
<li><code>task</code> — ${taskEnum.join(", ")} (default caption)</li>
<li><code>text</code> — optional text input</li>
<li><code>max_tokens</code> — default 100</li>
</ul>
</div>
</div></body></html>`;

// Hidden from the generated OpenAPI document via `hide: true`.
app.get(
  "/",
  { schema: { hide: true } },
  async (req, reply) => {
    reply.type("text/html").send(landingPageHtml);
    // Fastify asks async handlers that call reply.send() themselves to return the
    // reply, so the resolved `undefined` is never treated as a payload.
    return reply;
  }
);
// Shape of the 200 payload returned by GET /health.
const healthResponseSchema = {
  type: "object",
  properties: {
    status: { type: "string", example: "ok" },
    model: { type: "string" },
    tasks: { type: "array", items: { type: "string" } },
  },
};

// Route options: tagged "health" in the OpenAPI document.
const healthRouteOptions = {
  schema: {
    tags: ["health"],
    summary: "Health check",
    response: { 200: healthResponseSchema },
  },
};

/**
 * GET /health handler.
 * @returns {Promise<{status: string, model: string, tasks: string[]}>} a fixed
 *   "ok" status, the model identifier string and the supported task names.
 */
async function reportHealth() {
  return {
    status: "ok",
    model: "onnx-community/Florence-2-base",
    tasks: taskEnum,
  };
}

app.get("/health", healthRouteOptions, reportHealth);
/**
 * POST /caption — runs one task on a single uploaded image.
 *
 * Multipart form fields:
 *   file        the image (required)
 *   task        one of taskEnum; defaults to "caption"
 *   text        optional text forwarded to generateCaption; defaults to null
 *   max_tokens  positive integer forwarded to generateCaption; defaults to 100
 *
 * Responds 200 with { task, result }, or 400 with { error } when the file is
 * missing, the task is unknown, or max_tokens is not a positive integer.
 */
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    // Drain the file first. Multipart parts are parsed in stream order, so
    // data.fields only holds the fields seen so far; @fastify/multipart advises
    // reading it after the file has been consumed so fields placed after the
    // file in the form are not silently ignored.
    const buffer = await data.toBuffer();

    const { fields } = data;
    const task = fields.task?.value || "caption";
    const textInput = fields.text?.value || null;
    const maxTokens = Number.parseInt(fields.max_tokens?.value || "100", 10);

    // Match against the own keys of TASKS: a plain TASKS[task] lookup would also
    // accept inherited names such as "constructor" or "toString".
    if (!taskEnum.includes(task)) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    // Reject non-numeric or non-positive values instead of passing NaN along.
    if (!Number.isInteger(maxTokens) || maxTokens < 1) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    const result = await generateCaption(buffer, task, textInput, maxTokens);
    return { task, result };
  }
);
/**
 * POST /caption/batch — runs the same task on every uploaded image.
 *
 * Multipart form parts:
 *   file parts  one or more images (at least one required)
 *   task        one of taskEnum; defaults to "caption"
 *   text        optional text forwarded to generateCaption; defaults to null
 *   max_tokens  positive integer forwarded to generateCaption; defaults to 100
 *
 * Responds 200 with { results: [{ filename, task, result }, ...] } in upload
 * order, or 400 with { error } when no file was sent, the task is unknown, or
 * max_tokens is not a positive integer.
 */
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. All images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    // req.parts() hands back an async iterator, so it is not awaited.
    const parts = req.parts();
    const files = [];
    let task = "caption";
    let textInput = null;
    let maxTokens = 100;

    for await (const part of parts) {
      if (part.type === "file") {
        // Buffer each file as it is encountered, before moving to the next part.
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
      } else if (part.fieldname === "task") {
        // Empty values fall back to the defaults, matching POST /caption.
        task = part.value || "caption";
      } else if (part.fieldname === "text") {
        textInput = part.value || null;
      } else if (part.fieldname === "max_tokens") {
        maxTokens = Number.parseInt(part.value || "100", 10);
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }

    // Match against the own keys of TASKS: a plain TASKS[task] lookup would also
    // accept inherited names such as "constructor" or "toString".
    if (!taskEnum.includes(task)) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    // Reject non-numeric or non-positive values instead of passing NaN along.
    if (!Number.isInteger(maxTokens) || maxTokens < 1) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    // Images are processed one after another, in upload order.
    const results = [];
    for (const { filename, buffer } of files) {
      const result = await generateCaption(buffer, task, textInput, maxTokens);
      results.push({ filename, task, result });
    }
    return { results };
  }
);
// --- Start ---

// Port used when the PORT environment variable is unset or empty.
const DEFAULT_PORT = 7860;
const PORT = process.env.PORT || DEFAULT_PORT;

/** Listen callback: on a startup error, log it and exit with status 1. */
const handleListenResult = (err) => {
  if (!err) {
    return;
  }
  app.log.error(err);
  process.exit(1);
};

// The model is loaded before the server starts accepting connections.
await loadModel();

// Bind on all interfaces.
app.listen({ host: "0.0.0.0", port: PORT }, handleListenResult);