import Fastify from "fastify";
import multipart from "@fastify/multipart";
import swagger from "@fastify/swagger";
import swaggerUi from "@fastify/swagger-ui";
import cors from "@fastify/cors";
import { generateCaption, loadModel, TASKS } from "./model.js";

/**
 * img3txt HTTP server.
 *
 * Exposes a small Fastify API around the captioning helpers imported from
 * ./model.js:
 *   GET  /               landing page
 *   GET  /health         health check + list of supported tasks
 *   POST /caption        run one task on a single uploaded image
 *   POST /caption/batch  run one task on several uploaded images
 *   GET  /docs           Swagger UI
 */

const app = Fastify({ logger: true });

// --- Plugins ---

await app.register(cors);

// Uploaded files are capped at 20 MiB each.
await app.register(multipart, { limits: { fileSize: 20 * 1024 * 1024 } });

await app.register(swagger, {
  openapi: {
    info: {
      title: "img3txt — Image Captioning API",
      description:
        "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
      version: "1.0.0",
    },
    servers: [{ url: "/" }],
    tags: [
      { name: "caption", description: "Image captioning endpoints" },
      { name: "health", description: "Health check" },
    ],
  },
});

await app.register(swaggerUi, {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
});

// --- Schemas ---

// Names of the tasks advertised to clients (the keys of TASKS).
const taskEnum = Object.keys(TASKS);

const captionResponseSchema = {
  type: "object",
  properties: {
    task: { type: "string", example: "caption" },
    result: { type: "object", additionalProperties: true },
  },
};

const batchResponseSchema = {
  type: "object",
  properties: {
    results: {
      type: "array",
      items: {
        type: "object",
        properties: {
          filename: { type: "string" },
          task: { type: "string" },
          result: { type: "object", additionalProperties: true },
        },
      },
    },
  },
};

const errorSchema = {
  type: "object",
  properties: {
    error: { type: "string" },
  },
};

// --- Request option handling ---

// Defaults applied when a form field is missing or empty.
const DEFAULT_TASK = "caption";
const DEFAULT_MAX_TOKENS = 100;

/**
 * Normalise and validate the form fields shared by both caption routes.
 *
 * @param {object} raw - Raw form field values (each may be undefined).
 * @param {string} [raw.task] - Requested task name; defaults to "caption".
 * @param {string} [raw.text] - Optional text input; empty becomes null.
 * @param {string} [raw.maxTokens] - Token limit as sent by the client;
 *   defaults to 100.
 * @returns {{ error: string } | { task: string, textInput: (string|null), maxTokens: number }}
 *   Either an `error` message suitable for a 400 response, or the resolved
 *   options to pass to generateCaption.
 */
function resolveOptions({ task, text, maxTokens }) {
  const resolvedTask = task || DEFAULT_TASK;

  // Require an *own* key: a bare `TASKS[name]` lookup would also accept
  // inherited names such as "constructor" or "toString". The truthiness
  // check is kept so nothing the original lookup rejected is now accepted.
  const isKnownTask =
    Object.hasOwn(TASKS, resolvedTask) && Boolean(TASKS[resolvedTask]);
  if (!isKnownTask) {
    return { error: `Invalid task. Choose from: ${taskEnum.join(", ")}` };
  }

  // Reject NaN, zero and negative limits instead of forwarding them to the
  // model call.
  const parsedMaxTokens = Number.parseInt(
    maxTokens || String(DEFAULT_MAX_TOKENS),
    10
  );
  if (!Number.isInteger(parsedMaxTokens) || parsedMaxTokens < 1) {
    return { error: "Invalid max_tokens. Must be a positive integer." };
  }

  return {
    task: resolvedTask,
    textInput: text || null,
    maxTokens: parsedMaxTokens,
  };
}

// --- Routes ---

// Landing page — HF Spaces iframe shows this
app.get(
  "/",
  { schema: { hide: true } },
  async (req, reply) => {
    // Return the reply so the async handler signals that the response has
    // already been sent.
    return reply.type("text/html").send(` img3txt — Florence-2 Image Captioning API

img3txt

Image captioning, OCR & object detection powered by Florence-2 (ONNX)

Swagger UI Health Check

POST /caption with form fields:

`);
  }
);

// Health check: reports a fixed status, the model identifier and task names.
app.get(
  "/health",
  {
    schema: {
      tags: ["health"],
      summary: "Health check",
      response: {
        200: {
          type: "object",
          properties: {
            status: { type: "string", example: "ok" },
            model: { type: "string" },
            tasks: { type: "array", items: { type: "string" } },
          },
        },
      },
    },
  },
  async () => ({
    status: "ok",
    model: "onnx-community/Florence-2-base",
    tasks: taskEnum,
  })
);

// Single image: reads the first uploaded file plus the optional
// task / text / max_tokens form fields attached to it.
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    const options = resolveOptions({
      task: data.fields.task?.value,
      text: data.fields.text?.value,
      maxTokens: data.fields.max_tokens?.value,
    });
    if (options.error) {
      return reply.code(400).send({ error: options.error });
    }

    const { task, textInput, maxTokens } = options;
    const buffer = await data.toBuffer();
    const result = await generateCaption(buffer, task, textInput, maxTokens);
    return { task, result };
  }
);

// Batch: every uploaded file is processed with the same task and settings.
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. \nAll images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const files = [];
    const rawFields = {};

    // Walk the multipart stream once: buffer each file and remember the
    // last value seen for each recognised form field. Other fields are
    // ignored.
    for await (const part of req.parts()) {
      if (part.type === "file") {
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
        continue;
      }
      switch (part.fieldname) {
        case "task":
          rawFields.task = part.value;
          break;
        case "text":
          rawFields.text = part.value;
          break;
        case "max_tokens":
          rawFields.maxTokens = part.value;
          break;
        default:
          break;
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }

    const options = resolveOptions(rawFields);
    if (options.error) {
      return reply.code(400).send({ error: options.error });
    }

    const { task, textInput, maxTokens } = options;

    // Images are processed one at a time, in upload order.
    const results = [];
    for (const { filename, buffer } of files) {
      const result = await generateCaption(buffer, task, textInput, maxTokens);
      results.push({ filename, task, result });
    }
    return { results };
  }
);

// --- Start ---

// Environment variables are strings; hand listen() a numeric port.
const PORT = Number(process.env.PORT || 7860);

// Pre-load model then start server. Any failure is logged and the process
// exits with a non-zero status.
try {
  await loadModel();
  await app.listen({ host: "0.0.0.0", port: PORT });
} catch (err) {
  app.log.error(err);
  process.exit(1);
}