Spaces:
Sleeping
Sleeping
Commit ·
9a16713
1
Parent(s): ba547b6
feat: added
Browse files- .dockerignore +3 -0
- Dockerfile +20 -0
- docker-compose.yml +8 -0
- package.json +19 -0
- src/model.js +72 -0
- src/server.js +197 -0
.dockerignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
node_modules
|
| 2 |
+
.git
|
| 3 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Runtime image for the img3txt captioning API.
FROM node:22-slim

WORKDIR /app

# sharp needs these
# libvips is sharp's native image-processing backend; without it the npm
# install of sharp falls back to a prebuilt binary that may not match slim.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libvips-dev && \
    rm -rf /var/lib/apt/lists/*

# Copy only the manifest first so the npm layer is cached until deps change.
# NOTE(review): no package-lock.json is copied — builds are not fully
# reproducible; consider committing a lockfile and switching to `npm ci`.
COPY package.json ./
RUN npm install --omit=dev

COPY src/ src/

# Download model at build time so container starts fast
# (loadModel() pulls the ONNX weights into the transformers cache inside the
# image; a rejected promise fails the build with a non-zero exit, as desired).
RUN node -e "import('./src/model.js').then(m => m.loadModel()).then(() => process.exit(0))"

EXPOSE 3000

CMD ["node", "src/server.js"]
docker-compose.yml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Single-service compose file for local deployment of the captioning API.
services:
  img3txt:
    build: .
    ports:
      # Host 3000 -> container 3000 (must match PORT below and EXPOSE in the Dockerfile).
      - "3000:3000"
    environment:
      - PORT=3000
    # Restart on crash/OOM but stay down if stopped manually.
    restart: unless-stopped
package.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "img3txt",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Image captioning API using Florence-2 ONNX model",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"start": "node src/server.js",
|
| 8 |
+
"dev": "node --watch src/server.js"
|
| 9 |
+
},
|
| 10 |
+
"dependencies": {
|
| 11 |
+
"@huggingface/transformers": "^3.4.1",
|
| 12 |
+
"fastify": "^5.2.1",
|
| 13 |
+
"@fastify/multipart": "^9.0.3",
|
| 14 |
+
"@fastify/swagger": "^9.4.2",
|
| 15 |
+
"@fastify/swagger-ui": "^5.2.1",
|
| 16 |
+
"@fastify/cors": "^10.0.2",
|
| 17 |
+
"sharp": "^0.33.5"
|
| 18 |
+
}
|
| 19 |
+
}
|
src/model.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  RawImage,
} from "@huggingface/transformers";

const MODEL_ID = "onnx-community/Florence-2-base";

// Memoized in-flight/completed load. Caching the *promise* (rather than the
// resolved model) makes concurrent callers share a single download/load
// instead of each kicking off their own (the previous `if (!model)` check
// raced: two early callers both saw null and both loaded the model).
let loadPromise = null;

/** Supported Florence-2 task tokens */
export const TASKS = {
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
};

/**
 * Load the Florence-2 model and its processor, once.
 *
 * Safe to call concurrently: all callers await the same promise. A failed
 * load clears the cache so a later call can retry.
 *
 * @returns {Promise<{model: object, processor: object}>} Loaded model + processor.
 */
export async function loadModel() {
  if (!loadPromise) {
    loadPromise = (async () => {
      console.log("Loading Florence-2 model...");
      const model = await Florence2ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        {
          dtype: "fp32",
        }
      );
      const processor = await AutoProcessor.from_pretrained(MODEL_ID);
      console.log("Model loaded.");
      return { model, processor };
    })().catch((err) => {
      // Reset so the next call retries instead of rejecting forever.
      loadPromise = null;
      throw err;
    });
  }
  return loadPromise;
}
/**
 * Run a Florence-2 task against an image and return the parsed output.
 *
 * @param {Buffer} imageBuffer - Raw image bytes
 * @param {string} task - One of the TASKS keys (default: "caption")
 * @param {string|null} textInput - Optional extra text input for the task
 * @param {number} maxTokens - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model, processor } = await loadModel();

  const image = await RawImage.fromBlob(new Blob([imageBuffer]));

  // Unknown task names silently fall back to plain captioning.
  const taskToken = TASKS[task] || TASKS.caption;
  const prompt = textInput ? `${taskToken}${textInput}` : taskToken;

  const inputs = await processor(image, processor.construct_prompts(prompt));

  const outputIds = await model.generate({
    ...inputs,
    max_new_tokens: maxTokens,
  });

  // Keep special tokens: Florence-2 post-processing relies on the task
  // markers still being present in the decoded string.
  const [decoded] = processor.batch_decode(outputIds, {
    skip_special_tokens: false,
  });

  return processor.post_process_generation(decoded, taskToken, image.size);
}
src/server.js
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// HTTP entry point for img3txt: Fastify app exposing caption/OCR/detection
// endpoints backed by the Florence-2 model in ./model.js.
import Fastify from "fastify";
import multipart from "@fastify/multipart";
import swagger from "@fastify/swagger";
import swaggerUi from "@fastify/swagger-ui";
import cors from "@fastify/cors";
import { generateCaption, loadModel, TASKS } from "./model.js";

const app = Fastify({ logger: true });

// --- Plugins ---
// CORS registered with no options: the plugin's defaults apply (all origins).
await app.register(cors);
// Uploads capped at 20 MiB per file; larger files are rejected by the plugin.
await app.register(multipart, { limits: { fileSize: 20 * 1024 * 1024 } });

// OpenAPI document; served interactively by swagger-ui below.
await app.register(swagger, {
  openapi: {
    info: {
      title: "img3txt — Image Captioning API",
      description:
        "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
      version: "1.0.0",
    },
    servers: [{ url: "http://localhost:3000" }],
    tags: [
      { name: "caption", description: "Image captioning endpoints" },
      { name: "health", description: "Health check" },
    ],
  },
});

// Swagger UI mounted at /docs.
await app.register(swaggerUi, {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
});
// --- Schemas ---
// Task names accepted by the API (keys of TASKS, e.g. "caption", "ocr").
const taskEnum = Object.keys(TASKS);

// Response body for POST /caption. `result` shape varies per task
// (plain text for captions, boxes/labels for detection), hence
// additionalProperties.
const captionResponseSchema = {
  type: "object",
  properties: {
    task: { type: "string", example: "caption" },
    result: { type: "object", additionalProperties: true },
  },
};

// Response body for POST /caption/batch: one entry per uploaded file.
const batchResponseSchema = {
  type: "object",
  properties: {
    results: {
      type: "array",
      items: {
        type: "object",
        properties: {
          filename: { type: "string" },
          task: { type: "string" },
          result: { type: "object", additionalProperties: true },
        },
      },
    },
  },
};

// Shared 400 error body.
const errorSchema = {
  type: "object",
  properties: {
    error: { type: "string" },
  },
};
// --- Routes ---

// Schema for GET /health: reports service status, model id, and task list.
const healthSchema = {
  tags: ["health"],
  summary: "Health check",
  response: {
    200: {
      type: "object",
      properties: {
        status: { type: "string", example: "ok" },
        model: { type: "string" },
        tasks: { type: "array", items: { type: "string" } },
      },
    },
  },
};

// Liveness probe; also advertises the supported task names.
app.get("/health", { schema: healthSchema }, async () => ({
  status: "ok",
  model: "onnx-community/Florence-2-base",
  tasks: taskEnum,
}));
// POST /caption — caption/OCR/detect a single uploaded image.
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    // Optional form fields alongside the file part.
    const task = data.fields.task?.value || "caption";
    const textInput = data.fields.text?.value || null;
    // FIX: parseInt on a non-numeric max_tokens yields NaN, which previously
    // flowed straight into model.generate as max_new_tokens. Validate below.
    const maxTokens = Number.parseInt(data.fields.max_tokens?.value || "100", 10);

    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }
    if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    const buffer = await data.toBuffer();
    const result = await generateCaption(buffer, task, textInput, maxTokens);

    return { task, result };
  }
);
// POST /caption/batch — caption multiple images with shared settings.
// NOTE(review): all files are buffered in memory before inference; with the
// 20 MiB per-file limit a large batch can still be memory-heavy.
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. All images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    // FIX: req.parts() returns an async iterator, not a promise — no await.
    const parts = req.parts();
    const files = [];
    let task = "caption";
    let textInput = null;
    let maxTokens = 100;

    // Collect file parts and the shared scalar fields in one pass.
    for await (const part of parts) {
      if (part.type === "file") {
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
      } else if (part.fieldname === "task") {
        task = part.value;
      } else if (part.fieldname === "text") {
        textInput = part.value;
      } else if (part.fieldname === "max_tokens") {
        maxTokens = Number.parseInt(part.value, 10);
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }
    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }
    // FIX: a non-numeric max_tokens field parsed to NaN and was previously
    // passed to model.generate unchecked.
    if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    // Sequential on purpose: a single in-process model; parallel generate
    // calls would contend for the same ONNX session.
    const results = [];
    for (const f of files) {
      const result = await generateCaption(f.buffer, task, textInput, maxTokens);
      results.push({ filename: f.filename, task, result });
    }

    return { results };
  }
);
// --- Start ---
// FIX: process.env.PORT is a string; coerce to a number for listen().
const PORT = Number(process.env.PORT) || 3000;

// Pre-load model then start server so the first request isn't slow.
// FIX: a loadModel() rejection previously bypassed the error handling in the
// listen callback entirely; handle both failure modes uniformly.
try {
  await loadModel();
  await app.listen({ host: "0.0.0.0", port: PORT });
} catch (err) {
  app.log.error(err);
  process.exit(1);
}