Spaces:
Sleeping
Sleeping
Commit ·
9a16713
1
Parent(s): ba547b6
feat: added
Browse files- .dockerignore +3 -0
- Dockerfile +20 -0
- docker-compose.yml +8 -0
- package.json +19 -0
- src/model.js +72 -0
- src/server.js +197 -0
.dockerignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
node_modules
|
| 2 |
+
.git
|
| 3 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Runtime image for the img3txt captioning API.
FROM node:22-slim

WORKDIR /app

# sharp needs these
# libvips is sharp's native image-processing backend; without it the npm
# install of sharp falls back to a prebuilt binary that may not match slim.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libvips-dev && \
    rm -rf /var/lib/apt/lists/*

# Copy only the manifest first so the npm layer is cached until deps change.
# NOTE(review): no package-lock.json is copied — builds are not fully
# reproducible; consider committing a lockfile and switching to `npm ci`.
COPY package.json ./
RUN npm install --omit=dev

COPY src/ src/

# Download model at build time so container starts fast
# (loadModel() pulls the ONNX weights into the transformers cache inside the
# image; a rejected promise fails the build with a non-zero exit, as desired).
RUN node -e "import('./src/model.js').then(m => m.loadModel()).then(() => process.exit(0))"

EXPOSE 3000

CMD ["node", "src/server.js"]
docker-compose.yml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Single-service compose file for local deployment of the captioning API.
services:
  img3txt:
    build: .
    ports:
      # Host 3000 -> container 3000 (must match PORT below and EXPOSE in the Dockerfile).
      - "3000:3000"
    environment:
      - PORT=3000
    # Restart on crash/OOM but stay down if stopped manually.
    restart: unless-stopped
package.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "img3txt",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Image captioning API using Florence-2 ONNX model",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"start": "node src/server.js",
|
| 8 |
+
"dev": "node --watch src/server.js"
|
| 9 |
+
},
|
| 10 |
+
"dependencies": {
|
| 11 |
+
"@huggingface/transformers": "^3.4.1",
|
| 12 |
+
"fastify": "^5.2.1",
|
| 13 |
+
"@fastify/multipart": "^9.0.3",
|
| 14 |
+
"@fastify/swagger": "^9.4.2",
|
| 15 |
+
"@fastify/swagger-ui": "^5.2.1",
|
| 16 |
+
"@fastify/cors": "^10.0.2",
|
| 17 |
+
"sharp": "^0.33.5"
|
| 18 |
+
}
|
| 19 |
+
}
|
src/model.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  RawImage,
} from "@huggingface/transformers";

const MODEL_ID = "onnx-community/Florence-2-base";

// Memoized in-flight/completed load. Caching the *promise* (rather than the
// resolved model) makes concurrent callers share a single download/load
// instead of each kicking off their own (the previous `if (!model)` check
// raced: two early callers both saw null and both loaded the model).
let loadPromise = null;

/** Supported Florence-2 task tokens */
export const TASKS = {
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
};

/**
 * Load the Florence-2 model and its processor, once.
 *
 * Safe to call concurrently: all callers await the same promise. A failed
 * load clears the cache so a later call can retry.
 *
 * @returns {Promise<{model: object, processor: object}>} Loaded model + processor.
 */
export async function loadModel() {
  if (!loadPromise) {
    loadPromise = (async () => {
      console.log("Loading Florence-2 model...");
      const model = await Florence2ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        {
          dtype: "fp32",
        }
      );
      const processor = await AutoProcessor.from_pretrained(MODEL_ID);
      console.log("Model loaded.");
      return { model, processor };
    })().catch((err) => {
      // Reset so the next call retries instead of rejecting forever.
      loadPromise = null;
      throw err;
    });
  }
  return loadPromise;
}
/**
 * Run a Florence-2 task against an image and return the parsed output.
 *
 * @param {Buffer} imageBuffer - Raw image bytes
 * @param {string} task - One of the TASKS keys (default: "caption")
 * @param {string|null} textInput - Optional extra text input for the task
 * @param {number} maxTokens - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model, processor } = await loadModel();

  const image = await RawImage.fromBlob(new Blob([imageBuffer]));

  // Unknown task names silently fall back to plain captioning.
  const taskToken = TASKS[task] || TASKS.caption;
  const prompt = textInput ? `${taskToken}${textInput}` : taskToken;

  const inputs = await processor(image, processor.construct_prompts(prompt));

  const outputIds = await model.generate({
    ...inputs,
    max_new_tokens: maxTokens,
  });

  // Keep special tokens: Florence-2 post-processing relies on the task
  // markers still being present in the decoded string.
  const [decoded] = processor.batch_decode(outputIds, {
    skip_special_tokens: false,
  });

  return processor.post_process_generation(decoded, taskToken, image.size);
}
src/server.js
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// HTTP entry point for img3txt: Fastify app exposing caption/OCR/detection
// endpoints backed by the Florence-2 model in ./model.js.
import Fastify from "fastify";
import multipart from "@fastify/multipart";
import swagger from "@fastify/swagger";
import swaggerUi from "@fastify/swagger-ui";
import cors from "@fastify/cors";
import { generateCaption, loadModel, TASKS } from "./model.js";

const app = Fastify({ logger: true });

// --- Plugins ---
// CORS registered with no options: the plugin's defaults apply (all origins).
await app.register(cors);
// Uploads capped at 20 MiB per file; larger files are rejected by the plugin.
await app.register(multipart, { limits: { fileSize: 20 * 1024 * 1024 } });

// OpenAPI document; served interactively by swagger-ui below.
await app.register(swagger, {
  openapi: {
    info: {
      title: "img3txt — Image Captioning API",
      description:
        "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
      version: "1.0.0",
    },
    servers: [{ url: "http://localhost:3000" }],
    tags: [
      { name: "caption", description: "Image captioning endpoints" },
      { name: "health", description: "Health check" },
    ],
  },
});

// Swagger UI mounted at /docs.
await app.register(swaggerUi, {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
});
// --- Schemas ---
// Task names accepted by the API (keys of TASKS, e.g. "caption", "ocr").
const taskEnum = Object.keys(TASKS);

// Response body for POST /caption. `result` shape varies per task
// (plain text for captions, boxes/labels for detection), hence
// additionalProperties.
const captionResponseSchema = {
  type: "object",
  properties: {
    task: { type: "string", example: "caption" },
    result: { type: "object", additionalProperties: true },
  },
};

// Response body for POST /caption/batch: one entry per uploaded file.
const batchResponseSchema = {
  type: "object",
  properties: {
    results: {
      type: "array",
      items: {
        type: "object",
        properties: {
          filename: { type: "string" },
          task: { type: "string" },
          result: { type: "object", additionalProperties: true },
        },
      },
    },
  },
};

// Shared 400 error body.
const errorSchema = {
  type: "object",
  properties: {
    error: { type: "string" },
  },
};
// --- Routes ---

// Schema for GET /health: reports service status, model id, and task list.
const healthSchema = {
  tags: ["health"],
  summary: "Health check",
  response: {
    200: {
      type: "object",
      properties: {
        status: { type: "string", example: "ok" },
        model: { type: "string" },
        tasks: { type: "array", items: { type: "string" } },
      },
    },
  },
};

// Liveness probe; also advertises the supported task names.
app.get("/health", { schema: healthSchema }, async () => ({
  status: "ok",
  model: "onnx-community/Florence-2-base",
  tasks: taskEnum,
}));
// POST /caption — caption/OCR/detect a single uploaded image.
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    // Optional form fields alongside the file part.
    const task = data.fields.task?.value || "caption";
    const textInput = data.fields.text?.value || null;
    // FIX: parseInt on a non-numeric max_tokens yields NaN, which previously
    // flowed straight into model.generate as max_new_tokens. Validate below.
    const maxTokens = Number.parseInt(data.fields.max_tokens?.value || "100", 10);

    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }
    if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    const buffer = await data.toBuffer();
    const result = await generateCaption(buffer, task, textInput, maxTokens);

    return { task, result };
  }
);
// POST /caption/batch — caption multiple images with shared settings.
// NOTE(review): all files are buffered in memory before inference; with the
// 20 MiB per-file limit a large batch can still be memory-heavy.
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. All images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    // FIX: req.parts() returns an async iterator, not a promise — no await.
    const parts = req.parts();
    const files = [];
    let task = "caption";
    let textInput = null;
    let maxTokens = 100;

    // Collect file parts and the shared scalar fields in one pass.
    for await (const part of parts) {
      if (part.type === "file") {
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
      } else if (part.fieldname === "task") {
        task = part.value;
      } else if (part.fieldname === "text") {
        textInput = part.value;
      } else if (part.fieldname === "max_tokens") {
        maxTokens = Number.parseInt(part.value, 10);
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }
    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }
    // FIX: a non-numeric max_tokens field parsed to NaN and was previously
    // passed to model.generate unchecked.
    if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
      return reply
        .code(400)
        .send({ error: "max_tokens must be a positive integer" });
    }

    // Sequential on purpose: a single in-process model; parallel generate
    // calls would contend for the same ONNX session.
    const results = [];
    for (const f of files) {
      const result = await generateCaption(f.buffer, task, textInput, maxTokens);
      results.push({ filename: f.filename, task, result });
    }

    return { results };
  }
);
// --- Start ---
// FIX: process.env.PORT is a string; coerce to a number for listen().
const PORT = Number(process.env.PORT) || 3000;

// Pre-load model then start server so the first request isn't slow.
// FIX: a loadModel() rejection previously bypassed the error handling in the
// listen callback entirely; handle both failure modes uniformly.
try {
  await loadModel();
  await app.listen({ host: "0.0.0.0", port: PORT });
} catch (err) {
  app.log.error(err);
  process.exit(1);
}