khushalcodiste committed on
Commit
9a16713
·
1 Parent(s): ba547b6

feat: added

Browse files
Files changed (6) hide show
  1. .dockerignore +3 -0
  2. Dockerfile +20 -0
  3. docker-compose.yml +8 -0
  4. package.json +19 -0
  5. src/model.js +72 -0
  6. src/server.js +197 -0
.dockerignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ node_modules
2
+ .git
3
+ .env
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM node:22-slim

WORKDIR /app

# Dependencies are installed with --omit=dev below; also run Node and any
# framework code in production mode at runtime.
ENV NODE_ENV=production

# sharp needs these native libvips libraries
RUN apt-get update && \
    apt-get install -y --no-install-recommends libvips-dev && \
    rm -rf /var/lib/apt/lists/*

# Copy the manifest first so the npm install layer is cached across source edits.
COPY package.json ./
RUN npm install --omit=dev

COPY src/ src/

# Download model at build time so container starts fast
RUN node -e "import('./src/model.js').then(m => m.loadModel()).then(() => process.exit(0))"

EXPOSE 3000

CMD ["node", "src/server.js"]
docker-compose.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
services:
  img3txt:
    build: .
    ports:
      - "3000:3000"          # host:container — src/server.js listens on PORT
    environment:
      - PORT=3000            # read by src/server.js (falls back to 3000)
    restart: unless-stopped  # restart on crash, but not after a manual stop
package.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "img3txt",
3
+ "version": "1.0.0",
4
+ "description": "Image captioning API using Florence-2 ONNX model",
5
+ "type": "module",
6
+ "scripts": {
7
+ "start": "node src/server.js",
8
+ "dev": "node --watch src/server.js"
9
+ },
10
+ "dependencies": {
11
+ "@huggingface/transformers": "^3.4.1",
12
+ "fastify": "^5.2.1",
13
+ "@fastify/multipart": "^9.0.3",
14
+ "@fastify/swagger": "^9.4.2",
15
+ "@fastify/swagger-ui": "^5.2.1",
16
+ "@fastify/cors": "^10.0.2",
17
+ "sharp": "^0.33.5"
18
+ }
19
+ }
src/model.js ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ Florence2ForConditionalGeneration,
3
+ AutoProcessor,
4
+ RawImage,
5
+ } from "@huggingface/transformers";
// Hugging Face model id; weights are downloaded (and cached) on first load.
const MODEL_ID = "onnx-community/Florence-2-base";

// Lazily-initialized singletons shared by every request — see loadModel().
let model = null;
let processor = null;

/** Supported Florence-2 task tokens (API task name -> prompt token). */
export const TASKS = {
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
};
23
+
/**
 * Load the Florence-2 model and processor once and reuse them.
 *
 * The in-flight promise is memoized on the function itself so that
 * concurrent callers share a single load instead of racing the
 * `if (!model)` check and downloading the model twice. On failure the
 * memo is cleared so a later call can retry (the original left the
 * module half-initialized if the processor load threw).
 *
 * @returns {Promise<{model: object, processor: object}>}
 */
export async function loadModel() {
  loadModel._pending ??= (async () => {
    console.log("Loading Florence-2 model...");
    model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
      dtype: "fp32",
    });
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    console.log("Model loaded.");
    return { model, processor };
  })();
  try {
    return await loadModel._pending;
  } catch (err) {
    // Allow a retry on the next call instead of caching the failure.
    loadModel._pending = null;
    throw err;
  }
}
35
+
/**
 * Generate text from an image buffer.
 * @param {Buffer} imageBuffer - Raw image bytes
 * @param {string} task - One of the TASKS keys (default: "caption"; unknown
 *   values fall back to "caption")
 * @param {string|null} textInput - Optional extra text input for the task
 * @param {number} maxTokens - Max new tokens to generate; non-finite or
 *   non-positive values fall back to 100
 * @returns {Promise<object>} Parsed result from Florence-2
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model: m, processor: p } = await loadModel();

  const image = await RawImage.fromBlob(new Blob([imageBuffer]));

  const taskToken = TASKS[task] || TASKS.caption;
  const prompt = textInput ? taskToken + textInput : taskToken;

  const prompts = p.construct_prompts(prompt);
  const inputs = await p(image, prompts);

  // Callers derive maxTokens from parseInt of user input, which can be NaN
  // (or negative); guard so generation never receives a bad budget.
  const maxNewTokens =
    Number.isFinite(maxTokens) && maxTokens > 0 ? maxTokens : 100;

  const generatedIds = await m.generate({
    ...inputs,
    max_new_tokens: maxNewTokens,
  });

  // Keep special tokens: post_process_generation needs the task markers.
  const generatedText = p.batch_decode(generatedIds, {
    skip_special_tokens: false,
  })[0];

  const result = p.post_process_generation(generatedText, taskToken, image.size);

  return result;
}
src/server.js ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Fastify from "fastify";
2
+ import multipart from "@fastify/multipart";
3
+ import swagger from "@fastify/swagger";
4
+ import swaggerUi from "@fastify/swagger-ui";
5
+ import cors from "@fastify/cors";
6
+ import { generateCaption, loadModel, TASKS } from "./model.js";
7
+
const app = Fastify({ logger: true });

// --- Plugins ---
// CORS with default (allow-all) settings so browser clients can call the API.
await app.register(cors);
// Multipart uploads, capped at 20 MB per file.
await app.register(multipart, { limits: { fileSize: 20 * 1024 * 1024 } });

// OpenAPI spec generation for the routes registered below.
await app.register(swagger, {
  openapi: {
    info: {
      title: "img3txt — Image Captioning API",
      description:
        "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
      version: "1.0.0",
    },
    servers: [{ url: "http://localhost:3000" }],
    tags: [
      { name: "caption", description: "Image captioning endpoints" },
      { name: "health", description: "Health check" },
    ],
  },
});

// Interactive Swagger UI served at /docs.
await app.register(swaggerUi, {
  routePrefix: "/docs",
  uiConfig: { docExpansion: "list", deepLinking: true },
});
34
+
// --- Schemas ---
// Valid task names exposed in docs and used for request validation below.
const taskEnum = Object.keys(TASKS);

// Response body for the single-image /caption endpoint.
const captionResponseSchema = {
  type: "object",
  properties: {
    task: { type: "string", example: "caption" },
    // Florence-2 result shape varies per task, so it stays open-ended.
    result: { type: "object", additionalProperties: true },
  },
};

// Response body for /caption/batch: one entry per uploaded file.
const batchResponseSchema = {
  type: "object",
  properties: {
    results: {
      type: "array",
      items: {
        type: "object",
        properties: {
          filename: { type: "string" },
          task: { type: "string" },
          result: { type: "object", additionalProperties: true },
        },
      },
    },
  },
};

// Shared 400 error body.
const errorSchema = {
  type: "object",
  properties: {
    error: { type: "string" },
  },
};
69
+
70
+ // --- Routes ---
71
+
// Route schema kept as a named constant so the registration reads compactly.
const healthRouteSchema = {
  tags: ["health"],
  summary: "Health check",
  response: {
    200: {
      type: "object",
      properties: {
        status: { type: "string", example: "ok" },
        model: { type: "string" },
        tasks: { type: "array", items: { type: "string" } },
      },
    },
  },
};

// Liveness probe: static payload reporting the model id and supported tasks.
app.get("/health", { schema: healthRouteSchema }, async () => ({
  status: "ok",
  model: "onnx-community/Florence-2-base",
  tasks: taskEnum,
}));
96
+
// Single-image captioning endpoint. Accepts one multipart file plus optional
// "task", "text" and "max_tokens" form fields.
app.post(
  "/caption",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate caption / OCR / detection for a single image",
      description: `Upload an image as multipart form data. Supported tasks: ${taskEnum.join(", ")}`,
      consumes: ["multipart/form-data"],
      response: {
        200: captionResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    const data = await req.file();
    if (!data) {
      return reply.code(400).send({ error: "No file uploaded" });
    }

    const task = data.fields.task?.value || "caption";
    const textInput = data.fields.text?.value || null;
    // parseInt of arbitrary client input can yield NaN (e.g. "abc");
    // fall back to the default budget instead of passing NaN downstream.
    const parsedTokens = Number.parseInt(data.fields.max_tokens?.value ?? "100", 10);
    const maxTokens =
      Number.isFinite(parsedTokens) && parsedTokens > 0 ? parsedTokens : 100;

    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    const buffer = await data.toBuffer();
    const result = await generateCaption(buffer, task, textInput, maxTokens);

    return { task, result };
  }
);
133
+
// Batch captioning endpoint: multiple files share one task/text/max_tokens.
// NOTE(review): all files are buffered in memory before processing — fine for
// the 20 MB-per-file multipart limit, but worth revisiting for large batches.
app.post(
  "/caption/batch",
  {
    schema: {
      tags: ["caption"],
      summary: "Generate captions for multiple images",
      description:
        "Upload multiple images as multipart form data. All images share the same task and settings.",
      consumes: ["multipart/form-data"],
      response: {
        200: batchResponseSchema,
        400: errorSchema,
      },
    },
  },
  async (req, reply) => {
    // req.parts() returns an async iterator, not a promise — no await needed.
    const parts = req.parts();
    const files = [];
    let task = "caption";
    let textInput = null;
    let maxTokens = 100;

    for await (const part of parts) {
      if (part.type === "file") {
        files.push({ filename: part.filename, buffer: await part.toBuffer() });
      } else if (part.fieldname === "task") {
        task = part.value;
      } else if (part.fieldname === "text") {
        textInput = part.value;
      } else if (part.fieldname === "max_tokens") {
        // Guard against NaN/non-positive values from malformed client input.
        const parsed = Number.parseInt(part.value, 10);
        if (Number.isFinite(parsed) && parsed > 0) {
          maxTokens = parsed;
        }
      }
    }

    if (files.length === 0) {
      return reply.code(400).send({ error: "No files uploaded" });
    }
    if (!TASKS[task]) {
      return reply
        .code(400)
        .send({ error: `Invalid task. Choose from: ${taskEnum.join(", ")}` });
    }

    // Process sequentially: the model instance handles one generation at a time.
    const results = [];
    for (const f of files) {
      const result = await generateCaption(f.buffer, task, textInput, maxTokens);
      results.push({ filename: f.filename, task, result });
    }

    return { results };
  }
);
186
+
// --- Start ---
// process.env values are strings; coerce to a numeric port (default 3000).
const PORT = Number.parseInt(process.env.PORT ?? "3000", 10) || 3000;

// Pre-load model then start server so the first request isn't slow.
await loadModel();
try {
  await app.listen({ host: "0.0.0.0", port: PORT });
} catch (err) {
  app.log.error(err);
  process.exit(1);
}