khushalcodiste committed on
Commit
b02d5c5
·
1 Parent(s): c356756

feat: added

Browse files
Files changed (4) hide show
  1. README.md +2 -0
  2. docker-compose.yml +3 -0
  3. src/model.js +30 -3
  4. src/server.js +7 -3
README.md CHANGED
@@ -9,3 +9,5 @@ pinned: false
9
  ---
10
 
11
  Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
 
 
 
9
  ---
10
 
11
  Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
12
+
13
+ Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`), `MAX_IMAGE_SIDE` (default `896`), `MAX_MAX_TOKENS` (default `256`).
docker-compose.yml CHANGED
@@ -5,4 +5,7 @@ services:
5
  - "7860:7860"
6
  environment:
7
  - PORT=7860
 
 
 
8
  restart: unless-stopped
 
5
  - "7860:7860"
6
  environment:
7
  - PORT=7860
8
+ - DEFAULT_MAX_TOKENS=64
9
+ - MAX_IMAGE_SIDE=896
10
+ - MAX_MAX_TOKENS=256
11
  restart: unless-stopped
src/model.js CHANGED
@@ -3,8 +3,12 @@ import {
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
 
6
 
7
  const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
 
 
 
8
 
9
  let model = null;
10
  let processor = null;
@@ -53,11 +57,33 @@ export async function generateCaption(
53
  imageBuffer,
54
  task = "caption",
55
  textInput = null,
56
- maxTokens = 100
57
  ) {
58
  const { model: m, processor: p } = await loadModel();
59
 
60
- const image = await RawImage.fromBlob(new Blob([imageBuffer]));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  const baseInstruction = TASKS[task] || TASKS.caption;
63
  const instruction = textInput
@@ -69,7 +95,8 @@ export async function generateCaption(
69
 
70
  const generatedIds = await m.generate({
71
  ...inputs,
72
- max_new_tokens: maxTokens,
 
73
  });
74
 
75
  const generatedText = p.batch_decode(generatedIds, {
 
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
6
+ import sharp from "sharp";
7
 
8
  const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
9
+ const DEFAULT_MAX_TOKENS = parseInt(process.env.DEFAULT_MAX_TOKENS || "64", 10);
10
+ const MAX_MAX_TOKENS = parseInt(process.env.MAX_MAX_TOKENS || "256", 10);
11
+ const MAX_IMAGE_SIDE = parseInt(process.env.MAX_IMAGE_SIDE || "896", 10);
12
 
13
  let model = null;
14
  let processor = null;
 
57
  imageBuffer,
58
  task = "caption",
59
  textInput = null,
60
+ maxTokens = DEFAULT_MAX_TOKENS
61
  ) {
62
  const { model: m, processor: p } = await loadModel();
63
 
64
+ const safeMaxTokens = Number.isFinite(maxTokens)
65
+ ? Math.min(Math.max(maxTokens, 8), MAX_MAX_TOKENS)
66
+ : DEFAULT_MAX_TOKENS;
67
+
68
+ // Downscale large uploads to reduce encoder latency.
69
+ const metadata = await sharp(imageBuffer).metadata();
70
+ let preparedBuffer = imageBuffer;
71
+ if (
72
+ metadata.width &&
73
+ metadata.height &&
74
+ (metadata.width > MAX_IMAGE_SIDE || metadata.height > MAX_IMAGE_SIDE)
75
+ ) {
76
+ preparedBuffer = await sharp(imageBuffer)
77
+ .resize({
78
+ width: MAX_IMAGE_SIDE,
79
+ height: MAX_IMAGE_SIDE,
80
+ fit: "inside",
81
+ withoutEnlargement: true,
82
+ })
83
+ .toBuffer();
84
+ }
85
+
86
+ const image = await RawImage.fromBlob(new Blob([preparedBuffer]));
87
 
88
  const baseInstruction = TASKS[task] || TASKS.caption;
89
  const instruction = textInput
 
95
 
96
  const generatedIds = await m.generate({
97
  ...inputs,
98
+ do_sample: false,
99
+ max_new_tokens: safeMaxTokens,
100
  });
101
 
102
  const generatedText = p.batch_decode(generatedIds, {
src/server.js CHANGED
@@ -6,6 +6,7 @@ import cors from "@fastify/cors";
6
  import { generateCaption, loadModel, TASKS } from "./model.js";
7
 
8
  const app = Fastify({ logger: true });
 
9
 
10
  // --- Plugins ---
11
  await app.register(cors);
@@ -99,7 +100,7 @@ h1{font-size:1.8rem;margin-bottom:.5rem}
99
  <ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
100
  <li><code>file</code> — image (required)</li>
101
  <li><code>task</code> — caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
102
- <li><code>max_tokens</code> — default 100</li>
103
  </ul>
104
  </div>
105
  </div></body></html>`);
@@ -153,7 +154,10 @@ app.post(
153
 
154
  const task = data.fields.task?.value || "caption";
155
  const textInput = data.fields.text?.value || null;
156
- const maxTokens = parseInt(data.fields.max_tokens?.value || "100", 10);
 
 
 
157
 
158
  if (!TASKS[task]) {
159
  return reply
@@ -188,7 +192,7 @@ app.post(
188
  const files = [];
189
  let task = "caption";
190
  let textInput = null;
191
- let maxTokens = 100;
192
 
193
  for await (const part of parts) {
194
  if (part.type === "file") {
 
6
  import { generateCaption, loadModel, TASKS } from "./model.js";
7
 
8
  const app = Fastify({ logger: true });
9
+ const DEFAULT_MAX_TOKENS = parseInt(process.env.DEFAULT_MAX_TOKENS || "64", 10);
10
 
11
  // --- Plugins ---
12
  await app.register(cors);
 
100
  <ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
101
  <li><code>file</code> — image (required)</li>
102
  <li><code>task</code> — caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
103
+ <li><code>max_tokens</code> — default 64 (smaller = faster)</li>
104
  </ul>
105
  </div>
106
  </div></body></html>`);
 
154
 
155
  const task = data.fields.task?.value || "caption";
156
  const textInput = data.fields.text?.value || null;
157
+ const maxTokens = parseInt(
158
+ data.fields.max_tokens?.value || String(DEFAULT_MAX_TOKENS),
159
+ 10
160
+ );
161
 
162
  if (!TASKS[task]) {
163
  return reply
 
192
  const files = [];
193
  let task = "caption";
194
  let textInput = null;
195
+ let maxTokens = DEFAULT_MAX_TOKENS;
196
 
197
  for await (const part of parts) {
198
  if (part.type === "file") {