Spaces:
Running
Running
Commit ·
b02d5c5
1
Parent(s): c356756
feat: added speed tuning (configurable token limits and image downscaling)
Browse files
- README.md +2 -0
- docker-compose.yml +3 -0
- src/model.js +30 -3
- src/server.js +7 -3
README.md
CHANGED
|
@@ -9,3 +9,5 @@ pinned: false
|
|
| 9 |
---
|
| 10 |
|
| 11 |
Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
|
| 12 |
+
|
| 13 |
+
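Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`) — tokens generated when a request omits `max_tokens`; `MAX_MAX_TOKENS` (default `256`) — upper bound that a requested `max_tokens` is clamped to (the lower bound is `8`); `MAX_IMAGE_SIDE` (default `896`) — images whose width or height exceeds this are downscaled to fit, preserving aspect ratio.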
|
docker-compose.yml
CHANGED
|
@@ -5,4 +5,7 @@ services:
|
|
| 5 |
- "7860:7860"
|
| 6 |
environment:
|
| 7 |
- PORT=7860
|
|
|
|
|
|
|
|
|
|
| 8 |
restart: unless-stopped
|
|
|
|
| 5 |
- "7860:7860"
|
| 6 |
environment:
|
| 7 |
- PORT=7860
|
| 8 |
+
- DEFAULT_MAX_TOKENS=64
|
| 9 |
+
- MAX_IMAGE_SIDE=896
|
| 10 |
+
- MAX_MAX_TOKENS=256
|
| 11 |
restart: unless-stopped
|
src/model.js
CHANGED
|
@@ -3,8 +3,12 @@ import {
|
|
| 3 |
AutoProcessor,
|
| 4 |
RawImage,
|
| 5 |
} from "@huggingface/transformers";
|
|
|
|
| 6 |
|
| 7 |
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
let model = null;
|
| 10 |
let processor = null;
|
|
@@ -53,11 +57,33 @@ export async function generateCaption(
|
|
| 53 |
imageBuffer,
|
| 54 |
task = "caption",
|
| 55 |
textInput = null,
|
| 56 |
-
maxTokens =
|
| 57 |
) {
|
| 58 |
const { model: m, processor: p } = await loadModel();
|
| 59 |
|
| 60 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
const baseInstruction = TASKS[task] || TASKS.caption;
|
| 63 |
const instruction = textInput
|
|
@@ -69,7 +95,8 @@ export async function generateCaption(
|
|
| 69 |
|
| 70 |
const generatedIds = await m.generate({
|
| 71 |
...inputs,
|
| 72 |
-
|
|
|
|
| 73 |
});
|
| 74 |
|
| 75 |
const generatedText = p.batch_decode(generatedIds, {
|
|
|
|
| 3 |
AutoProcessor,
|
| 4 |
RawImage,
|
| 5 |
} from "@huggingface/transformers";
|
| 6 |
+
import sharp from "sharp";
|
| 7 |
|
| 8 |
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
|
| 9 |
+
const DEFAULT_MAX_TOKENS = parseInt(process.env.DEFAULT_MAX_TOKENS || "64", 10);
|
| 10 |
+
const MAX_MAX_TOKENS = parseInt(process.env.MAX_MAX_TOKENS || "256", 10);
|
| 11 |
+
const MAX_IMAGE_SIDE = parseInt(process.env.MAX_IMAGE_SIDE || "896", 10);
|
| 12 |
|
| 13 |
let model = null;
|
| 14 |
let processor = null;
|
|
|
|
| 57 |
imageBuffer,
|
| 58 |
task = "caption",
|
| 59 |
textInput = null,
|
| 60 |
+
maxTokens = DEFAULT_MAX_TOKENS
|
| 61 |
) {
|
| 62 |
const { model: m, processor: p } = await loadModel();
|
| 63 |
|
| 64 |
+
const safeMaxTokens = Number.isFinite(maxTokens)
|
| 65 |
+
? Math.min(Math.max(maxTokens, 8), MAX_MAX_TOKENS)
|
| 66 |
+
: DEFAULT_MAX_TOKENS;
|
| 67 |
+
|
| 68 |
+
// Downscale large uploads to reduce encoder latency.
|
| 69 |
+
const metadata = await sharp(imageBuffer).metadata();
|
| 70 |
+
let preparedBuffer = imageBuffer;
|
| 71 |
+
if (
|
| 72 |
+
metadata.width &&
|
| 73 |
+
metadata.height &&
|
| 74 |
+
(metadata.width > MAX_IMAGE_SIDE || metadata.height > MAX_IMAGE_SIDE)
|
| 75 |
+
) {
|
| 76 |
+
preparedBuffer = await sharp(imageBuffer)
|
| 77 |
+
.resize({
|
| 78 |
+
width: MAX_IMAGE_SIDE,
|
| 79 |
+
height: MAX_IMAGE_SIDE,
|
| 80 |
+
fit: "inside",
|
| 81 |
+
withoutEnlargement: true,
|
| 82 |
+
})
|
| 83 |
+
.toBuffer();
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
const image = await RawImage.fromBlob(new Blob([preparedBuffer]));
|
| 87 |
|
| 88 |
const baseInstruction = TASKS[task] || TASKS.caption;
|
| 89 |
const instruction = textInput
|
|
|
|
| 95 |
|
| 96 |
const generatedIds = await m.generate({
|
| 97 |
...inputs,
|
| 98 |
+
do_sample: false,
|
| 99 |
+
max_new_tokens: safeMaxTokens,
|
| 100 |
});
|
| 101 |
|
| 102 |
const generatedText = p.batch_decode(generatedIds, {
|
src/server.js
CHANGED
|
@@ -6,6 +6,7 @@ import cors from "@fastify/cors";
|
|
| 6 |
import { generateCaption, loadModel, TASKS } from "./model.js";
|
| 7 |
|
| 8 |
const app = Fastify({ logger: true });
|
|
|
|
| 9 |
|
| 10 |
// --- Plugins ---
|
| 11 |
await app.register(cors);
|
|
@@ -99,7 +100,7 @@ h1{font-size:1.8rem;margin-bottom:.5rem}
|
|
| 99 |
<ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
|
| 100 |
<li><code>file</code> — image (required)</li>
|
| 101 |
<li><code>task</code> — caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
|
| 102 |
-
<li><code>max_tokens</code> — default
|
| 103 |
</ul>
|
| 104 |
</div>
|
| 105 |
</div></body></html>`);
|
|
@@ -153,7 +154,10 @@ app.post(
|
|
| 153 |
|
| 154 |
const task = data.fields.task?.value || "caption";
|
| 155 |
const textInput = data.fields.text?.value || null;
|
| 156 |
-
const maxTokens = parseInt(
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
if (!TASKS[task]) {
|
| 159 |
return reply
|
|
@@ -188,7 +192,7 @@ app.post(
|
|
| 188 |
const files = [];
|
| 189 |
let task = "caption";
|
| 190 |
let textInput = null;
|
| 191 |
-
let maxTokens =
|
| 192 |
|
| 193 |
for await (const part of parts) {
|
| 194 |
if (part.type === "file") {
|
|
|
|
| 6 |
import { generateCaption, loadModel, TASKS } from "./model.js";
|
| 7 |
|
| 8 |
const app = Fastify({ logger: true });
|
| 9 |
+
const DEFAULT_MAX_TOKENS = parseInt(process.env.DEFAULT_MAX_TOKENS || "64", 10);
|
| 10 |
|
| 11 |
// --- Plugins ---
|
| 12 |
await app.register(cors);
|
|
|
|
| 100 |
<ul style="margin:.5rem 0 0 1.2rem;color:#94a3b8">
|
| 101 |
<li><code>file</code> — image (required)</li>
|
| 102 |
<li><code>task</code> — caption, detailed_caption, more_detailed_caption, ocr, ocr_with_region, object_detection, dense_region_caption, region_proposal</li>
|
| 103 |
+
<li><code>max_tokens</code> — default 64 (smaller = faster)</li>
|
| 104 |
</ul>
|
| 105 |
</div>
|
| 106 |
</div></body></html>`);
|
|
|
|
| 154 |
|
| 155 |
const task = data.fields.task?.value || "caption";
|
| 156 |
const textInput = data.fields.text?.value || null;
|
| 157 |
+
const maxTokens = parseInt(
|
| 158 |
+
data.fields.max_tokens?.value || String(DEFAULT_MAX_TOKENS),
|
| 159 |
+
10
|
| 160 |
+
);
|
| 161 |
|
| 162 |
if (!TASKS[task]) {
|
| 163 |
return reply
|
|
|
|
| 192 |
const files = [];
|
| 193 |
let task = "caption";
|
| 194 |
let textInput = null;
|
| 195 |
+
let maxTokens = DEFAULT_MAX_TOKENS;
|
| 196 |
|
| 197 |
for await (const part of parts) {
|
| 198 |
if (part.type === "file") {
|