khushalcodiste committed on
Commit
c356756
·
1 Parent(s): 207a6dd

feat: added

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. package.json +1 -1
  3. src/model.js +32 -25
  4. src/server.js +4 -4
README.md CHANGED
@@ -8,4 +8,4 @@ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
- Image captioning API using Microsoft Florence-2 (ONNX). Open `/docs` for Swagger UI.
 
8
  pinned: false
9
  ---
10
 
11
+ Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
package.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "name": "img3txt",
3
  "version": "1.0.0",
4
- "description": "Image captioning API using Florence-2 ONNX model",
5
  "type": "module",
6
  "scripts": {
7
  "start": "node src/server.js",
 
1
  {
2
  "name": "img3txt",
3
  "version": "1.0.0",
4
+ "description": "Image captioning API using FastVLM ONNX model",
5
  "type": "module",
6
  "scripts": {
7
  "start": "node src/server.js",
src/model.js CHANGED
@@ -1,31 +1,39 @@
1
  import {
2
- Florence2ForConditionalGeneration,
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
6
 
7
- const MODEL_ID = "onnx-community/Florence-2-base";
8
 
9
  let model = null;
10
  let processor = null;
11
 
12
- /** Supported Florence-2 task tokens */
13
  export const TASKS = {
14
- caption: "<CAPTION>",
15
- detailed_caption: "<DETAILED_CAPTION>",
16
- more_detailed_caption: "<MORE_DETAILED_CAPTION>",
17
- ocr: "<OCR>",
18
- ocr_with_region: "<OCR_WITH_REGION>",
19
- object_detection: "<OD>",
20
- dense_region_caption: "<DENSE_REGION_CAPTION>",
21
- region_proposal: "<REGION_PROPOSAL>",
 
 
 
 
22
  };
23
 
24
  export async function loadModel() {
25
  if (!model) {
26
- console.log("Loading Florence-2 model...");
27
- model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
28
- dtype: "fp32",
 
 
 
 
29
  });
30
  processor = await AutoProcessor.from_pretrained(MODEL_ID);
31
  console.log("Model loaded.");
@@ -39,7 +47,7 @@ export async function loadModel() {
39
  * @param {string} task - One of the TASKS keys (default: "caption")
40
  * @param {string|null} textInput - Optional extra text input for the task
41
  * @param {number} maxTokens - Max new tokens to generate
42
- * @returns {Promise<object>} Parsed result from Florence-2
43
  */
44
  export async function generateCaption(
45
  imageBuffer,
@@ -51,11 +59,13 @@ export async function generateCaption(
51
 
52
  const image = await RawImage.fromBlob(new Blob([imageBuffer]));
53
 
54
- const taskToken = TASKS[task] || TASKS.caption;
55
- const prompt = textInput ? taskToken + textInput : taskToken;
56
-
57
- const prompts = p.construct_prompts(prompt);
58
- const inputs = await p(image, prompts);
 
 
59
 
60
  const generatedIds = await m.generate({
61
  ...inputs,
@@ -63,10 +73,7 @@ export async function generateCaption(
63
  });
64
 
65
  const generatedText = p.batch_decode(generatedIds, {
66
- skip_special_tokens: false,
67
  })[0];
68
-
69
- const result = p.post_process_generation(generatedText, taskToken, image.size);
70
-
71
- return result;
72
  }
 
1
  import {
2
+ AutoModelForImageTextToText,
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
6
 
7
+ const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
8
 
9
  let model = null;
10
  let processor = null;
11
 
12
+ /** Supported task instructions for FastVLM */
13
  export const TASKS = {
14
+ caption: "Describe this image.",
15
+ detailed_caption: "Describe this image in detail.",
16
+ more_detailed_caption:
17
+ "Provide a very detailed description of this image.",
18
+ ocr: "Extract all readable text from this image.",
19
+ ocr_with_region:
20
+ "Extract all readable text and include where it appears in the image.",
21
+ object_detection: "List the visible objects in this image.",
22
+ dense_region_caption:
23
+ "Describe this image region by region with detailed observations.",
24
+ region_proposal:
25
+ "Propose important regions in this image and explain what each region contains.",
26
  };
27
 
28
  export async function loadModel() {
29
  if (!model) {
30
+ console.log("Loading FastVLM model...");
31
+ model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
32
+ dtype: {
33
+ embed_tokens: "fp16",
34
+ vision_encoder: "q4",
35
+ decoder_model_merged: "q4",
36
+ },
37
  });
38
  processor = await AutoProcessor.from_pretrained(MODEL_ID);
39
  console.log("Model loaded.");
 
47
  * @param {string} task - One of the TASKS keys (default: "caption")
48
  * @param {string|null} textInput - Optional extra text input for the task
49
  * @param {number} maxTokens - Max new tokens to generate
50
+ * @returns {Promise<object>} Generated result from FastVLM
51
  */
52
  export async function generateCaption(
53
  imageBuffer,
 
59
 
60
  const image = await RawImage.fromBlob(new Blob([imageBuffer]));
61
 
62
+ const baseInstruction = TASKS[task] || TASKS.caption;
63
+ const instruction = textInput
64
+ ? `${baseInstruction}\nAdditional instruction: ${textInput}`
65
+ : baseInstruction;
66
+ const messages = [{ role: "user", content: `<image>${instruction}` }];
67
+ const prompt = p.apply_chat_template(messages, { add_generation_prompt: true });
68
+ const inputs = await p(image, prompt, { add_special_tokens: false });
69
 
70
  const generatedIds = await m.generate({
71
  ...inputs,
 
73
  });
74
 
75
  const generatedText = p.batch_decode(generatedIds, {
76
+ skip_special_tokens: true,
77
  })[0];
78
+ return { text: generatedText.trim() };
 
 
 
79
  }
src/server.js CHANGED
@@ -16,7 +16,7 @@ await app.register(swagger, {
16
  info: {
17
  title: "img3txt — Image Captioning API",
18
  description:
19
- "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
20
  version: "1.0.0",
21
  },
22
  servers: [{ url: "/" }],
@@ -77,7 +77,7 @@ app.get(
77
  reply.type("text/html").send(`<!DOCTYPE html>
78
  <html lang="en"><head><meta charset="utf-8">
79
  <meta name="viewport" content="width=device-width,initial-scale=1">
80
- <title>img3txt — Florence-2 Image Captioning API</title>
81
  <style>
82
  *{margin:0;padding:0;box-sizing:border-box}
83
  body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
@@ -91,7 +91,7 @@ h1{font-size:1.8rem;margin-bottom:.5rem}
91
  </style></head><body>
92
  <div class="card">
93
  <h1>img3txt</h1>
94
- <p class="sub">Image captioning, OCR &amp; object detection powered by Florence-2 (ONNX)</p>
95
  <a class="btn" href="/docs">Swagger UI</a>
96
  <a class="btn" href="/health">Health Check</a>
97
  <div class="tasks">
@@ -126,7 +126,7 @@ app.get(
126
  },
127
  async () => ({
128
  status: "ok",
129
- model: "onnx-community/Florence-2-base",
130
  tasks: taskEnum,
131
  })
132
  );
 
16
  info: {
17
  title: "img3txt — Image Captioning API",
18
  description:
19
+ "Generate captions, OCR, object detection and more from images using FastVLM (ONNX).",
20
  version: "1.0.0",
21
  },
22
  servers: [{ url: "/" }],
 
77
  reply.type("text/html").send(`<!DOCTYPE html>
78
  <html lang="en"><head><meta charset="utf-8">
79
  <meta name="viewport" content="width=device-width,initial-scale=1">
80
+ <title>img3txt — FastVLM Image Captioning API</title>
81
  <style>
82
  *{margin:0;padding:0;box-sizing:border-box}
83
  body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
 
91
  </style></head><body>
92
  <div class="card">
93
  <h1>img3txt</h1>
94
+ <p class="sub">Image captioning, OCR &amp; object detection powered by FastVLM (ONNX)</p>
95
  <a class="btn" href="/docs">Swagger UI</a>
96
  <a class="btn" href="/health">Health Check</a>
97
  <div class="tasks">
 
126
  },
127
  async () => ({
128
  status: "ok",
129
+ model: "onnx-community/FastVLM-0.5B-ONNX",
130
  tasks: taskEnum,
131
  })
132
  );