khushalcodiste committed on
Commit
c356756
·
1 Parent(s): 207a6dd

feat: added

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. package.json +1 -1
  3. src/model.js +32 -25
  4. src/server.js +4 -4
README.md CHANGED
@@ -8,4 +8,4 @@ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
- Image captioning API using Microsoft Florence-2 (ONNX). Open `/docs` for Swagger UI.
 
8
  pinned: false
9
  ---
10
 
11
+ Image captioning API using FastVLM (ONNX). Open `/docs` for Swagger UI.
package.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "name": "img3txt",
3
  "version": "1.0.0",
4
- "description": "Image captioning API using Florence-2 ONNX model",
5
  "type": "module",
6
  "scripts": {
7
  "start": "node src/server.js",
 
1
  {
2
  "name": "img3txt",
3
  "version": "1.0.0",
4
+ "description": "Image captioning API using FastVLM ONNX model",
5
  "type": "module",
6
  "scripts": {
7
  "start": "node src/server.js",
src/model.js CHANGED
@@ -1,31 +1,39 @@
1
  import {
2
- Florence2ForConditionalGeneration,
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
6
 
7
- const MODEL_ID = "onnx-community/Florence-2-base";
8
 
9
  let model = null;
10
  let processor = null;
11
 
12
- /** Supported Florence-2 task tokens */
13
  export const TASKS = {
14
- caption: "<CAPTION>",
15
- detailed_caption: "<DETAILED_CAPTION>",
16
- more_detailed_caption: "<MORE_DETAILED_CAPTION>",
17
- ocr: "<OCR>",
18
- ocr_with_region: "<OCR_WITH_REGION>",
19
- object_detection: "<OD>",
20
- dense_region_caption: "<DENSE_REGION_CAPTION>",
21
- region_proposal: "<REGION_PROPOSAL>",
 
 
 
 
22
  };
23
 
24
  export async function loadModel() {
25
  if (!model) {
26
- console.log("Loading Florence-2 model...");
27
- model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
28
- dtype: "fp32",
 
 
 
 
29
  });
30
  processor = await AutoProcessor.from_pretrained(MODEL_ID);
31
  console.log("Model loaded.");
@@ -39,7 +47,7 @@ export async function loadModel() {
39
  * @param {string} task - One of the TASKS keys (default: "caption")
40
  * @param {string|null} textInput - Optional extra text input for the task
41
  * @param {number} maxTokens - Max new tokens to generate
42
- * @returns {Promise<object>} Parsed result from Florence-2
43
  */
44
  export async function generateCaption(
45
  imageBuffer,
@@ -51,11 +59,13 @@ export async function generateCaption(
51
 
52
  const image = await RawImage.fromBlob(new Blob([imageBuffer]));
53
 
54
- const taskToken = TASKS[task] || TASKS.caption;
55
- const prompt = textInput ? taskToken + textInput : taskToken;
56
-
57
- const prompts = p.construct_prompts(prompt);
58
- const inputs = await p(image, prompts);
 
 
59
 
60
  const generatedIds = await m.generate({
61
  ...inputs,
@@ -63,10 +73,7 @@ export async function generateCaption(
63
  });
64
 
65
  const generatedText = p.batch_decode(generatedIds, {
66
- skip_special_tokens: false,
67
  })[0];
68
-
69
- const result = p.post_process_generation(generatedText, taskToken, image.size);
70
-
71
- return result;
72
  }
 
1
  import {
2
+ AutoModelForImageTextToText,
3
  AutoProcessor,
4
  RawImage,
5
  } from "@huggingface/transformers";
6
 
7
+ const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
8
 
9
  let model = null;
10
  let processor = null;
11
 
12
+ /** Supported task instructions for FastVLM */
13
  export const TASKS = {
14
+ caption: "Describe this image.",
15
+ detailed_caption: "Describe this image in detail.",
16
+ more_detailed_caption:
17
+ "Provide a very detailed description of this image.",
18
+ ocr: "Extract all readable text from this image.",
19
+ ocr_with_region:
20
+ "Extract all readable text and include where it appears in the image.",
21
+ object_detection: "List the visible objects in this image.",
22
+ dense_region_caption:
23
+ "Describe this image region by region with detailed observations.",
24
+ region_proposal:
25
+ "Propose important regions in this image and explain what each region contains.",
26
  };
27
 
28
  export async function loadModel() {
29
  if (!model) {
30
+ console.log("Loading FastVLM model...");
31
+ model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
32
+ dtype: {
33
+ embed_tokens: "fp16",
34
+ vision_encoder: "q4",
35
+ decoder_model_merged: "q4",
36
+ },
37
  });
38
  processor = await AutoProcessor.from_pretrained(MODEL_ID);
39
  console.log("Model loaded.");
 
47
  * @param {string} task - One of the TASKS keys (default: "caption")
48
  * @param {string|null} textInput - Optional extra text input for the task
49
  * @param {number} maxTokens - Max new tokens to generate
50
+ * @returns {Promise<object>} Generated result from FastVLM
51
  */
52
  export async function generateCaption(
53
  imageBuffer,
 
59
 
60
  const image = await RawImage.fromBlob(new Blob([imageBuffer]));
61
 
62
+ const baseInstruction = TASKS[task] || TASKS.caption;
63
+ const instruction = textInput
64
+ ? `${baseInstruction}\nAdditional instruction: ${textInput}`
65
+ : baseInstruction;
66
+ const messages = [{ role: "user", content: `<image>${instruction}` }];
67
+ const prompt = p.apply_chat_template(messages, { add_generation_prompt: true });
68
+ const inputs = await p(image, prompt, { add_special_tokens: false });
69
 
70
  const generatedIds = await m.generate({
71
  ...inputs,
 
73
  });
74
 
75
  const generatedText = p.batch_decode(generatedIds, {
76
+ skip_special_tokens: true,
77
  })[0];
78
+ return { text: generatedText.trim() };
 
 
 
79
  }
src/server.js CHANGED
@@ -16,7 +16,7 @@ await app.register(swagger, {
16
  info: {
17
  title: "img3txt — Image Captioning API",
18
  description:
19
- "Generate captions, OCR, object detection and more from images using Microsoft Florence-2 (ONNX).",
20
  version: "1.0.0",
21
  },
22
  servers: [{ url: "/" }],
@@ -77,7 +77,7 @@ app.get(
77
  reply.type("text/html").send(`<!DOCTYPE html>
78
  <html lang="en"><head><meta charset="utf-8">
79
  <meta name="viewport" content="width=device-width,initial-scale=1">
80
- <title>img3txt — Florence-2 Image Captioning API</title>
81
  <style>
82
  *{margin:0;padding:0;box-sizing:border-box}
83
  body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
@@ -91,7 +91,7 @@ h1{font-size:1.8rem;margin-bottom:.5rem}
91
  </style></head><body>
92
  <div class="card">
93
  <h1>img3txt</h1>
94
- <p class="sub">Image captioning, OCR &amp; object detection powered by Florence-2 (ONNX)</p>
95
  <a class="btn" href="/docs">Swagger UI</a>
96
  <a class="btn" href="/health">Health Check</a>
97
  <div class="tasks">
@@ -126,7 +126,7 @@ app.get(
126
  },
127
  async () => ({
128
  status: "ok",
129
- model: "onnx-community/Florence-2-base",
130
  tasks: taskEnum,
131
  })
132
  );
 
16
  info: {
17
  title: "img3txt — Image Captioning API",
18
  description:
19
+ "Generate captions, OCR, object detection and more from images using FastVLM (ONNX).",
20
  version: "1.0.0",
21
  },
22
  servers: [{ url: "/" }],
 
77
  reply.type("text/html").send(`<!DOCTYPE html>
78
  <html lang="en"><head><meta charset="utf-8">
79
  <meta name="viewport" content="width=device-width,initial-scale=1">
80
+ <title>img3txt — FastVLM Image Captioning API</title>
81
  <style>
82
  *{margin:0;padding:0;box-sizing:border-box}
83
  body{font-family:system-ui,sans-serif;background:#0f172a;color:#e2e8f0;display:flex;align-items:center;justify-content:center;min-height:100vh}
 
91
  </style></head><body>
92
  <div class="card">
93
  <h1>img3txt</h1>
94
+ <p class="sub">Image captioning, OCR &amp; object detection powered by FastVLM (ONNX)</p>
95
  <a class="btn" href="/docs">Swagger UI</a>
96
  <a class="btn" href="/health">Health Check</a>
97
  <div class="tasks">
 
126
  },
127
  async () => ({
128
  status: "ok",
129
+ model: "onnx-community/FastVLM-0.5B-ONNX",
130
  tasks: taskEnum,
131
  })
132
  );