Image2Caption / src / model.js
khushalcodiste's picture
feat: added
9a16713
raw
history blame
2.01 kB
import {
Florence2ForConditionalGeneration,
AutoProcessor,
RawImage,
} from "@huggingface/transformers";
// Hugging Face repo id passed to from_pretrained() below.
const MODEL_ID = "onnx-community/Florence-2-base";
// Lazily-initialized singletons; populated on the first loadModel() call.
let model = null;
let processor = null;
/** Supported Florence-2 task tokens */
/**
 * Supported Florence-2 task tokens, keyed by a friendly name.
 * Frozen so callers cannot accidentally mutate the shared mapping.
 */
export const TASKS = Object.freeze({
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
});
/**
 * Lazily load the Florence-2 model and its processor as module singletons.
 *
 * The guard checks BOTH singletons: the previous `if (!model)` check could
 * leave the module half-initialized if `AutoProcessor.from_pretrained` threw
 * after the model finished loading — later calls would then return a null
 * processor and never retry. Checking both makes a failed load retryable.
 *
 * NOTE(review): concurrent first calls can still trigger duplicate loads
 * (no in-flight promise cache) — acceptable if callers serialize startup;
 * confirm against usage.
 *
 * @returns {Promise<{model: object, processor: object}>} The loaded singletons.
 */
export async function loadModel() {
  if (!model || !processor) {
    console.log("Loading Florence-2 model...");
    model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
      dtype: "fp32",
    });
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    console.log("Model loaded.");
  }
  return { model, processor };
}
/**
* Generate text from an image buffer.
* @param {Buffer} imageBuffer - Raw image bytes
* @param {string} task - One of the TASKS keys (default: "caption")
* @param {string|null} textInput - Optional extra text input for the task
* @param {number} maxTokens - Max new tokens to generate
* @returns {Promise<object>} Parsed result from Florence-2
*/
/**
 * Generate text from an image buffer.
 * @param {Buffer} imageBuffer - Raw image bytes
 * @param {string} task - One of the TASKS keys (default: "caption")
 * @param {string|null} textInput - Optional extra text input for the task
 * @param {number} maxTokens - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model: florence, processor: proc } = await loadModel();

  // Decode the raw bytes into a RawImage the processor can consume.
  const rawImage = await RawImage.fromBlob(new Blob([imageBuffer]));

  // Unknown task keys fall back to plain captioning.
  const token = TASKS[task] || TASKS.caption;
  const fullPrompt = textInput ? token + textInput : token;

  const modelInputs = await proc(rawImage, proc.construct_prompts(fullPrompt));

  const outputIds = await florence.generate({
    ...modelInputs,
    max_new_tokens: maxTokens,
  });

  // Special tokens are kept in the decoded text — the post-processing step
  // presumably relies on them to parse task markers/regions.
  const [decoded] = proc.batch_decode(outputIds, {
    skip_special_tokens: false,
  });

  return proc.post_process_generation(decoded, token, rawImage.size);
}