import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  RawImage,
} from "@huggingface/transformers";

const MODEL_ID = "onnx-community/Florence-2-base";

// Lazily-initialized singletons, populated by loadModel() on first use.
let model = null;
let processor = null;
// Shared in-flight load promise so concurrent callers never trigger a
// duplicate model download/initialization.
let loadPromise = null;

/**
 * Supported Florence-2 task-prompt tokens, keyed by a friendly name.
 *
 * Fix: the angle-bracket tokens (e.g. "<CAPTION>") had been stripped,
 * leaving every entry as "" — which collapsed every task into an empty
 * prompt and broke post_process_generation's task dispatch.
 */
export const TASKS = Object.freeze({
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
});

/**
 * Load the Florence-2 model and processor exactly once.
 *
 * Safe under concurrency: all callers share a single in-flight load
 * promise instead of racing on the `model === null` check. A failed
 * load clears the cached promise so a later call can retry.
 *
 * @returns {Promise<{model: object, processor: object}>} The shared
 *   model/processor pair.
 */
export async function loadModel() {
  if (!loadPromise) {
    loadPromise = (async () => {
      console.log("Loading Florence-2 model...");
      model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
        dtype: "fp32",
      });
      processor = await AutoProcessor.from_pretrained(MODEL_ID);
      console.log("Model loaded.");
      return { model, processor };
    })().catch((err) => {
      // Don't cache the rejection — allow a retry on the next call.
      loadPromise = null;
      throw err;
    });
  }
  return loadPromise;
}

/**
 * Generate text from an image buffer using Florence-2.
 *
 * @param {Buffer} imageBuffer - Raw image bytes (any format RawImage accepts)
 * @param {string} [task="caption"] - One of the TASKS keys; unknown names
 *   fall back to plain captioning
 * @param {string|null} [textInput=null] - Optional extra text appended to
 *   the task token (used by region/phrase-grounded tasks)
 * @param {number} [maxTokens=100] - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2, keyed by the
 *   task token
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model: m, processor: p } = await loadModel();

  const image = await RawImage.fromBlob(new Blob([imageBuffer]));

  // Unknown task names fall back to plain captioning.
  const taskToken = TASKS[task] ?? TASKS.caption;
  const prompt = textInput ? taskToken + textInput : taskToken;

  const prompts = p.construct_prompts(prompt);
  const inputs = await p(image, prompts);

  const generatedIds = await m.generate({
    ...inputs,
    max_new_tokens: maxTokens,
  });

  // Keep special tokens: post_process_generation parses them
  // (e.g. <loc_*> coordinates for region tasks).
  const generatedText = p.batch_decode(generatedIds, {
    skip_special_tokens: false,
  })[0];

  return p.post_process_generation(generatedText, taskToken, image.size);
}