Spaces:
Running
Running
| import { | |
| Florence2ForConditionalGeneration, | |
| AutoProcessor, | |
| RawImage, | |
| } from "@huggingface/transformers"; | |
// ONNX export of Microsoft's Florence-2 base vision-language model.
const MODEL_ID = "onnx-community/Florence-2-base";

// Lazily-initialized module singletons; populated once by loadModel()
// and shared by every subsequent call.
let model = null;
let processor = null;
/**
 * Supported Florence-2 task tokens, keyed by friendly task name.
 * Frozen so importers cannot accidentally mutate the shared map.
 * @type {Readonly<Record<string, string>>}
 */
export const TASKS = Object.freeze({
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
});
/**
 * Load the Florence-2 model and processor exactly once.
 *
 * Memoizes the in-flight promise (on the function object, so this block
 * is self-contained) instead of checking `model` — the original
 * `if (!model)` guard let two concurrent callers both start a download,
 * and a processor-load failure could leave `model` set with
 * `processor` still null on later calls.
 *
 * @returns {Promise<{model: object, processor: object}>} The shared
 *   model/processor pair.
 */
export async function loadModel() {
  if (!loadModel._promise) {
    loadModel._promise = (async () => {
      console.log("Loading Florence-2 model...");
      // The model and processor downloads are independent — fetch in parallel.
      [model, processor] = await Promise.all([
        Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
          dtype: "fp32",
        }),
        AutoProcessor.from_pretrained(MODEL_ID),
      ]);
      console.log("Model loaded.");
      return { model, processor };
    })();
    // Don't cache a rejection: clear the memo so a later call can retry.
    // Callers still observe the original rejection via the returned promise.
    loadModel._promise.catch(() => {
      loadModel._promise = null;
    });
  }
  return loadModel._promise;
}
/**
 * Generate text from an image buffer.
 * @param {Buffer} imageBuffer - Raw image bytes
 * @param {string} task - One of the TASKS keys (default: "caption");
 *   unrecognized keys fall back to "caption"
 * @param {string|null} textInput - Optional extra text input for the task
 * @param {number} maxTokens - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model: m, processor: p } = await loadModel();
  const image = await RawImage.fromBlob(new Blob([imageBuffer]));
  // Own-property check: a bare `TASKS[task]` lookup walks the prototype
  // chain, so task names like "toString" or "constructor" would resolve
  // to Object.prototype members (truthy functions) instead of falling
  // back to the caption token.
  const taskToken = Object.hasOwn(TASKS, task) ? TASKS[task] : TASKS.caption;
  const prompt = textInput ? taskToken + textInput : taskToken;
  const prompts = p.construct_prompts(prompt);
  const inputs = await p(image, prompts);
  const generatedIds = await m.generate({
    ...inputs,
    max_new_tokens: maxTokens,
  });
  // Keep special tokens: Florence-2's post-processor parses the raw
  // decoded string (incl. region tokens) for the given task.
  const generatedText = p.batch_decode(generatedIds, {
    skip_special_tokens: false,
  })[0];
  // image.size lets the post-processor de-normalize region coordinates.
  return p.post_process_generation(generatedText, taskToken, image.size);
}