import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  RawImage,
} from "@huggingface/transformers";

const MODEL_ID = "onnx-community/Florence-2-base";

// Lazily-initialized singletons, populated by loadModel() on first use.
let model = null;
let processor = null;
// Shared in-flight load promise so concurrent callers never trigger a
// duplicate model download/initialization.
let loadPromise = null;

/**
 * Supported Florence-2 task-prompt tokens, keyed by a friendly name.
 *
 * Fix: the angle-bracket tokens (e.g. "<CAPTION>") had been stripped,
 * leaving every entry as "" — which collapsed every task into an empty
 * prompt and broke post_process_generation's task dispatch.
 */
export const TASKS = Object.freeze({
  caption: "<CAPTION>",
  detailed_caption: "<DETAILED_CAPTION>",
  more_detailed_caption: "<MORE_DETAILED_CAPTION>",
  ocr: "<OCR>",
  ocr_with_region: "<OCR_WITH_REGION>",
  object_detection: "<OD>",
  dense_region_caption: "<DENSE_REGION_CAPTION>",
  region_proposal: "<REGION_PROPOSAL>",
});

/**
 * Load the Florence-2 model and processor exactly once.
 *
 * Safe under concurrency: all callers share a single in-flight load
 * promise instead of racing on the `model === null` check. A failed
 * load clears the cached promise so a later call can retry.
 *
 * @returns {Promise<{model: object, processor: object}>} The shared
 *   model/processor pair.
 */
export async function loadModel() {
  if (!loadPromise) {
    loadPromise = (async () => {
      console.log("Loading Florence-2 model...");
      model = await Florence2ForConditionalGeneration.from_pretrained(MODEL_ID, {
        dtype: "fp32",
      });
      processor = await AutoProcessor.from_pretrained(MODEL_ID);
      console.log("Model loaded.");
      return { model, processor };
    })().catch((err) => {
      // Don't cache the rejection — allow a retry on the next call.
      loadPromise = null;
      throw err;
    });
  }
  return loadPromise;
}

/**
 * Generate text from an image buffer using Florence-2.
 *
 * @param {Buffer} imageBuffer - Raw image bytes (any format RawImage accepts)
 * @param {string} [task="caption"] - One of the TASKS keys; unknown names
 *   fall back to plain captioning
 * @param {string|null} [textInput=null] - Optional extra text appended to
 *   the task token (used by region/phrase-grounded tasks)
 * @param {number} [maxTokens=100] - Max new tokens to generate
 * @returns {Promise<object>} Parsed result from Florence-2, keyed by the
 *   task token
 */
export async function generateCaption(
  imageBuffer,
  task = "caption",
  textInput = null,
  maxTokens = 100
) {
  const { model: m, processor: p } = await loadModel();

  const image = await RawImage.fromBlob(new Blob([imageBuffer]));

  // Unknown task names fall back to plain captioning.
  const taskToken = TASKS[task] ?? TASKS.caption;
  const prompt = textInput ? taskToken + textInput : taskToken;

  const prompts = p.construct_prompts(prompt);
  const inputs = await p(image, prompts);

  const generatedIds = await m.generate({
    ...inputs,
    max_new_tokens: maxTokens,
  });

  // Keep special tokens: post_process_generation parses them
  // (e.g. <loc_*> coordinates for region tasks).
  const generatedText = p.batch_decode(generatedIds, {
    skip_special_tokens: false,
  })[0];

  return p.post_process_generation(generatedText, taskToken, image.size);
}