import type { SIEClient } from "@superlinked/sie-sdk";

import { NER_MODELS, RECOGNITION_MODELS, STRUCTURED_MODELS } from "./config.js";
import { structuredExtract } from "./donut.js";
import type { PipelineEvent } from "./events.js";
import { extractFields } from "./extract.js";
import { recognize } from "./ocr.js";
import type { SampleDoc, TriageResult } from "./types.js";

/** Inputs for a single {@link runPipeline} invocation. */
export type RunInput = {
  /** SDK client handed to every stage. */
  client: SIEClient;
  /** Raw bytes of the document image; passed to the recognition and structured stages. */
  imageBytes: Uint8Array;
  /** Sample descriptor; its `id` is echoed in the result and its `labels` drive the NER stage. */
  sample: SampleDoc;
  /** Id of an entry in `RECOGNITION_MODELS`. */
  recognitionModel: string;
  /** Id of an entry in `STRUCTURED_MODELS`. */
  structuredModel: string;
  /** Id of an entry in `NER_MODELS`. */
  nerModel: string;
  /** Sink for progress, result and error events. */
  emit: (event: PipelineEvent) => void;
};

/**
 * Finds the entry of `list` whose `id` equals the given id.
 *
 * @param list - Candidate model descriptors.
 * @param id - Model id to look for.
 * @returns The first matching entry.
 * @throws Error if no entry has the given id.
 */
function lookup<T extends { id: string }>(list: readonly T[], id: string): T {
  const found = list.find((m) => m.id === id);
  if (!found) throw new Error(`unknown model id: ${id}`);
  return found;
}

/**
 * Extracts a printable message from a caught value.
 *
 * Anything can be thrown in JavaScript, so `.message` is only read from real
 * `Error` instances; every other value is stringified.
 */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Runs the three stages in sequence — recognition, structured extraction and
 * NER — emitting an event before and after each stage.
 *
 * Failure handling differs per stage:
 * - recognition: an `error` event is emitted and the error is rethrown;
 * - structured extraction and NER: an `error` event is emitted and the run
 *   continues with an empty result for that stage.
 *
 * @returns The recognized markdown, both extraction results and per-stage timings.
 * @throws Error if any model id is unknown. Each id is resolved right before its
 *   own stage, so earlier stages have already run and emitted their events.
 * @throws Whatever `recognize` throws.
 */
export async function runPipeline({
  client,
  imageBytes,
  sample,
  recognitionModel,
  structuredModel,
  nerModel,
  emit,
}: RunInput): Promise<TriageResult> {
  const t0 = Date.now();

  emit({
    type: "models",
    data: { extractor: nerModel, recognition: recognitionModel, structured: structuredModel },
  });

  // Recognition
  const recOpt = lookup(RECOGNITION_MODELS, recognitionModel);
  emit({ type: "recognition_start", data: { model: recognitionModel } });
  const tRec = Date.now();
  let markdown = "";
  try {
    markdown = await recognize(client, recOpt.id, imageBytes, recOpt.options);
  } catch (err) {
    emit({
      type: "error",
      data: { stage: "recognition", message: `${recognitionModel} failed: ${errorMessage(err)}` },
    });
    // The only fatal stage: the NER stage below takes `markdown` as its input.
    throw err;
  }
  const recognitionMs = Date.now() - tRec;
  emit({ type: "recognition_done", data: { markdown, ms: recognitionMs } });

  // Structured (Donut variants, etc.)
  const strOpt = lookup(STRUCTURED_MODELS, structuredModel);
  emit({ type: "donut_start" });
  const tDon = Date.now();
  // Empty default, reported as-is when the stage fails.
  let donut = { entities: [] as { label: string; text: string }[], data: undefined as unknown };
  try {
    donut = await structuredExtract(client, strOpt.id, imageBytes, strOpt.options);
  } catch (err) {
    // Best effort: report the failure and carry on with the empty default.
    emit({
      type: "error",
      data: { stage: "donut", message: `${structuredModel} failed: ${errorMessage(err)}` },
    });
  }
  const donutMs = Date.now() - tDon;
  emit({ type: "donut_done", data: { entities: donut.entities, rawData: donut.data, ms: donutMs } });

  // NER (GLiNER variants)
  const nerOpt = lookup(NER_MODELS, nerModel);
  emit({ type: "gliner_start", data: { labels: sample.labels } });
  const tGli = Date.now();
  let fields: { label: string; text: string; score: number }[] = [];
  try {
    fields = await extractFields(client, nerOpt.id, markdown, sample.labels);
  } catch (err) {
    // Best effort: report the failure and carry on with no fields.
    emit({
      type: "error",
      data: { stage: "gliner", message: `${nerModel} failed: ${errorMessage(err)}` },
    });
  }
  const glinerMs = Date.now() - tGli;
  emit({ type: "gliner_done", data: { fields, ms: glinerMs } });

  const totalMs = Date.now() - t0;
  emit({ type: "done", data: { totalMs } });

  return {
    sampleId: sample.id,
    recognitionModel,
    markdown,
    donutEntities: donut.entities,
    donutData: donut.data,
    glinerFields: fields,
    timings: { recognitionMs, donutMs, glinerMs, totalMs },
  };
}