import { chunkLongText } from "./chunker.js?v=20260513-rps2";
import { deduplicateEntities } from "./deduplicate_entities.js?v=20260513-rps2";
import { inferRuleEntities } from "./rule_entities.js?v=20260513-rps2";

// Locale used when lower-casing text for the case-insensitive token lookup.
const CASE_FOLD_LOCALE = "fr-FR";

// Largest distance (in string offsets) between the end of an entity and the
// start of the next same-type continuation token for the two to be merged.
const MAX_MERGE_GAP = 8;

/**
 * Runs the NER pipeline over a long text chunk by chunk, adds the rule-based
 * entities, and returns the deduplicated entities with offsets relative to the
 * full text.
 *
 * @param {Function} pipe - Token-classification callable. It is awaited as
 *   `pipe(chunkText, { ignore_labels: ["O"] })` and must yield an iterable of
 *   token objects.
 * @param {string} inputText - Text handed to `chunkLongText`.
 * @param {object} [options]
 * @param {object} [options.chunkOptions={}] - Forwarded to `chunkLongText`.
 * @param {number} [options.minScore=0] - Forwarded to `deduplicateEntities`.
 * @param {Function|null} [options.onProgress=null] - Called after each chunk
 *   with `{ phase: "ner", chunk, chunks, done, total }`.
 * @returns {Promise<object>} `{ text, chunks, entities, rawEntityCount,
 *   modelEntityCount, ruleEntityCount }`.
 * @throws {Error} When `pipe` is falsy.
 */
export async function analyzeLongText(pipe, inputText, options = {}) {
  if (!pipe) {
    throw new Error("Modèle NER non chargé");
  }

  const { chunkOptions = {}, minScore = 0, onProgress = null } = options;
  const { text, chunks } = chunkLongText(inputText, chunkOptions);

  if (!text) {
    // Same shape as the normal result so callers can read every counter
    // without special-casing empty input.
    return {
      text,
      chunks,
      entities: [],
      rawEntityCount: 0,
      modelEntityCount: 0,
      ruleEntityCount: 0,
    };
  }

  const modelEntities = [];

  for (const chunk of chunks) {
    const rawTokens = await pipe(chunk.text, { ignore_labels: ["O"] });

    for (const entity of mergeTokenEntities(chunk.text, rawTokens)) {
      // Shift chunk-relative offsets to offsets in the full text.
      const start = chunk.start + entity.start;
      const end = chunk.start + entity.end;
      modelEntities.push({
        ...entity,
        start,
        end,
        word: text.slice(start, end),
        chunk_id: chunk.id,
        chunk_index: chunk.index,
      });
    }

    if (typeof onProgress === "function") {
      onProgress({
        phase: "ner",
        chunk,
        chunks,
        done: chunk.index + 1,
        total: chunks.length,
      });
    }
  }

  const ruleEntities = inferRuleEntities(text, chunks);
  const mergedCandidates = [...modelEntities, ...ruleEntities];
  const entities = deduplicateEntities(mergedCandidates, text, { minScore });

  return {
    text,
    chunks,
    entities,
    rawEntityCount: mergedCandidates.length,
    modelEntityCount: modelEntities.length,
    ruleEntityCount: ruleEntities.length,
  };
}

/**
 * Reads the label of a raw token, trying `entity_group`, then `entity`, then
 * `label`.
 *
 * @param {object} token - Raw token produced by the pipeline.
 * @returns {string} The label as a string, or "" when none is set.
 */
function getTokenLabel(token) {
  const raw = token.entity_group ?? token.entity ?? token.label ?? "";
  return String(raw);
}

/**
 * Splits a `B-TYPE` / `I-TYPE` label into prefix and type.
 *
 * @param {string} label - Label to split.
 * @returns {{prefix: string, type: string}} A label without a `B-`/`I-` prefix
 *   (other than "O" or empty) is treated as the beginning of an entity of that
 *   type; everything else maps to `{ prefix: "O", type: "O" }`.
 */
function splitBio(label) {
  const match = String(label).match(/^(B|I)-(.+)$/);
  if (match) {
    return { prefix: match[1], type: match[2] };
  }
  if (label && label !== "O") {
    return { prefix: "B", type: label };
  }
  return { prefix: "O", type: "O" };
}

/**
 * Strips tokenizer markers from a token's surface form: leading `▁`, a leading
 * `##`, and every `Ġ`, then trims whitespace.
 *
 * @param {*} word - Raw token word; null/undefined become "".
 * @returns {string} The cleaned token text.
 */
function cleanTokenWord(word) {
  return String(word ?? "")
    .replace(/^▁+/, "")
    .replace(/^##/, "")
    .replace(/Ġ/g, "")
    .trim();
}

/**
 * Tells whether the raw token word begins with a `▁` or `Ġ` marker, which
 * subword tokenizers use to flag the start of a word.
 *
 * @param {*} word - Raw token word.
 * @returns {boolean}
 */
function hasWordStartMarker(word) {
  return /^[▁Ġ]+/.test(String(word ?? ""));
}

/**
 * Tells whether the raw token word begins with the `##` continuation marker.
 *
 * @param {*} word - Raw token word.
 * @returns {boolean}
 */
function hasContinuationMarker(word) {
  return /^##/.test(String(word ?? ""));
}

/**
 * Tells whether the string contains a Unicode letter or number.
 *
 * @param {string} char - Normally a single character.
 * @returns {boolean}
 */
function isLetterOrNumber(char) {
  return /[\p{L}\p{N}]/u.test(char);
}

/**
 * Tells whether `index` can be the start of a word in `text`: it is at (or
 * before) the beginning, or the preceding character is not a letter or number.
 *
 * @param {string} text
 * @param {number} index
 * @returns {boolean}
 */
function isWordStartBoundary(text, index) {
  return index <= 0 || !isLetterOrNumber(text[index - 1]);
}

/**
 * Finds the first occurrence of `tokenText` in `text` at or after `from`.
 *
 * @param {string} text - Text to search.
 * @param {string} tokenText - Cleaned token text to look for.
 * @param {number} from - Offset at which the search starts.
 * @param {object} [options]
 * @param {boolean} [options.requireWordStart=false] - Only accept matches
 *   located at a word-start boundary.
 * @param {boolean} [options.ignoreCase=false] - Compare lower-cased text.
 * @returns {[number, number]|null} `[start, end]` offsets into `text`, or null
 *   when nothing matches (an empty `tokenText` never matches).
 */
function findTokenTextPosition(text, tokenText, from, options = {}) {
  const { requireWordStart = false, ignoreCase = false } = options;

  // An empty needle matches at every offset; with requireWordStart the search
  // could never advance past the end of the text.
  if (!tokenText) {
    return null;
  }

  const isAcceptable = (index) => !requireWordStart || isWordStartBoundary(text, index);
  const toSpan = (index) => [index, index + tokenText.length];

  const haystack = ignoreCase ? text.toLocaleLowerCase(CASE_FOLD_LOCALE) : text;
  const needle = ignoreCase ? tokenText.toLocaleLowerCase(CASE_FOLD_LOCALE) : tokenText;

  // Fast path: offsets in the (possibly lower-cased) haystack are valid offsets
  // in `text` only if lower-casing kept every length unchanged.
  if (haystack.length === text.length && needle.length === tokenText.length) {
    let index = haystack.indexOf(needle, from);
    while (index !== -1) {
      if (isAcceptable(index)) {
        return toSpan(index);
      }
      index = haystack.indexOf(needle, index + 1);
    }
    return null;
  }

  // Slow path: lower-casing changed a length (e.g. "İ" expands to two code
  // units), so compare windows of the original text to keep offsets exact.
  const firstStart = Math.max(Number(from) || 0, 0);
  const lastStart = text.length - tokenText.length;
  for (let index = firstStart; index <= lastStart; index += 1) {
    const candidate = text.slice(index, index + tokenText.length);
    if (candidate.toLocaleLowerCase(CASE_FOLD_LOCALE) === needle && isAcceptable(index)) {
      return toSpan(index);
    }
  }
  return null;
}

/**
 * Decides whether a token must be matched at a word-start boundary.
 *
 * @param {object} rawToken - Raw token; only `word` is read.
 * @param {string} prefix - BIO prefix of the token.
 * @returns {boolean} True when the word carries a word-start marker, false
 *   when it carries a continuation marker, otherwise whether the prefix is "B".
 */
function shouldRequireWordStart(rawToken, prefix) {
  const rawWord = String(rawToken.word ?? "");
  if (hasWordStartMarker(rawWord)) {
    return true;
  }
  if (hasContinuationMarker(rawWord)) {
    return false;
  }
  return prefix === "B";
}

/**
 * Resolves the `[start, end]` offsets of a token inside `text`.
 *
 * The token's own integer `start`/`end` are used when they describe a
 * non-empty range inside the text. Otherwise the cleaned token text is
 * searched from `from`, first with exact case and then case-insensitively.
 *
 * @param {string} text - Chunk text.
 * @param {object} rawToken - Raw token from the pipeline.
 * @param {string} tokenText - Cleaned token text.
 * @param {number} from - Offset at which the textual search starts.
 * @param {object} [options]
 * @param {string} [options.prefix] - BIO prefix of the token.
 * @returns {[number, number]|null} Offsets, or null when the token cannot be
 *   located.
 */
function findTokenPosition(text, rawToken, tokenText, from, options = {}) {
  if (Number.isInteger(rawToken.start) && Number.isInteger(rawToken.end)) {
    const { start, end } = rawToken;
    if (start >= 0 && end > start && end <= text.length) {
      return [start, end];
    }
  }

  if (!tokenText) {
    return null;
  }

  const requireWordStart = shouldRequireWordStart(rawToken, options.prefix);
  return (
    findTokenTextPosition(text, tokenText, from, { requireWordStart }) ??
    findTokenTextPosition(text, tokenText, from, { requireWordStart, ignoreCase: true })
  );
}

/**
 * Locates every labelled token in `text` and merges consecutive tokens of the
 * same entity into spans.
 *
 * A token opens a new entity when it is the first one, has prefix "B", has a
 * different type than the current entity, or starts more than MAX_MERGE_GAP
 * offsets after the current entity's end. Otherwise it extends the current
 * entity, whose score becomes the mean of its tokens' scores.
 *
 * @param {string} text - Chunk text the tokens were produced from.
 * @param {Iterable<object>} rawTokens - Raw tokens from the pipeline.
 * @returns {Array<{entity_group: string, start: number, end: number,
 *   word: string, score: number}>} Merged entities with chunk-relative offsets.
 */
export function mergeTokenEntities(text, rawTokens) {
  const tokens = [];
  let searchFrom = 0;

  for (const rawToken of rawTokens) {
    const { prefix, type } = splitBio(getTokenLabel(rawToken));
    if (!type || type === "O") {
      continue;
    }

    const tokenText = cleanTokenWord(rawToken.word);
    const position = findTokenPosition(text, rawToken, tokenText, searchFrom, { prefix });
    if (!position) {
      console.warn("Token introuvable dans le chunk:", rawToken);
      continue;
    }

    const [start, end] = position;
    tokens.push({ type, prefix, start, end, score: rawToken.score ?? 0 });
    // Later tokens are searched for after this one.
    searchFrom = end;
  }

  const merged = [];
  // Running totals for the entity currently being extended; kept outside the
  // entity objects so nothing temporary has to be removed from the output.
  let scoreSum = 0;
  let scoreCount = 0;

  for (const token of tokens) {
    const last = merged[merged.length - 1];
    const startsNewEntity =
      !last ||
      token.prefix === "B" ||
      token.type !== last.entity_group ||
      token.start > last.end + MAX_MERGE_GAP;

    if (startsNewEntity) {
      merged.push({
        entity_group: token.type,
        start: token.start,
        end: token.end,
        word: text.slice(token.start, token.end),
        score: token.score,
      });
      scoreSum = token.score;
      scoreCount = 1;
      continue;
    }

    last.end = token.end;
    last.word = text.slice(last.start, last.end);
    scoreSum += token.score;
    scoreCount += 1;
    last.score = scoreSum / scoreCount;
  }

  return merged;
}