Spaces:
Running
Running
| import { chunkLongText } from "./chunker.js?v=20260513-rps2"; | |
| import { deduplicateEntities } from "./deduplicate_entities.js?v=20260513-rps2"; | |
| import { inferRuleEntities } from "./rule_entities.js?v=20260513-rps2"; | |
/**
 * Runs the NER pipeline over a long text, chunk by chunk, and combines the
 * model entities with rule-based entities into one deduplicated list.
 *
 * @param {Function} pipe - Token-classification callable; invoked as
 *   `pipe(chunk.text, { ignore_labels: ["O"] })` and awaited.
 * @param {string} inputText - Raw text handed to `chunkLongText`.
 * @param {object} [options]
 * @param {object} [options.chunkOptions={}] - Forwarded to `chunkLongText`.
 * @param {number} [options.minScore=0] - Forwarded to `deduplicateEntities`.
 * @param {Function|null} [options.onProgress=null] - Called after each chunk
 *   with `{ phase: "ner", chunk, chunks, done, total }`.
 * @returns {Promise<object>} `{ text, chunks, entities, rawEntityCount,
 *   modelEntityCount, ruleEntityCount }`. The same keys are present whether
 *   or not the text is empty.
 * @throws {Error} When `pipe` is falsy (model not loaded).
 */
export async function analyzeLongText(pipe, inputText, options = {}) {
  if (!pipe) {
    throw new Error("Modèle NER non chargé");
  }

  const { chunkOptions = {}, minScore = 0, onProgress = null } = options;
  const { text, chunks } = chunkLongText(inputText, chunkOptions);

  if (!text) {
    // Keep the result shape identical to the non-empty path so callers can
    // read every counter without checking for undefined.
    return {
      text,
      chunks,
      entities: [],
      rawEntityCount: 0,
      modelEntityCount: 0,
      ruleEntityCount: 0,
    };
  }

  const allEntities = [];

  for (const chunk of chunks) {
    const rawTokens = await pipe(chunk.text, { ignore_labels: ["O"] });
    const localEntities = mergeTokenEntities(chunk.text, rawTokens);

    for (const entity of localEntities) {
      // Entity offsets are relative to the chunk; shift them by the chunk
      // start so they index into the full text.
      const start = chunk.start + entity.start;
      const end = chunk.start + entity.end;
      allEntities.push({
        ...entity,
        start,
        end,
        word: text.slice(start, end),
        chunk_id: chunk.id,
        chunk_index: chunk.index,
      });
    }

    if (typeof onProgress === "function") {
      onProgress({
        phase: "ner",
        chunk,
        chunks,
        done: chunk.index + 1,
        total: chunks.length,
      });
    }
  }

  const ruleEntities = inferRuleEntities(text, chunks);
  const mergedCandidates = [...allEntities, ...ruleEntities];
  const entities = deduplicateEntities(mergedCandidates, text, { minScore });

  return {
    text,
    chunks,
    entities,
    rawEntityCount: mergedCandidates.length,
    modelEntityCount: allEntities.length,
    ruleEntityCount: ruleEntities.length,
  };
}
/**
 * Returns the label carried by a token as a string.
 *
 * The first non-nullish value among `entity_group`, `entity` and `label`
 * wins; when none is set the result is the empty string.
 *
 * @param {object} token - Raw token produced by the pipeline.
 * @returns {string}
 */
function getTokenLabel(token) {
  for (const key of ["entity_group", "entity", "label"]) {
    const candidate = token[key];
    if (candidate != null) {
      return String(candidate);
    }
  }
  return "";
}
/**
 * Splits a label into a prefix and an entity type.
 *
 * - "B-XXX" / "I-XXX" yield that prefix and "XXX".
 * - Any other truthy label except "O" is treated as the start of an entity
 *   of that type (prefix "B").
 * - Everything else maps to `{ prefix: "O", type: "O" }`.
 *
 * @param {string} label
 * @returns {{ prefix: string, type: string }}
 */
function splitBio(label) {
  const bio = /^(B|I)-(.+)$/.exec(String(label));
  if (bio !== null) {
    const [, prefix, type] = bio;
    return { prefix, type };
  }

  const isBareEntityLabel = Boolean(label) && label !== "O";
  return isBareEntityLabel
    ? { prefix: "B", type: label }
    : { prefix: "O", type: "O" };
}
/**
 * Strips marker characters from a token's surface form.
 *
 * Removes a leading run of "▁", then a leading "##", then every "Ġ", and
 * finally trims surrounding whitespace. A nullish word yields "".
 * (These look like sub-word tokenizer markers — confirm against the model's
 * tokenizer.)
 *
 * @param {*} word
 * @returns {string}
 */
function cleanTokenWord(word) {
  const surface = word == null ? "" : String(word);
  const withoutLeadingBlocks = surface.replace(/^▁+/, "");
  const withoutLeadingHashes = withoutLeadingBlocks.replace(/^##/, "");
  const withoutGMarkers = withoutLeadingHashes.replace(/Ġ/g, "");
  return withoutGMarkers.trim();
}
/**
 * Tells whether a token's surface form begins with "▁" or "Ġ".
 *
 * @param {*} word - Nullish values are treated as the empty string.
 * @returns {boolean}
 */
function hasWordStartMarker(word) {
  const surface = String(word ?? "");
  return surface.search(/^[▁Ġ]+/) === 0;
}
/**
 * Tells whether a token's surface form begins with "##".
 *
 * @param {*} word - Nullish values are treated as the empty string.
 * @returns {boolean}
 */
function hasContinuationMarker(word) {
  const surface = String(word ?? "");
  return surface.startsWith("##");
}
/**
 * Tells whether the given string contains at least one Unicode letter
 * (\p{L}) or number (\p{N}).
 *
 * @param {string} char - Normally a single character.
 * @returns {boolean}
 */
function isLetterOrNumber(char) {
  const letterOrNumber = /[\p{L}\p{N}]/u;
  return letterOrNumber.exec(char) !== null;
}
/**
 * Tells whether `index` sits at the start of a word in `text`: either at or
 * before the beginning of the string, or right after a character that is
 * neither a letter nor a number.
 *
 * @param {string} text
 * @param {number} index
 * @returns {boolean}
 */
function isWordStartBoundary(text, index) {
  if (index <= 0) {
    return true;
  }
  const previousChar = text[index - 1];
  return !isLetterOrNumber(previousChar);
}
/**
 * Finds the first occurrence of `tokenText` in `text` at or after `from`.
 *
 * @param {string} text - Text to search in.
 * @param {string} tokenText - Cleaned token surface form to look for.
 * @param {number} from - Index at which the search starts.
 * @param {object} [options]
 * @param {boolean} [options.requireWordStart=false] - Only accept matches
 *   for which `isWordStartBoundary(text, index)` holds.
 * @param {boolean} [options.ignoreCase=false] - Compare after lowercasing
 *   both sides with the "fr-FR" locale.
 * @returns {[number, number]|null} `[start, end]` offsets into the ORIGINAL
 *   `text`, or null when nothing matches.
 */
function findTokenTextPosition(text, tokenText, from, options = {}) {
  const { requireWordStart = false, ignoreCase = false } = options;
  const fold = (value) => (ignoreCase ? value.toLocaleLowerCase("fr-FR") : value);
  const isAcceptable = (index) => !requireWordStart || isWordStartBoundary(text, index);

  const haystack = fold(text);
  const needle = fold(tokenText);

  if (haystack.length === text.length) {
    // Folding kept the length, so indices in `haystack` are valid in `text`.
    let index = haystack.indexOf(needle, from);
    while (index !== -1) {
      if (isAcceptable(index)) {
        return [index, index + tokenText.length];
      }
      index = haystack.indexOf(needle, index + 1);
    }
    return null;
  }

  // Lowercasing can change the string length (e.g. U+0130 "İ" becomes
  // "i" + U+0307), which would shift every later index. Scan the original
  // text window by window instead so the returned offsets stay exact.
  const width = tokenText.length;
  const first = Number.isInteger(from) && from > 0 ? from : 0;
  for (let index = first; index + width <= text.length; index += 1) {
    const candidate = fold(text.slice(index, index + width));
    if (candidate === needle && isAcceptable(index)) {
      return [index, index + width];
    }
  }
  return null;
}
/**
 * Decides whether a token must be matched at a word boundary when searching
 * for it in the text.
 *
 * - A word-start marker on the raw word forces a boundary match.
 * - A continuation marker ("##") disables it.
 * - Otherwise the boundary is required only for tokens with prefix "B".
 *
 * @param {object} rawToken - Raw token; only `word` is read.
 * @param {string} prefix - Prefix previously extracted from the token label.
 * @returns {boolean}
 */
function shouldRequireWordStart(rawToken, prefix) {
  const surface = String(rawToken.word ?? "");
  if (hasWordStartMarker(surface)) {
    return true;
  }
  return hasContinuationMarker(surface) ? false : prefix === "B";
}
/**
 * Resolves the `[start, end]` offsets of a token inside `text`.
 *
 * Offsets carried by the token itself are used when they are integers that
 * describe a non-empty range inside `text`. Otherwise the cleaned token text
 * is searched from `from`, first case-sensitively and then ignoring case.
 *
 * @param {string} text - Text the token came from.
 * @param {object} rawToken - Raw token; `start`, `end` and `word` are read.
 * @param {string} tokenText - Cleaned surface form used for the text search.
 * @param {number} from - Index at which the text search starts.
 * @param {object} [options]
 * @param {string} [options.prefix] - Token prefix, forwarded to
 *   `shouldRequireWordStart`.
 * @returns {[number, number]|null} Offsets, or null when the token cannot be
 *   located.
 */
function findTokenPosition(text, rawToken, tokenText, from, options = {}) {
  const { start, end } = rawToken;
  const hasUsableOffsets =
    Number.isInteger(start) &&
    Number.isInteger(end) &&
    start >= 0 &&
    end > start &&
    end <= text.length;
  if (hasUsableOffsets) {
    return [start, end];
  }

  if (!tokenText) {
    return null;
  }

  const requireWordStart = shouldRequireWordStart(rawToken, options.prefix);
  // Exact-case search first, then the case-insensitive one.
  const attempts = [{ requireWordStart }, { requireWordStart, ignoreCase: true }];
  for (const attempt of attempts) {
    const position = findTokenTextPosition(text, tokenText, from, attempt);
    if (position) {
      return position;
    }
  }
  return null;
}
/**
 * Turns the raw tokens returned by the pipeline for one text into merged
 * entity spans.
 *
 * Tokens labelled "O" (or without a type) are skipped, and tokens that
 * cannot be located in `text` are logged with `console.warn` and skipped.
 * A located token extends the previous entity unless it has prefix "B", has
 * a different type, or starts more than `maxGap` characters after the
 * previous entity's end; in those cases it opens a new entity. An entity's
 * score is the arithmetic mean of the scores of its tokens.
 *
 * @param {string} text - Text the tokens were produced from.
 * @param {Iterable<object>|null|undefined} rawTokens - Raw pipeline tokens;
 *   nullish is treated as no tokens.
 * @param {object} [options]
 * @param {number} [options.maxGap=8] - Largest allowed distance, in string
 *   indices, between the end of an entity and the start of a token that
 *   continues it.
 * @returns {Array<{entity_group: string, start: number, end: number,
 *   word: string, score: number}>}
 */
export function mergeTokenEntities(text, rawTokens, options = {}) {
  const { maxGap = 8 } = options;

  const tokens = [];
  let searchFrom = 0;
  for (const rawToken of rawTokens ?? []) {
    const label = getTokenLabel(rawToken);
    const { prefix, type } = splitBio(label);
    if (!type || type === "O") {
      continue;
    }

    const tokenText = cleanTokenWord(rawToken.word);
    const position = findTokenPosition(text, rawToken, tokenText, searchFrom, { prefix });
    if (!position) {
      console.warn("Token introuvable dans le chunk:", rawToken);
      continue;
    }

    const [start, end] = position;
    tokens.push({
      type,
      prefix,
      start,
      end,
      word: text.slice(start, end),
      score: rawToken.score ?? 0,
    });
    // Later tokens are searched after this one so repeated words resolve
    // to successive occurrences.
    searchFrom = end;
  }

  // Each span keeps the public entity plus the running score total and token
  // count, so no temporary field has to be attached to (and later deleted
  // from) the returned objects.
  const spans = [];
  for (const token of tokens) {
    const current = spans[spans.length - 1];
    const startsNewEntity =
      !current ||
      token.prefix === "B" ||
      token.type !== current.entity.entity_group ||
      token.start > current.entity.end + maxGap;

    if (startsNewEntity) {
      spans.push({
        entity: {
          entity_group: token.type,
          start: token.start,
          end: token.end,
          word: text.slice(token.start, token.end),
          score: token.score,
        },
        scoreSum: token.score,
        tokenCount: 1,
      });
      continue;
    }

    const { entity } = current;
    entity.end = token.end;
    entity.word = text.slice(entity.start, entity.end);
    current.scoreSum += token.score;
    current.tokenCount += 1;
    entity.score = current.scoreSum / current.tokenCount;
  }

  return spans.map((span) => span.entity);
}