NER_DEMO / ner_long_text.js
jma-informatique's picture
Upload 8 files
77c70e7 verified
import { chunkLongText } from "./chunker.js?v=20260513-rps2";
import { deduplicateEntities } from "./deduplicate_entities.js?v=20260513-rps2";
import { inferRuleEntities } from "./rule_entities.js?v=20260513-rps2";
/**
 * Run a token-classification pipeline over a long text, chunk by chunk, and
 * combine the model output with rule-based entities.
 *
 * The input is split with `chunkLongText`. Each chunk's text is sent to
 * `pipe`, the returned tokens are turned into entities by
 * `mergeTokenEntities`, and the chunk-relative offsets are shifted by
 * `chunk.start` so they index into the text returned by the chunker.
 * Entities from `inferRuleEntities` are appended and the combined list is
 * passed through `deduplicateEntities`.
 *
 * @param {Function} pipe - Called as `pipe(chunkText, { ignore_labels: ["O"] })`
 *   and awaited; its result is handed to `mergeTokenEntities`.
 * @param {string} inputText - Text handed to `chunkLongText`.
 * @param {object} [options]
 * @param {object} [options.chunkOptions={}] - Forwarded to `chunkLongText`.
 * @param {number} [options.minScore=0] - Forwarded to `deduplicateEntities`.
 * @param {Function|null} [options.onProgress=null] - Called after each chunk
 *   with `{ phase: "ner", chunk, chunks, done, total }`.
 * @returns {Promise<object>} `{ text, chunks, entities, rawEntityCount,
 *   modelEntityCount, ruleEntityCount }`. The same keys are present when the
 *   chunker returns an empty text (all counters are then 0).
 * @throws {Error} When `pipe` is falsy.
 */
export async function analyzeLongText(pipe, inputText, options = {}) {
  if (!pipe) {
    throw new Error("Modèle NER non chargé");
  }

  const { chunkOptions = {}, minScore = 0, onProgress = null } = options;
  const { text, chunks } = chunkLongText(inputText, chunkOptions);

  if (!text) {
    // Expose the same keys as the regular result so callers can read the
    // counters without special-casing empty input.
    return {
      text,
      chunks,
      entities: [],
      rawEntityCount: 0,
      modelEntityCount: 0,
      ruleEntityCount: 0,
    };
  }

  const allEntities = [];
  for (const chunk of chunks) {
    const rawTokens = await pipe(chunk.text, { ignore_labels: ["O"] });
    const localEntities = mergeTokenEntities(chunk.text, rawTokens);

    for (const entity of localEntities) {
      // Offsets coming back from the merge step are relative to the chunk;
      // shift them so they point into the full text.
      const start = chunk.start + entity.start;
      const end = chunk.start + entity.end;
      allEntities.push({
        ...entity,
        start,
        end,
        word: text.slice(start, end),
        chunk_id: chunk.id,
        chunk_index: chunk.index,
      });
    }

    if (typeof onProgress === "function") {
      onProgress({
        phase: "ner",
        chunk,
        chunks,
        // NOTE(review): assumes chunk.index is the 0-based position of the
        // chunk in `chunks` - confirm against the chunker.
        done: chunk.index + 1,
        total: chunks.length,
      });
    }
  }

  const ruleEntities = inferRuleEntities(text, chunks);
  const mergedCandidates = [...allEntities, ...ruleEntities];
  const entities = deduplicateEntities(mergedCandidates, text, { minScore });

  return {
    text,
    chunks,
    entities,
    rawEntityCount: mergedCandidates.length,
    modelEntityCount: allEntities.length,
    ruleEntityCount: ruleEntities.length,
  };
}
/**
 * Return the label carried by a raw token as a string.
 *
 * The first of `entity_group`, `entity`, `label` that is neither `null` nor
 * `undefined` wins; when none is set the result is the empty string.
 *
 * @param {object} token - Raw token object.
 * @returns {string} The label, or "" when the token carries none.
 */
function getTokenLabel(token) {
  for (const key of ["entity_group", "entity", "label"]) {
    const value = token[key];
    if (value !== null && value !== undefined) {
      return String(value);
    }
  }
  return "";
}
/**
 * Split a label into its `B`/`I` prefix and its entity type.
 *
 * - `"B-XXX"` / `"I-XXX"` give `{ prefix: "B" | "I", type: "XXX" }`.
 * - Any other truthy label except `"O"` is treated as the start of an entity
 *   whose type is the label itself.
 * - A falsy label or `"O"` gives `{ prefix: "O", type: "O" }`.
 *
 * @param {*} label - Label to split.
 * @returns {{ prefix: string, type: * }} Prefix and type.
 */
function splitBio(label) {
  const parsed = /^(B|I)-(.+)$/.exec(String(label));
  if (parsed !== null) {
    const [, prefix, type] = parsed;
    return { prefix, type };
  }

  const isOutside = !label || label === "O";
  return isOutside ? { prefix: "O", type: "O" } : { prefix: "B", type: label };
}
/**
 * Strip tokenizer markers from a token's surface form.
 *
 * Removes, in this order: a leading run of "▁", a leading "##", every "Ġ"
 * anywhere in the string, then surrounding whitespace. `null`/`undefined`
 * are treated as the empty string.
 * (Presumably these are the SentencePiece, WordPiece and byte-level BPE
 * markers - confirm against the tokenizers actually in use.)
 *
 * @param {*} word - Raw token word.
 * @returns {string} The cleaned text.
 */
function cleanTokenWord(word) {
  const markerPatterns = [/^▁+/, /^##/, /Ġ/g];

  let cleaned = String(word ?? "");
  for (const pattern of markerPatterns) {
    cleaned = cleaned.replace(pattern, "");
  }
  return cleaned.trim();
}
/**
 * Tell whether a raw token word begins with "▁" or "Ġ".
 *
 * @param {*} word - Raw token word; `null`/`undefined` count as "".
 * @returns {boolean} True when the word starts with one of the two markers.
 */
function hasWordStartMarker(word) {
  const surface = String(word ?? "");
  return /^[▁Ġ]+/.exec(surface) !== null;
}
/**
 * Tell whether a raw token word begins with "##".
 *
 * @param {*} word - Raw token word; `null`/`undefined` count as "".
 * @returns {boolean} True when the word starts with "##".
 */
function hasContinuationMarker(word) {
  const surface = word ?? "";
  return String(surface).search(/^##/) === 0;
}
/**
 * Tell whether the given string contains a Unicode letter or number
 * (general categories L* and N*).
 *
 * @param {*} char - Value to check; it is coerced to a string by the regex.
 * @returns {boolean} True when a letter or number is found.
 */
function isLetterOrNumber(char) {
  const letterOrNumber = /[\p{L}\p{N}]/u;
  return letterOrNumber.exec(char) !== null;
}
/**
 * Tell whether `index` can be the start of a word in `text`, i.e. whether it
 * is at (or before) the beginning of the text or the character just before
 * it is neither a letter nor a number.
 *
 * The preceding character is read as a full code point: when the two UTF-16
 * units before `index` form a surrogate pair, both are passed to
 * `isLetterOrNumber`. Looking at the low surrogate alone would never match a
 * letter, so a supplementary-plane letter would wrongly count as a boundary.
 *
 * @param {string} text - Text being searched.
 * @param {number} index - UTF-16 offset into `text`.
 * @returns {boolean} True when a word may start at `index`.
 */
function isWordStartBoundary(text, index) {
  if (index <= 0) {
    return true;
  }

  let previous = text[index - 1];
  const closesSurrogatePair =
    index >= 2 &&
    /^[\uDC00-\uDFFF]$/.test(previous) &&
    /^[\uD800-\uDBFF]$/.test(text[index - 2]);
  if (closesSurrogatePair) {
    previous = text[index - 2] + previous;
  }

  return !isLetterOrNumber(previous);
}
/**
 * Lower-case `value` with the "fr-FR" locale without changing its length.
 *
 * Lower-casing can lengthen a string (U+0130 becomes two code units), which
 * would shift every later offset. When that happens the string is folded
 * code point by code point and any code point whose lower-case form has a
 * different length is kept unchanged.
 *
 * @param {string} value - String to fold.
 * @returns {string} A string of the same length as `value`.
 */
function lowerCasePreservingLength(value) {
  const lowered = value.toLocaleLowerCase("fr-FR");
  if (lowered.length === value.length) {
    return lowered;
  }
  return Array.from(value, (char) => {
    const lower = char.toLocaleLowerCase("fr-FR");
    return lower.length === char.length ? lower : char;
  }).join("");
}

/**
 * Find the first occurrence of `tokenText` in `text` at or after `from`.
 *
 * @param {string} text - Text to search.
 * @param {string} tokenText - Text to look for.
 * @param {number} from - Offset at which the search starts.
 * @param {object} [options]
 * @param {boolean} [options.requireWordStart=false] - Only accept matches for
 *   which `isWordStartBoundary(text, index)` is true.
 * @param {boolean} [options.ignoreCase=false] - Compare lower-cased ("fr-FR")
 *   forms; the fold keeps string lengths unchanged so the returned offsets
 *   always index into the original `text`.
 * @returns {number[]|null} `[start, end]` offsets into `text`, or `null` when
 *   there is no acceptable match or `tokenText` is empty.
 */
function findTokenTextPosition(text, tokenText, from, options = {}) {
  const { requireWordStart = false, ignoreCase = false } = options;

  // An empty needle matches everywhere; with `requireWordStart` the scan
  // below could then never terminate, so there is nothing to locate.
  if (!tokenText) {
    return null;
  }

  const haystack = ignoreCase ? lowerCasePreservingLength(text) : text;
  const needle = ignoreCase ? lowerCasePreservingLength(tokenText) : tokenText;

  let index = haystack.indexOf(needle, from);
  while (index !== -1) {
    if (!requireWordStart || isWordStartBoundary(text, index)) {
      return [index, index + needle.length];
    }
    index = haystack.indexOf(needle, index + 1);
  }
  return null;
}
/**
 * Decide whether a token must be matched at the start of a word.
 *
 * - A word carrying a word-start marker always requires a word start.
 * - A word carrying a continuation marker never does.
 * - Otherwise only tokens with prefix "B" do.
 *
 * @param {object} rawToken - Raw token; its `word` is inspected.
 * @param {string} prefix - Prefix of the token's label.
 * @returns {boolean} True when the match must begin on a word boundary.
 */
function shouldRequireWordStart(rawToken, prefix) {
  const surface = String(rawToken.word ?? "");
  return (
    hasWordStartMarker(surface) ||
    (!hasContinuationMarker(surface) && prefix === "B")
  );
}
/**
 * Locate a token inside `text`.
 *
 * Integer `start`/`end` offsets carried by the token are used as-is when
 * they describe a non-empty span inside `text`. Otherwise the cleaned token
 * text is searched from `from`: first with exact case, then ignoring case.
 * Whether a match has to begin on a word boundary is decided by
 * `shouldRequireWordStart`.
 *
 * @param {string} text - Text the token belongs to.
 * @param {object} rawToken - Raw token.
 * @param {string} tokenText - Cleaned token text used for the search.
 * @param {number} from - Offset at which the text search starts.
 * @param {object} [options]
 * @param {string} [options.prefix] - Label prefix of the token, forwarded to
 *   `shouldRequireWordStart`.
 * @returns {number[]|null} `[start, end]` offsets, or `null` when the token
 *   cannot be located.
 */
function findTokenPosition(text, rawToken, tokenText, from, options = {}) {
  const { start, end } = rawToken;
  const hasOffsets = Number.isInteger(start) && Number.isInteger(end);
  if (hasOffsets && start >= 0 && end > start && end <= text.length) {
    return [start, end];
  }

  if (!tokenText) {
    return null;
  }

  const requireWordStart = shouldRequireWordStart(rawToken, options.prefix);
  // The case-insensitive search only runs when the exact one finds nothing.
  return (
    findTokenTextPosition(text, tokenText, from, { requireWordStart }) ||
    findTokenTextPosition(text, tokenText, from, {
      requireWordStart,
      ignoreCase: true,
    }) ||
    null
  );
}
/**
 * Turn the raw tokens returned for one text into merged entity spans.
 *
 * Pass 1 locates every labelled token in `text` (tokens labelled "O" or
 * without a type are skipped; tokens that cannot be located are logged and
 * skipped). Pass 2 walks the located tokens in order and extends the current
 * entity unless the token has prefix "B", has a different type, or starts
 * more than `maxGap` characters after the current entity ends.
 *
 * @param {string} text - Text the tokens were produced from.
 * @param {Iterable<object>|null|undefined} rawTokens - Raw tokens; `null` or
 *   `undefined` is treated as no tokens.
 * @param {object} [options]
 * @param {number} [options.maxGap=8] - Largest distance, in characters,
 *   between the end of an entity and the start of the next token for the
 *   token to be merged into that entity.
 * @returns {Array<{entity_group: string, start: number, end: number,
 *   word: string, score: number}>} Merged entities with offsets into `text`;
 *   `score` is the mean of the merged tokens' scores.
 */
export function mergeTokenEntities(text, rawTokens, options = {}) {
  const { maxGap = 8 } = options;

  // Pass 1: resolve each labelled token to a span in `text`.
  const tokens = [];
  let searchFrom = 0;
  for (const rawToken of rawTokens ?? []) {
    const { prefix, type } = splitBio(getTokenLabel(rawToken));
    if (!type || type === "O") {
      continue;
    }

    const tokenText = cleanTokenWord(rawToken.word);
    const position = findTokenPosition(text, rawToken, tokenText, searchFrom, { prefix });
    if (!position) {
      console.warn("Token introuvable dans le chunk:", rawToken);
      continue;
    }

    const [start, end] = position;
    tokens.push({ type, prefix, start, end, score: rawToken.score ?? 0 });
    // Later tokens are searched for after this one so that repeated words
    // resolve to successive occurrences.
    searchFrom = end;
  }

  // Pass 2: merge consecutive tokens that belong to the same entity.
  const merged = [];
  let scoreSum = 0;
  let scoreCount = 0;
  for (const token of tokens) {
    const last = merged[merged.length - 1];
    const startsNewEntity =
      !last ||
      token.prefix === "B" ||
      token.type !== last.entity_group ||
      token.start > last.end + maxGap;

    if (startsNewEntity) {
      merged.push({
        entity_group: token.type,
        start: token.start,
        end: token.end,
        word: text.slice(token.start, token.end),
        score: token.score,
      });
      scoreSum = token.score;
      scoreCount = 1;
      continue;
    }

    last.end = token.end;
    last.word = text.slice(last.start, last.end);
    scoreSum += token.score;
    scoreCount += 1;
    last.score = scoreSum / scoreCount;
  }

  return merged;
}