Spaces:

jma-informatique
/

NER_DEMO

Running

App Files Files Community

NER_DEMO / ner_long_text.js

jma-informatique's picture

jma-informatique

Upload 8 files

77c70e7 verified 7 days ago

history blame contribute delete

5.73 kB

	import { chunkLongText } from "./chunker.js?v=20260513-rps2";
	import { deduplicateEntities } from "./deduplicate_entities.js?v=20260513-rps2";
	import { inferRuleEntities } from "./rule_entities.js?v=20260513-rps2";

	/**
	 * Run NER over a long text chunk by chunk and return the combined entities.
	 *
	 * The input is split with `chunkLongText`; each chunk is sent to `pipe`, the
	 * returned tokens are merged into entities, and entity offsets are shifted
	 * from chunk-relative to text-relative positions. Rule-based entities from
	 * `inferRuleEntities` are appended before `deduplicateEntities` produces the
	 * final list.
	 *
	 * @param {Function} pipe - Token-classification callable, invoked as
	 *   `pipe(chunkText, { ignore_labels: ["O"] })` and awaited.
	 * @param {string} inputText - Text to analyse.
	 * @param {object} [options]
	 * @param {object} [options.chunkOptions={}] - Forwarded to `chunkLongText`.
	 * @param {number} [options.minScore=0] - Forwarded to `deduplicateEntities`.
	 * @param {Function|null} [options.onProgress=null] - Called after each chunk
	 *   with `{ phase: "ner", chunk, chunks, done, total }`.
	 * @returns {Promise<object>} `{ text, chunks, entities, rawEntityCount,
	 *   modelEntityCount, ruleEntityCount }`.
	 * @throws {Error} When `pipe` is falsy (model not loaded).
	 */
	export async function analyzeLongText(pipe, inputText, options = {}) {
		if (!pipe) {
			throw new Error("Modèle NER non chargé");
		}

		const { chunkOptions = {}, minScore = 0, onProgress = null } = options;
		const { text, chunks } = chunkLongText(inputText, chunkOptions);

		if (!text) {
			// Same shape as the normal return so callers can read every counter
			// without checking for undefined.
			return {
				text,
				chunks,
				entities: [],
				rawEntityCount: 0,
				modelEntityCount: 0,
				ruleEntityCount: 0,
			};
		}

		const allEntities = [];

		// Chunks are processed one after another so progress is reported in order.
		for (const chunk of chunks) {
			const rawTokens = await pipe(chunk.text, { ignore_labels: ["O"] });
			const localEntities = mergeTokenEntities(chunk.text, rawTokens);

			for (const entity of localEntities) {
				// Shift chunk-relative offsets into the coordinates of the full text.
				const start = chunk.start + entity.start;
				const end = chunk.start + entity.end;
				allEntities.push({
					...entity,
					start,
					end,
					word: text.slice(start, end),
					chunk_id: chunk.id,
					chunk_index: chunk.index,
				});
			}

			if (typeof onProgress === "function") {
				onProgress({
					phase: "ner",
					chunk,
					chunks,
					// NOTE(review): assumes chunk.index is the 0-based position in `chunks`.
					done: chunk.index + 1,
					total: chunks.length,
				});
			}
		}

		const ruleEntities = inferRuleEntities(text, chunks);
		const mergedCandidates = [...allEntities, ...ruleEntities];
		const entities = deduplicateEntities(mergedCandidates, text, { minScore });

		return {
			text,
			chunks,
			entities,
			rawEntityCount: mergedCandidates.length,
			modelEntityCount: allEntities.length,
			ruleEntityCount: ruleEntities.length,
		};
	}

	/**
	 * Read the label of a raw token as a string.
	 *
	 * Looks at `entity_group`, then `entity`, then `label`, and uses the first
	 * one that is neither null nor undefined; yields "" when none is set.
	 *
	 * @param {object} token - Raw token object.
	 * @returns {string} The label text, possibly empty.
	 */
	function getTokenLabel(token) {
		for (const key of ["entity_group", "entity", "label"]) {
			const candidate = token[key];
			if (candidate !== null && candidate !== undefined) {
				return String(candidate);
			}
		}
		return "";
	}

	/**
	 * Split a BIO-style tag into its prefix and entity type.
	 *
	 * - "B-XXX" / "I-XXX" give `{ prefix: "B" | "I", type: "XXX" }`.
	 * - Any other non-empty label except "O" is treated as the start of an
	 *   entity of that type (`prefix: "B"`).
	 * - "O" and empty/falsy labels give `{ prefix: "O", type: "O" }`.
	 *
	 * @param {string} label - Tag to parse.
	 * @returns {{prefix: string, type: string}} Parsed prefix and type.
	 */
	function splitBio(label) {
		// The alternation must be a bare `|`; an escaped pipe would only match
		// the literal text "B|I-" and no real BIO tag would ever be split.
		const match = String(label).match(/^(B|I)-(.+)$/);
		if (match) {
			return { prefix: match[1], type: match[2] };
		}

		if (label && label !== "O") {
			return { prefix: "B", type: label };
		}

		return { prefix: "O", type: "O" };
	}

	/**
	 * Strip tokenizer markers from a token's surface form.
	 *
	 * Removes, in this order: leading "▁" characters, a leading "##", and every
	 * "Ġ" anywhere in the string; the result is then trimmed. Null/undefined
	 * becomes "".
	 *
	 * @param {*} word - Raw token word.
	 * @returns {string} The cleaned text.
	 */
	function cleanTokenWord(word) {
		const markerPatterns = [/^▁+/, /^##/, /Ġ/g];
		const stripped = markerPatterns.reduce(
			(current, pattern) => current.replace(pattern, ""),
			String(word ?? ""),
		);
		return stripped.trim();
	}

	/**
	 * Tell whether a raw token word begins with a "▁" or "Ġ" marker.
	 *
	 * @param {*} word - Raw token word; null/undefined is treated as "".
	 * @returns {boolean} True when the string starts with one of the markers.
	 */
	function hasWordStartMarker(word) {
		const rawWord = String(word ?? "");
		// The pattern is anchored, so the only possible match position is 0.
		return rawWord.search(/^[▁Ġ]+/) === 0;
	}

	/**
	 * Tell whether a raw token word begins with the "##" marker.
	 *
	 * @param {*} word - Raw token word; null/undefined is treated as "".
	 * @returns {boolean} True when the string starts with "##".
	 */
	function hasContinuationMarker(word) {
		const rawWord = String(word ?? "");
		return rawWord.match(/^##/) !== null;
	}

	/**
	 * Tell whether the given string contains a Unicode letter or number.
	 *
	 * The pattern is not anchored, so for a multi-character argument the result
	 * is true as soon as any letter or number appears in it.
	 *
	 * @param {string} char - Text to inspect.
	 * @returns {boolean} True when a letter (\p{L}) or number (\p{N}) is found.
	 */
	function isLetterOrNumber(char) {
		const letterOrNumber = /[\p{L}\p{N}]/u;
		return letterOrNumber.exec(char) !== null;
	}

	/**
	 * Tell whether `index` is a position where a word may start in `text`.
	 *
	 * A position qualifies when it is at or before the beginning of the text, or
	 * when the character just before it is not a letter or number.
	 *
	 * @param {string} text - Text being scanned.
	 * @param {number} index - Candidate start offset.
	 * @returns {boolean} True when a word can start at `index`.
	 */
	function isWordStartBoundary(text, index) {
		if (index <= 0) {
			return true;
		}
		return !isLetterOrNumber(text[index - 1]);
	}

	/**
	 * Locate the first occurrence of `tokenText` in `text` at or after `from`.
	 *
	 * @param {string} text - Text to search in.
	 * @param {string} tokenText - Text to look for.
	 * @param {number} from - Offset at which the search starts.
	 * @param {object} [options]
	 * @param {boolean} [options.requireWordStart=false] - Only accept matches
	 *   that begin at a word-start boundary of `text`.
	 * @param {boolean} [options.ignoreCase=false] - Compare lower-cased copies
	 *   (French locale) of both strings.
	 * @returns {number[]|null} `[start, end]` offsets, or null when not found.
	 */
	function findTokenTextPosition(text, tokenText, from, options = {}) {
		const { requireWordStart = false, ignoreCase = false } = options;

		// An empty needle matches everywhere; with requireWordStart the scan below
		// could then stall at the end of the text, so it is rejected up front.
		if (!tokenText) {
			return null;
		}

		// NOTE(review): offsets found in the lower-cased copy are applied to `text`
		// unchanged, which assumes lower-casing preserves string length.
		const haystack = ignoreCase ? text.toLocaleLowerCase("fr-FR") : text;
		const needle = ignoreCase ? tokenText.toLocaleLowerCase("fr-FR") : tokenText;

		let index = haystack.indexOf(needle, from);
		while (index !== -1) {
			if (!requireWordStart || isWordStartBoundary(text, index)) {
				return [index, index + tokenText.length];
			}
			// Match starts inside a word: continue with the next occurrence.
			index = haystack.indexOf(needle, index + 1);
		}

		return null;
	}

	/**
	 * Decide whether a token must be matched at a word-start boundary.
	 *
	 * A word-start marker on the raw word forces true, a continuation marker
	 * forces false, and otherwise the answer is true only for a "B" prefix.
	 *
	 * @param {object} rawToken - Raw token; only its `word` property is read.
	 * @param {string} prefix - BIO prefix of the token.
	 * @returns {boolean} True when the match has to start a word.
	 */
	function shouldRequireWordStart(rawToken, prefix) {
		const surface = String(rawToken.word ?? "");
		if (hasWordStartMarker(surface)) {
			return true;
		}
		// No explicit word-start marker: a continuation piece never needs a
		// boundary, anything else needs one only when it opens an entity.
		return !hasContinuationMarker(surface) && prefix === "B";
	}

	/**
	 * Resolve the `[start, end]` offsets of a token inside `text`.
	 *
	 * Integer `start`/`end` values on the raw token are used as-is when they
	 * describe a non-empty range inside the text. Otherwise the cleaned token
	 * text is searched from `from`, first case-sensitively and then ignoring
	 * case.
	 *
	 * @param {string} text - Text the token belongs to.
	 * @param {object} rawToken - Raw token (`start`, `end` and `word` are read).
	 * @param {string} tokenText - Cleaned token text used for the search.
	 * @param {number} from - Offset at which the text search starts.
	 * @param {object} [options]
	 * @param {string} [options.prefix] - BIO prefix, passed on to
	 *   `shouldRequireWordStart`.
	 * @returns {number[]|null} `[start, end]`, or null when nothing matches.
	 */
	function findTokenPosition(text, rawToken, tokenText, from, options = {}) {
		const { start, end } = rawToken;
		const hasOffsets = Number.isInteger(start) && Number.isInteger(end);
		if (hasOffsets && start >= 0 && end > start && end <= text.length) {
			return [start, end];
		}

		// Without usable offsets an empty token text leaves nothing to search for.
		if (!tokenText) {
			return null;
		}

		const requireWordStart = shouldRequireWordStart(rawToken, options.prefix);

		// The case-insensitive pass only runs when the exact pass finds nothing.
		return (
			findTokenTextPosition(text, tokenText, from, { requireWordStart }) ||
			findTokenTextPosition(text, tokenText, from, {
				requireWordStart,
				ignoreCase: true,
			}) ||
			null
		);
	}

	/**
	 * Turn raw token-level predictions into entity spans.
	 *
	 * First pass: every token with a non-"O" type is located in `text`; tokens
	 * that cannot be located are reported with `console.warn` and skipped. The
	 * text search for each token resumes where the previous located token ended.
	 *
	 * Second pass: a located token opens a new entity when it is the first one,
	 * carries a "B" prefix, has a different type than the current entity, or
	 * starts more than 8 characters after the current entity ends. Otherwise it
	 * extends the current entity, whose score becomes the mean of its token
	 * scores.
	 *
	 * @param {string} text - Text the tokens were predicted on.
	 * @param {Iterable<object>|null|undefined} rawTokens - Raw tokens; a nullish
	 *   value is treated as no tokens.
	 * @returns {object[]} Entities shaped as
	 *   `{ entity_group, start, end, word, score }`.
	 */
	export function mergeTokenEntities(text, rawTokens) {
		// Largest gap (in characters) allowed between two tokens of one entity.
		const maxMergeGap = 8;

		const tokens = [];
		let searchFrom = 0;

		for (const rawToken of rawTokens ?? []) {
			const { prefix, type } = splitBio(getTokenLabel(rawToken));
			if (!type || type === "O") {
				continue;
			}

			const tokenText = cleanTokenWord(rawToken.word);
			const position = findTokenPosition(text, rawToken, tokenText, searchFrom, { prefix });
			if (!position) {
				console.warn("Token introuvable dans le chunk:", rawToken);
				continue;
			}

			const [start, end] = position;
			tokens.push({ type, prefix, start, end, score: rawToken.score ?? 0 });
			searchFrom = end;
		}

		// Each group keeps a running score total so the mean can be derived at
		// the end without storing temporary fields on the returned objects.
		const groups = [];

		for (const token of tokens) {
			const last = groups[groups.length - 1];
			const shouldStartNew =
				!last ||
				token.prefix === "B" ||
				token.type !== last.type ||
				token.start > last.end + maxMergeGap;

			if (shouldStartNew) {
				groups.push({
					type: token.type,
					start: token.start,
					end: token.end,
					scoreSum: token.score,
					count: 1,
				});
				continue;
			}

			last.end = token.end;
			last.scoreSum += token.score;
			last.count += 1;
		}

		return groups.map(({ type, start, end, scoreSum, count }) => ({
			entity_group: type,
			start,
			end,
			word: text.slice(start, end),
			score: scoreSum / count,
		}));
	}