Spaces:

jma-informatique
/

NER_DEMO

Running

File size: 5,734 Bytes

77c70e7

import { chunkLongText } from "./chunker.js?v=20260513-rps2";
import { deduplicateEntities } from "./deduplicate_entities.js?v=20260513-rps2";
import { inferRuleEntities } from "./rule_entities.js?v=20260513-rps2";

/**
 * Runs a token-classification pipeline over a long text, chunk by chunk, and
 * returns the de-duplicated entities expressed in offsets of the full text.
 *
 * The input is first split with `chunkLongText`; every chunk is sent to `pipe`,
 * the resulting tokens are merged into entities with `mergeTokenEntities`, and
 * the chunk-relative offsets are shifted by `chunk.start`. Rule-based
 * candidates from `inferRuleEntities` are then appended before
 * `deduplicateEntities` produces the final list.
 *
 * @param {Function} pipe - Async callable invoked as
 *   `pipe(chunkText, { ignore_labels: ["O"] })`; must resolve to an iterable of tokens.
 * @param {string} inputText - Text to analyse (normalised by `chunkLongText`).
 * @param {object} [options]
 * @param {object} [options.chunkOptions={}] - Forwarded to `chunkLongText`.
 * @param {number} [options.minScore=0] - Forwarded to `deduplicateEntities`.
 * @param {Function|null} [options.onProgress=null] - Called after each chunk
 *   with `{ phase: "ner", chunk, chunks, done, total }`.
 * @returns {Promise<object>} `{ text, chunks, entities, rawEntityCount,
 *   modelEntityCount, ruleEntityCount }`. All counters are present (as 0) even
 *   when the text is empty, so callers can read them unconditionally.
 * @throws {Error} When `pipe` is falsy (model not loaded).
 */
export async function analyzeLongText(pipe, inputText, options = {}) {
  if (!pipe) {
    throw new Error("Modèle NER non chargé");
  }

  // `options = {}` only covers `undefined`; tolerate an explicit `null` too.
  const {
    chunkOptions = {},
    minScore = 0,
    onProgress = null,
  } = options ?? {};

  const { text, chunks } = chunkLongText(inputText, chunkOptions);

  if (!text) {
    // Same shape as the regular result so consumers never see `undefined` counters.
    return {
      text,
      chunks,
      entities: [],
      rawEntityCount: 0,
      modelEntityCount: 0,
      ruleEntityCount: 0,
    };
  }

  const notifyProgress = typeof onProgress === "function" ? onProgress : null;
  const allEntities = [];

  // Chunks are processed one at a time, in order, so progress is reported per chunk.
  for (const chunk of chunks) {
    const rawTokens = await pipe(chunk.text, { ignore_labels: ["O"] });
    const localEntities = mergeTokenEntities(chunk.text, rawTokens);

    for (const entity of localEntities) {
      // Translate chunk-relative offsets into offsets of the full text.
      const start = chunk.start + entity.start;
      const end = chunk.start + entity.end;
      allEntities.push({
        ...entity,
        start,
        end,
        word: text.slice(start, end),
        chunk_id: chunk.id,
        chunk_index: chunk.index,
      });
    }

    if (notifyProgress) {
      notifyProgress({
        phase: "ner",
        chunk,
        chunks,
        done: chunk.index + 1,
        total: chunks.length,
      });
    }
  }

  const ruleEntities = inferRuleEntities(text, chunks);
  const mergedCandidates = [...allEntities, ...ruleEntities];
  const entities = deduplicateEntities(mergedCandidates, text, { minScore });

  return {
    text,
    chunks,
    entities,
    rawEntityCount: mergedCandidates.length,
    modelEntityCount: allEntities.length,
    ruleEntityCount: ruleEntities.length,
  };
}

/**
 * Returns the label carried by a pipeline token as a string.
 *
 * The first of `entity_group`, `entity`, `label` that is neither `null` nor
 * `undefined` wins; an empty string is returned when none is set.
 *
 * @param {object} token - Raw token emitted by the pipeline.
 * @returns {string} The label, or "" when the token carries none.
 */
function getTokenLabel(token) {
  for (const key of ["entity_group", "entity", "label"]) {
    const candidate = token[key];
    if (candidate !== null && candidate !== undefined) {
      return String(candidate);
    }
  }
  return String("");
}

/**
 * Splits a BIO-style label into its prefix and entity type.
 *
 * - "B-XXX" / "I-XXX" yield `{ prefix: "B" | "I", type: "XXX" }`.
 * - Any other truthy label except "O" is treated as the start of an entity of
 *   that type (`prefix: "B"`), with the label passed through unchanged.
 * - Everything else maps to the outside marker `{ prefix: "O", type: "O" }`.
 *
 * @param {*} label - Label to decode.
 * @returns {{prefix: string, type: *}} Decoded prefix and type.
 */
function splitBio(label) {
  const parsed = /^(B|I)-(.+)$/.exec(String(label));
  if (parsed !== null) {
    const [, prefix, type] = parsed;
    return { prefix, type };
  }

  const isBareEntityLabel = Boolean(label) && label !== "O";
  return isBareEntityLabel
    ? { prefix: "B", type: label }
    : { prefix: "O", type: "O" };
}

/**
 * Strips tokenizer artefacts from a token's surface form.
 *
 * Removes, in this order: a leading run of "▁", a leading "##", every "Ġ"
 * anywhere in the string, and finally surrounding whitespace.
 *
 * @param {*} word - Token text; `null`/`undefined` are treated as "".
 * @returns {string} The cleaned text.
 */
function cleanTokenWord(word) {
  let cleaned = String(word ?? "");
  cleaned = cleaned.replace(/^▁+/, "");
  cleaned = cleaned.replace(/^##/, "");
  cleaned = cleaned.replace(/Ġ/g, "");
  return cleaned.trim();
}

/**
 * Tells whether a raw token text begins with a "▁" or "Ġ" marker.
 *
 * @param {*} word - Raw token text; `null`/`undefined` are treated as "".
 * @returns {boolean} True when the text starts with one of the markers.
 */
function hasWordStartMarker(word) {
  const candidate = String(word ?? "");
  // The pattern is anchored, so a match can only be found at index 0.
  return candidate.search(/^[▁Ġ]+/) === 0;
}

/**
 * Tells whether a raw token text begins with the "##" continuation marker.
 *
 * @param {*} word - Raw token text; `null`/`undefined` are treated as "".
 * @returns {boolean} True when the text starts with "##".
 */
function hasContinuationMarker(word) {
  const candidate = String(word ?? "");
  return candidate.match(/^##/) !== null;
}

/**
 * Tells whether the given string contains a Unicode letter or number.
 *
 * Callers in this file pass a single character, in which case this answers
 * "is this character a letter or a digit?".
 *
 * @param {string} char - Character (or string) to inspect.
 * @returns {boolean} True when a `\p{L}` or `\p{N}` code point is present.
 */
function isLetterOrNumber(char) {
  const letterOrNumber = /[\p{L}\p{N}]/u;
  return letterOrNumber.test(char);
}

/**
 * Tells whether `index` is a position where a word may start, i.e. the code
 * point just before it is not a letter or a number.
 *
 * The look-behind works on whole code points rather than single UTF-16 units:
 * - a surrogate pair before `index` is read as one character, so an astral
 *   letter (e.g. CJK Extension B) is recognised as a letter;
 * - combining marks are skipped, so in decomposed text ("e" + U+0301) the
 *   position after the accent is still inside the word.
 *
 * @param {string} text - Text being scanned.
 * @param {number} index - Candidate start offset (UTF-16 units).
 * @returns {boolean} True at offset 0 (or below) and after any non-letter,
 *   non-number character; false otherwise, including for offsets past the end
 *   of `text`.
 */
function isWordStartBoundary(text, index) {
  if (index <= 0) {
    return true;
  }
  if (index > text.length) {
    // No character precedes such an offset; keep reporting "not a boundary".
    return false;
  }

  // Returns the full code point (1 or 2 UTF-16 units) that ends at `position`.
  const codePointBefore = (position) => {
    const low = text.charCodeAt(position - 1);
    if (position >= 2 && low >= 0xdc00 && low <= 0xdfff) {
      const high = text.charCodeAt(position - 2);
      if (high >= 0xd800 && high <= 0xdbff) {
        return text.slice(position - 2, position);
      }
    }
    return text[position - 1];
  };

  let cursor = index;
  while (cursor > 0) {
    const previous = codePointBefore(cursor);
    if (!/\p{M}/u.test(previous)) {
      return !isLetterOrNumber(previous);
    }
    // Combining mark: the decision belongs to the base character before it.
    cursor -= previous.length;
  }

  // Only combining marks precede the position: nothing word-like before it.
  return true;
}

/**
 * Finds the first occurrence of `tokenText` in `text` at or after `from`.
 *
 * @param {string} text - Text to search in.
 * @param {string} tokenText - Cleaned token text to look for.
 * @param {number} from - Offset in `text` where the search starts.
 * @param {object} [options]
 * @param {boolean} [options.requireWordStart=false] - Only accept matches whose
 *   start satisfies `isWordStartBoundary`.
 * @param {boolean} [options.ignoreCase=false] - Compare using
 *   `toLocaleLowerCase("fr-FR")` on both sides.
 * @returns {number[]|null} `[start, end]` offsets into the ORIGINAL `text`, or
 *   `null` when nothing matches (or when `tokenText` is empty).
 */
function findTokenTextPosition(text, tokenText, from, options = {}) {
  const { requireWordStart = false, ignoreCase = false } = options;

  // An empty needle "matches" everywhere; combined with requireWordStart the
  // scan below could spin forever at the end of the text. Nothing to locate.
  if (!tokenText) {
    return null;
  }

  const isAcceptableStart = (start) => !requireWordStart || isWordStartBoundary(text, start);

  // Walks successive matches of `needle` in `haystack`; `toSpan` converts a
  // haystack index into a [start, end] pair expressed in `text` offsets.
  const scan = (haystack, needle, startAt, toSpan) => {
    let index = haystack.indexOf(needle, startAt);
    while (index !== -1) {
      const span = toSpan(index);
      if (isAcceptableStart(span[0])) {
        return span;
      }
      index = haystack.indexOf(needle, index + 1);
    }
    return null;
  };

  if (!ignoreCase) {
    return scan(text, tokenText, from, (index) => [index, index + tokenText.length]);
  }

  const needle = tokenText.toLocaleLowerCase("fr-FR");
  const lowered = text.toLocaleLowerCase("fr-FR");

  if (lowered.length === text.length) {
    // Lower-casing kept every offset in place: haystack indices are text indices.
    return scan(lowered, needle, from, (index) => [index, index + needle.length]);
  }

  // Lower-casing changed the length (e.g. "İ" becomes "i" + U+0307), so
  // haystack indices no longer line up with `text`. Fold code point by code
  // point and remember which original span produced each folded unit.
  let folded = "";
  const startOf = [];
  const endOf = [];
  for (let offset = 0; offset < text.length; ) {
    const char = String.fromCodePoint(text.codePointAt(offset));
    const lowerChar = char.toLocaleLowerCase("fr-FR");
    for (let k = 0; k < lowerChar.length; k += 1) {
      startOf.push(offset);
      endOf.push(offset + char.length);
    }
    folded += lowerChar;
    offset += char.length;
  }

  const minStart = from > 0 ? Math.trunc(from) : 0;
  const foldedFrom = startOf.findIndex((start) => start >= minStart);
  if (foldedFrom === -1) {
    return null;
  }

  return scan(folded, needle, foldedFrom, (index) => [
    startOf[index],
    endOf[index + needle.length - 1],
  ]);
}

/**
 * Decides whether a token must be matched at the start of a word.
 *
 * A token whose raw text carries a word-start marker always requires it; one
 * carrying a continuation marker never does; otherwise only tokens with the
 * "B" prefix do.
 *
 * @param {object} rawToken - Raw pipeline token; its `word` is inspected.
 * @param {string} prefix - BIO prefix decoded for this token.
 * @returns {boolean} True when the match must begin on a word boundary.
 */
function shouldRequireWordStart(rawToken, prefix) {
  const surface = String(rawToken.word ?? "");

  if (hasWordStartMarker(surface)) {
    return true;
  }

  const isContinuation = hasContinuationMarker(surface);
  return isContinuation ? false : prefix === "B";
}

/**
 * Resolves the `[start, end]` span of a token inside `text`.
 *
 * Offsets supplied by the token itself are used when both are integers and
 * describe a non-empty range within `text`. Otherwise the cleaned token text
 * is searched from `from`: first with an exact comparison, then ignoring case.
 *
 * @param {string} text - Chunk text the token came from.
 * @param {object} rawToken - Raw pipeline token (may carry `start` / `end`).
 * @param {string} tokenText - Cleaned token text used for searching.
 * @param {number} from - Offset where the textual search begins.
 * @param {object} [options]
 * @param {string} [options.prefix] - BIO prefix, forwarded to `shouldRequireWordStart`.
 * @returns {number[]|null} The span, or `null` when the token cannot be located.
 */
function findTokenPosition(text, rawToken, tokenText, from, options = {}) {
  const { start: givenStart, end: givenEnd } = rawToken;
  const hasIntegerOffsets = Number.isInteger(givenStart) && Number.isInteger(givenEnd);
  const offsetsFitText =
    hasIntegerOffsets && givenStart >= 0 && givenEnd > givenStart && givenEnd <= text.length;

  if (offsetsFitText) {
    return [givenStart, givenEnd];
  }

  // Without usable offsets the only fallback is a textual search.
  if (!tokenText) {
    return null;
  }

  const requireWordStart = shouldRequireWordStart(rawToken, options.prefix);

  // Exact match first; the case-insensitive pass only runs when that fails.
  return (
    findTokenTextPosition(text, tokenText, from, { requireWordStart }) ||
    findTokenTextPosition(text, tokenText, from, { requireWordStart, ignoreCase: true }) ||
    null
  );
}

/**
 * Groups the tokens returned by a token-classification pipeline into entities.
 *
 * Phase 1 locates every non-"O" token in `text` (via `findTokenPosition`,
 * searching forward from the end of the previously located token). Tokens that
 * cannot be located are reported with `console.warn` and skipped.
 *
 * Phase 2 merges consecutive tokens: a token extends the current entity only
 * when it is not a "B" token, has the same type, and starts no more than
 * `maxGap` characters after the entity's current end. The entity score is the
 * arithmetic mean of its tokens' scores (missing scores count as 0).
 *
 * @param {string} text - Chunk text the tokens refer to.
 * @param {Iterable<object>|null|undefined} rawTokens - Pipeline output; a
 *   nullish value is treated as "no tokens", and nullish entries are skipped.
 * @param {object} [options]
 * @param {number} [options.maxGap=8] - Largest gap, in characters, allowed
 *   between an entity's end and the start of a token that continues it.
 * @returns {object[]} Entities as `{ entity_group, start, end, word, score }`
 *   with offsets relative to `text`.
 */
export function mergeTokenEntities(text, rawTokens, options = {}) {
  const { maxGap = 8 } = options ?? {};

  const located = [];
  let searchFrom = 0;

  for (const rawToken of rawTokens ?? []) {
    if (rawToken === null || rawToken === undefined) {
      continue;
    }

    const { prefix, type } = splitBio(getTokenLabel(rawToken));
    if (!type || type === "O") {
      continue;
    }

    const tokenText = cleanTokenWord(rawToken.word);
    const position = findTokenPosition(text, rawToken, tokenText, searchFrom, { prefix });
    if (!position) {
      console.warn("Token introuvable dans le chunk:", rawToken);
      continue;
    }

    const [start, end] = position;
    located.push({ type, prefix, start, end, score: rawToken.score ?? 0 });
    // The next textual search resumes after this token to keep matches in order.
    searchFrom = end;
  }

  const merged = [];
  // Running total / count for the entity currently being extended.
  let scoreSum = 0;
  let scoreCount = 0;

  for (const token of located) {
    const last = merged[merged.length - 1];
    const continuesLast =
      last !== undefined &&
      token.prefix !== "B" &&
      token.type === last.entity_group &&
      token.start <= last.end + maxGap;

    if (!continuesLast) {
      merged.push({
        entity_group: token.type,
        start: token.start,
        end: token.end,
        word: text.slice(token.start, token.end),
        score: token.score,
      });
      scoreSum = token.score;
      scoreCount = 1;
      continue;
    }

    last.end = token.end;
    last.word = text.slice(last.start, last.end);
    scoreSum += token.score;
    scoreCount += 1;
    last.score = scoreSum / scoreCount;
  }

  return merged;
}