// citations.js — split paragraph text into plain-text and in-text citation segments.
//
// splitCitations(text, citeIndex, references)
//   text       — raw paragraph string
//   citeIndex  — { 'surname:year': [refId, ...] }
//   references — array of Reference objects from the parse result
//
// Returns segments:
//   { t: 'text', s: '...' }
//   { t: 'cite', s: '...', refs: [{ id, corpusMatch, bibcode, arxiv }] }
//
// Recognizes:
//   Narrative:      Surname et al. (YEAR) / Surname & Other (YEAR) / Surname (YEAR)
//   Parenthetical:  (Surname et al. YEAR; Surname YEAR)
//
// Parenthetical blocks are expanded: each individual cite unit inside becomes
// its own 'cite' segment so that clicking Noeske vs Elbaz opens different papers.

// Shared pattern fragments. The three citation regexes below are assembled from
// these so the surname / co-author / year rules are written exactly once.
//
// _SURNAME:       an ASCII capital followed by one or more letters (ASCII or
//                 U+00C0–U+00FF), hyphens or apostrophes.
// _COAUTHORS:     optional " et al" / " et al." or " & Surname" tail.
// _BARE_UNIT:     one "Surname [coauthors] YEAR" unit with no capture groups,
//                 as it appears inside a parenthetical block.
// _CAPTURED_YEAR: a 19xx/20xx year with optional lowercase suffix; the outer
//                 group captures the whole year, the inner group the century.
const _SURNAME = String.raw`[A-Z][a-zA-ZÀ-ÿ\-']+`;
const _COAUTHORS = String.raw`(?:\s+et\s+al\.?|\s+&\s+${_SURNAME})?`;
const _BARE_UNIT = String.raw`${_SURNAME}${_COAUTHORS}\s+(?:19|20)\d{2}[a-z]?`;
const _CAPTURED_YEAR = String.raw`((19|20)\d{2}[a-z]?)`;

// Narrative cite, e.g. "Noeske et al. (2007)". Groups: 1 = first surname,
// 2 = year (with suffix), 3 = century digits.
const _NARRATIVE_RE = new RegExp(
  String.raw`\b(${_SURNAME})${_COAUTHORS}\s+\(${_CAPTURED_YEAR}\)`,
  'g',
);

// Parenthetical block, e.g. "(Noeske et al. 2007; Elbaz 2011)". Group 1 is the
// text between the parentheses; units are separated by ";".
const _PAREN_RE = new RegExp(
  String.raw`\((${_BARE_UNIT}(?:\s*;\s*${_BARE_UNIT})*)\)`,
  'g',
);

// A single cite unit inside a parenthetical block. Groups: 1 = first surname,
// 2 = year (with suffix), 3 = century digits.
const _CITE_UNIT = new RegExp(
  String.raw`(${_SURNAME})${_COAUTHORS}\s+${_CAPTURED_YEAR}`,
  'g',
);

// Resolve one in-text citation (surname + year) to the references it points at.
//
//   surname   — first-author surname as matched in the text; lowercased for the key
//   year      — year string as matched in the text; used verbatim in the key, so a
//               suffixed year such as '2007a' only hits an index entry with that suffix
//   citeIndex — { 'surname:year': [refId, ...] }; a null/undefined index yields no refs
//   refsById  — { refId: reference }; a null/undefined map yields no refs
//
// Returns an array of { id, corpusMatch, bibcode, arxiv } objects, one per id that
// resolves to a reference. Ids with no entry in refsById are dropped.
function _lookupRefs(surname, year, citeIndex, refsById) {
  const key = `${surname.toLowerCase()}:${year}`;
  // Tolerate a missing index the same way the caller tolerates missing references.
  const ids = (citeIndex && citeIndex[key]) || [];
  const byId = refsById || {};
  // Only own entries count: a plain-object map would otherwise resolve ids such as
  // 'constructor' or 'toString' to Object.prototype members and emit bogus refs.
  const hasOwn = Object.prototype.hasOwnProperty;
  return ids
    .filter((id) => hasOwn.call(byId, id) && Boolean(byId[id]))
    .map((id) => {
      const ref = byId[id];
      return {
        id: ref.id,
        corpusMatch: ref.corpusMatch,
        bibcode: ref.bibcode,
        arxiv: ref.arxiv,
      };
    });
}

// Split `text` into an ordered list of plain-text and citation segments.
//
//   text       — raw paragraph string; a falsy value returns []
//   citeIndex  — { 'surname:year': [refId, ...] }, passed through to _lookupRefs
//   references — array of reference objects keyed by their `id`; may be null/undefined
//
// Narrative cites become a single 'cite' segment. A parenthetical block becomes
// '(' text, one 'cite' segment per unit with the separators between them as
// 'text', then ')' text.
export function splitCitations(text, citeIndex, references) {
  if (!text) {
    return [];
  }

  const refPairs = (references || []).map((ref) => [ref.id, ref]);
  const refsById = Object.fromEntries(refPairs);
  const resolve = (surname, year) => _lookupRefs(surname, year, citeIndex, refsById);

  // Every recognised citation span, narrative first, then parenthetical.
  // Parenthetical entries additionally carry `inner` and `units`.
  const found = [];

  // The regexes are module-level /g objects, so rewind them before scanning.
  _NARRATIVE_RE.lastIndex = 0;
  for (let hit = _NARRATIVE_RE.exec(text); hit !== null; hit = _NARRATIVE_RE.exec(text)) {
    const [whole, surname, year] = hit;
    found.push({
      start: hit.index,
      end: hit.index + whole.length,
      s: whole,
      refs: resolve(surname, year),
    });
  }

  _PAREN_RE.lastIndex = 0;
  for (let hit = _PAREN_RE.exec(text); hit !== null; hit = _PAREN_RE.exec(text)) {
    const at = hit.index;
    // Ignore a block whose opening position lies inside an already recorded span.
    const insideEarlier = found.some((prev) => prev.start <= at && at < prev.end);
    if (insideEarlier) {
      continue;
    }

    const [whole, inner] = hit;
    const units = [];
    _CITE_UNIT.lastIndex = 0;
    for (let part = _CITE_UNIT.exec(inner); part !== null; part = _CITE_UNIT.exec(inner)) {
      const [unitText, surname, year] = part;
      units.push({
        s: unitText,
        refs: resolve(surname, year),
        iStart: part.index,
        iEnd: part.index + unitText.length,
      });
    }

    found.push({
      start: at,
      end: at + whole.length,
      s: whole,
      refs: units.flatMap(({ refs }) => refs), // kept for compatibility
      inner,
      units,
    });
  }

  found.sort((a, b) => a.start - b.start);

  // Walk the spans in document order, emitting the text between them verbatim.
  const segments = [];
  const pushText = (s) => {
    segments.push({ t: 'text', s });
  };
  const pushCite = (s, refs) => {
    segments.push({ t: 'cite', s, refs });
  };

  let cursor = 0;
  for (const { start, end, s, refs, inner, units } of found) {
    if (cursor < start) {
      pushText(text.slice(cursor, start));
    }

    if (!units) {
      // Narrative cite: the whole match is one clickable span.
      pushCite(s, refs);
    } else {
      // Parenthetical block: parentheses and separators stay as text so that
      // each unit is its own cite segment.
      pushText('(');
      let innerCursor = 0;
      for (const unit of units) {
        if (innerCursor < unit.iStart) {
          pushText(inner.slice(innerCursor, unit.iStart));
        }
        pushCite(unit.s, unit.refs);
        innerCursor = unit.iEnd;
      }
      if (innerCursor < inner.length) {
        pushText(inner.slice(innerCursor));
      }
      pushText(')');
    }

    cursor = end;
  }

  if (cursor < text.length) {
    pushText(text.slice(cursor));
  }
  return segments;
}