// citations.js — split paragraph text into plain-text and in-text citation segments.
//
// splitCitations(text, citeIndex, references)
//   text       — raw paragraph string
//   citeIndex  — { 'surname:year': [refId, ...] }
//   references — array of Reference objects from the parse result
//
// Returns segments:
//   { t: 'text', s: '...' }
//   { t: 'cite', s: '...', refs: [{ id, corpusMatch, bibcode, arxiv }] }
//
// Recognizes:
//   Narrative:     Surname et al. (YEAR) / Surname & Other (YEAR) / Surname (YEAR)
//   Parenthetical: (Surname et al. YEAR; Surname YEAR)
//
// Parenthetical blocks are expanded: each individual cite unit inside becomes
// its own 'cite' segment so that clicking Noeske vs Elbaz opens different papers.
//
// NOTE(review): every pattern requires the surname to start with ASCII A–Z, so
// names such as "Östlin" are not recognized — confirm whether that is intended.

// Narrative citation. Capture 1 = first-author surname, capture 2 = year (+ optional letter).
const _NARRATIVE_RE =
  /\b([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+\(((?:19|20)\d{2}[a-z]?)\)/g;

// Whole parenthetical block of one or more ';'-separated cite units.
// Capture 1 = everything between the parentheses.
const _PAREN_RE =
  /\(([A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?(?:\s*;\s*[A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?)*)\)/g;

// One cite unit inside a parenthetical block. Capture 1 = surname, capture 2 = year.
const _CITE_UNIT =
  /([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+((?:19|20)\d{2}[a-z]?)/g;

/**
 * Collect every match of a global regex against a string.
 *
 * The patterns above are module-level /g regexes, and `exec` advances their
 * `lastIndex`; resetting it here keeps each scan independent of earlier ones.
 * Matches are gathered eagerly so callers may scan with another regex while
 * iterating the result.
 *
 * @param {RegExp} re - regex with the `g` flag
 * @param {string} str - text to scan
 * @returns {RegExpExecArray[]} matches in order of appearance
 */
function _execAll(re, str) {
  re.lastIndex = 0;
  const found = [];
  let m;
  while ((m = re.exec(str)) !== null) {
    found.push(m);
  }
  return found;
}

/**
 * Resolve a (surname, year) pair to the reference summaries it cites.
 *
 * @param {string} surname - first-author surname as written in the text
 * @param {string} year - year string, including any letter suffix (e.g. '2007a')
 * @param {Object|null|undefined} citeIndex - { 'surname:year': [refId, ...] }
 * @param {Map<string, Object>} refsById - references keyed by String(id)
 * @returns {Array<{id: *, corpusMatch: *, bibcode: *, arxiv: *}>} one entry per
 *   indexed id that resolves to a reference; empty when nothing matches
 */
function _lookupRefs(surname, year, citeIndex, refsById) {
  const key = `${surname.toLowerCase()}:${year}`;
  // A missing index, or a non-array entry, simply means "no known references".
  const ids = citeIndex ? citeIndex[key] : undefined;
  if (!Array.isArray(ids)) {
    return [];
  }
  return ids
    .map((id) => refsById.get(String(id)))
    .filter(Boolean)
    .map(({ id, corpusMatch, bibcode, arxiv }) => ({ id, corpusMatch, bibcode, arxiv }));
}

/**
 * Expand one parenthetical match into '(' + alternating cite/text segments + ')'.
 * Text between units (the '; ' separators) is preserved verbatim.
 *
 * @param {{inner: string, units: Array<{s: string, refs: Array, iStart: number, iEnd: number}>}} match
 * @returns {Array<Object>} segments covering the whole parenthetical block
 */
function _expandParenthetical({ inner, units }) {
  const out = [{ t: 'text', s: '(' }];
  let cursor = 0;
  for (const unit of units) {
    if (unit.iStart > cursor) {
      out.push({ t: 'text', s: inner.slice(cursor, unit.iStart) });
    }
    out.push({ t: 'cite', s: unit.s, refs: unit.refs });
    cursor = unit.iEnd;
  }
  if (cursor < inner.length) {
    out.push({ t: 'text', s: inner.slice(cursor) });
  }
  out.push({ t: 'text', s: ')' });
  return out;
}

/**
 * Split paragraph text into plain-text and citation segments.
 *
 * @param {string} text - raw paragraph string; falsy input yields []
 * @param {Object} [citeIndex] - { 'surname:year': [refId, ...] }; when absent,
 *   citations are still detected but carry empty `refs`
 * @param {Array<Object>} [references] - Reference objects; `id`, `corpusMatch`,
 *   `bibcode` and `arxiv` are the fields read here
 * @returns {Array<{t: 'text', s: string} | {t: 'cite', s: string, refs: Array}>}
 *   segments whose `s` values concatenate back to `text`
 */
export function splitCitations(text, citeIndex, references) {
  if (!text) {
    return [];
  }

  // A Map (rather than a plain object) so ids such as 'constructor' cannot
  // resolve to inherited Object.prototype members. Keys are stringified to
  // keep number/string ids interchangeable, as plain-object keys were.
  const refsById = new Map(
    (references || []).filter(Boolean).map((ref) => [String(ref.id), ref]),
  );

  // Collect all matches (narrative and parenthetical).
  // Paren matches carry a `units` array so the segment builder can expand them.
  const matches = [];

  for (const m of _execAll(_NARRATIVE_RE, text)) {
    matches.push({
      start: m.index,
      end: m.index + m[0].length,
      s: m[0],
      refs: _lookupRefs(m[1], m[2], citeIndex, refsById),
    });
  }

  for (const m of _execAll(_PAREN_RE, text)) {
    // Skip if the opening parenthesis already lies inside an earlier match.
    const covered = matches.some((mx) => mx.start <= m.index && m.index < mx.end);
    if (covered) {
      continue;
    }

    const inner = m[1];
    const units = _execAll(_CITE_UNIT, inner).map((unit) => ({
      s: unit[0],
      refs: _lookupRefs(unit[1], unit[2], citeIndex, refsById),
      iStart: unit.index,
      iEnd: unit.index + unit[0].length,
    }));

    matches.push({
      start: m.index,
      end: m.index + m[0].length,
      s: m[0],
      refs: units.flatMap((u) => u.refs), // kept for compatibility
      inner,
      units,
    });
  }

  matches.sort((a, b) => a.start - b.start);

  // Build segments. Parenthetical matches are expanded into individual cite
  // spans separated by their original punctuation so each is independently
  // clickable; narrative matches become a single cite span.
  const segments = [];
  let lastIndex = 0;
  for (const mx of matches) {
    if (mx.start > lastIndex) {
      segments.push({ t: 'text', s: text.slice(lastIndex, mx.start) });
    }
    if (mx.units) {
      segments.push(..._expandParenthetical(mx));
    } else {
      segments.push({ t: 'cite', s: mx.s, refs: mx.refs });
    }
    lastIndex = mx.end;
  }
  if (lastIndex < text.length) {
    segments.push({ t: 'text', s: text.slice(lastIndex) });
  }
  return segments;
}