beacon / frontend /src /lib /citations.js
kiyer's picture
feat: in-text citations — visual styling, per-unit splitting, ADS search fallback
6ac1f78
Raw
History Blame Contribute Delete
4.1 kB
// citations.js — split paragraph text into plain-text and in-text citation segments.
//
// splitCitations(text, citeIndex, references)
// text — raw paragraph string
// citeIndex — { 'surname:year': [refId, ...] }
// references — array of Reference objects from the parse result
//
// Returns segments:
// { t: 'text', s: '...' }
// { t: 'cite', s: '...', refs: [{ id, corpusMatch, bibcode, arxiv }] }
//
// Recognizes:
// Narrative: Surname et al. (YEAR) / Surname & Other (YEAR) / Surname (YEAR)
// Parenthetical: (Surname et al. YEAR; Surname YEAR)
//
// Parenthetical blocks are expanded: each individual cite unit inside becomes
// its own 'cite' segment so that clicking Noeske vs Elbaz opens different papers.
// All three patterns below carry the /g flag, so each one keeps a stateful
// `lastIndex` between exec() calls. Callers must reset `lastIndex` to 0 before
// starting a fresh scan.
//
// A "surname" is an ASCII capital followed by one or more ASCII letters,
// characters in the À-ÿ range, hyphens or apostrophes. A "year" is 19xx or
// 20xx with an optional single lowercase suffix letter (e.g. 2011a).

// Narrative citation: Surname [et al.| & Other] (YEAR)
//   group 1 — first surname
//   group 2 — year including any suffix letter
//   group 3 — century prefix ("19" or "20")
const _NARRATIVE_RE = /\b([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+\(((19|20)\d{2}[a-z]?)\)/g;
// Parenthetical citation block: (Surname [et al.| & Other] YEAR[; Surname ... YEAR]*)
//   group 1 — everything between the parentheses (one or more units separated by ';')
// The block must start with a surname immediately after '(' and end with a
// year immediately before ')'.
const _PAREN_RE = /\(([A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?(?:\s*;\s*[A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?)*)\)/g;
// Single citation unit (no surrounding parentheses): Surname [et al.| & Other] YEAR
//   group 1 — first surname
//   group 2 — year including any suffix letter
//   group 3 — century prefix ("19" or "20")
const _CITE_UNIT = /([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+((19|20)\d{2}[a-z]?)/g;
/**
 * Resolve an in-text citation (first-author surname + year) to reference records.
 *
 * The lookup key is `<lowercased surname>:<year>`; `year` is used verbatim, so
 * any suffix letter (e.g. "2011a") is part of the key.
 *
 * @param {string} surname - First-author surname as it appears in the text.
 * @param {string} year - Year string, including any suffix letter.
 * @param {Object<string, Array>|null|undefined} citeIndex - Map from
 *   'surname:year' to a list of reference ids. A missing index yields [].
 * @param {Object<string, Object>|null|undefined} refsById - Map from reference
 *   id to reference object. A missing map yields [].
 * @returns {Array<{id: *, corpusMatch: *, bibcode: *, arxiv: *}>} One trimmed
 *   record per id that resolves to a reference; unresolved ids are dropped.
 */
function _lookupRefs(surname, year, citeIndex, refsById) {
  const key = `${surname.toLowerCase()}:${year}`;
  // A missing index means "nothing resolvable" rather than an error.
  const ids = (citeIndex && citeIndex[key]) || [];
  const byId = refsById || {};
  // Own-property check: ids such as 'toString' or 'constructor' must not
  // resolve to members inherited from Object.prototype.
  const isKnown = (id) => Object.prototype.hasOwnProperty.call(byId, id);
  return ids
    .filter(isKnown)
    .map((id) => byId[id])
    .filter(Boolean)
    .map(({ id, corpusMatch, bibcode, arxiv }) => ({
      id,
      corpusMatch,
      bibcode,
      arxiv,
    }));
}
/**
 * Split a paragraph into plain-text segments and clickable citation segments.
 *
 * Narrative citations ("Surname et al. (YEAR)") become a single 'cite' segment.
 * Parenthetical blocks ("(A 2007; B 2011)") are expanded so that the
 * parentheses and separators are 'text' segments and every individual unit is
 * its own 'cite' segment.
 *
 * @param {string} text - Raw paragraph string; a falsy value yields [].
 * @param {Object<string, Array>} citeIndex - Map from 'surname:year' to reference ids.
 * @param {Array<Object>} [references] - Reference objects, indexed here by their `id`.
 * @returns {Array<Object>} Ordered segments: `{ t: 'text', s }` or
 *   `{ t: 'cite', s, refs }`, covering `text` from start to end.
 */
export function splitCitations(text, citeIndex, references) {
  if (!text) {
    return [];
  }

  const refsById = Object.fromEntries(
    (references || []).map((ref) => [ref.id, ref]),
  );

  // Every citation hit found in `text`; parenthetical hits additionally carry
  // `inner` and `units` so they can be expanded when segments are built.
  const found = [];

  // Pass 1 — narrative citations. The regex is module-level and /g, so rewind it.
  _NARRATIVE_RE.lastIndex = 0;
  for (
    let hit = _NARRATIVE_RE.exec(text);
    hit !== null;
    hit = _NARRATIVE_RE.exec(text)
  ) {
    const [whole, surname, year] = hit;
    found.push({
      start: hit.index,
      end: hit.index + whole.length,
      s: whole,
      refs: _lookupRefs(surname, year, citeIndex, refsById),
    });
  }

  // Pass 2 — parenthetical blocks, ignoring any that begin inside an earlier hit.
  _PAREN_RE.lastIndex = 0;
  for (
    let hit = _PAREN_RE.exec(text);
    hit !== null;
    hit = _PAREN_RE.exec(text)
  ) {
    const at = hit.index;
    const startsInsideEarlier = found.some(
      (prev) => at >= prev.start && at < prev.end,
    );
    if (!startsInsideEarlier) {
      const inner = hit[1];
      const units = [];
      // Shared /g regex: rewind before scanning this block's inner text.
      _CITE_UNIT.lastIndex = 0;
      for (
        let part = _CITE_UNIT.exec(inner);
        part !== null;
        part = _CITE_UNIT.exec(inner)
      ) {
        units.push({
          s: part[0],
          refs: _lookupRefs(part[1], part[2], citeIndex, refsById),
          iStart: part.index,
          iEnd: part.index + part[0].length,
        });
      }
      found.push({
        start: at,
        end: at + hit[0].length,
        s: hit[0],
        // Flattened refs of all units, retained for compatibility.
        refs: units.flatMap((u) => u.refs),
        inner,
        units,
      });
    }
  }

  found.sort((a, b) => a.start - b.start);

  // Walk the hits in document order, emitting the text between them.
  const segments = [];
  const pushText = (s) => {
    segments.push({ t: 'text', s });
  };
  const pushCite = (s, refs) => {
    segments.push({ t: 'cite', s, refs });
  };

  let cursor = 0;
  for (const { start, end, s, refs, inner, units } of found) {
    if (cursor < start) {
      pushText(text.slice(cursor, start));
    }

    if (!units) {
      // Narrative citation: one clickable span.
      pushCite(s, refs);
    } else {
      // Parenthetical block: keep brackets and separators as plain text and
      // make each unit independently clickable.
      pushText('(');
      let innerCursor = 0;
      for (const unit of units) {
        if (innerCursor < unit.iStart) {
          pushText(inner.slice(innerCursor, unit.iStart));
        }
        pushCite(unit.s, unit.refs);
        innerCursor = unit.iEnd;
      }
      if (innerCursor < inner.length) {
        pushText(inner.slice(innerCursor));
      }
      pushText(')');
    }

    cursor = end;
  }

  if (cursor < text.length) {
    pushText(text.slice(cursor));
  }
  return segments;
}