Spaces:

kiyer
/

beacon

Running

App Files Files Community

beacon / frontend /src /lib /citations.js

kiyer's picture

feat: in-text citations — visual styling, per-unit splitting, ADS search fallback

6ac1f78 18 days ago

History Blame Contribute Delete

4.1 kB

	// citations.js — split paragraph text into plain-text and in-text citation segments.
	//
	// splitCitations(text, citeIndex, references)
	// text — raw paragraph string
	// citeIndex — { 'surname:year': [refId, ...] }
	// references — array of Reference objects from the parse result
	//
	// Returns segments:
	// { t: 'text', s: '...' }
	// { t: 'cite', s: '...', refs: [{ id, corpusMatch, bibcode, arxiv }] }
	//
	// Recognizes:
	// Narrative: Surname et al. (YEAR) / Surname & Other (YEAR) / Surname (YEAR)
	// Parenthetical: (Surname et al. YEAR; Surname YEAR)
	//
	// Parenthetical blocks are expanded: each individual cite unit inside becomes
	// its own 'cite' segment so that clicking Noeske vs Elbaz opens different papers.

	// Citation patterns. All three carry the /g flag and are shared at module level,
	// so callers must reset `lastIndex` to 0 before each scan.
	//
	// Shared pieces:
	//   surname  [A-Z][a-zA-ZÀ-ÿ\-']+          capital letter, then letters / hyphen / apostrophe
	//   tail     " et al." or " & Other"        optional co-author marker
	//   year     19xx or 20xx, optional letter  e.g. 2007, 2011a

	// Narrative form: Surname [et al. | & Other] (YEAR)
	//   group 1 = first surname, group 2 = year including any suffix letter.
	const _NARRATIVE_RE = /\b([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+\(((19|20)\d{2}[a-z]?)\)/g;

	// Parenthetical form: (Surname [tail] YEAR; Surname [tail] YEAR; ...)
	//   group 1 = everything between the parentheses.
	//   Whitespace around the ';' separator is optional, so "2007; Elbaz" matches.
	const _PAREN_RE = /\(([A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?(?:\s*;\s*[A-Z][a-zA-ZÀ-ÿ\-']+(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+(?:19|20)\d{2}[a-z]?)*)\)/g;

	// One cite unit inside a parenthetical block: Surname [tail] YEAR
	//   group 1 = first surname, group 2 = year including any suffix letter.
	const _CITE_UNIT = /([A-Z][a-zA-ZÀ-ÿ\-']+)(?:\s+et\s+al\.?|\s+&\s+[A-Z][a-zA-ZÀ-ÿ\-']+)?\s+((19|20)\d{2}[a-z]?)/g;

	/**
	 * Resolve a cited surname and year to the references it points at.
	 *
	 * The lookup key is the lower-cased surname joined to the year with a colon,
	 * e.g. 'noeske:2007'. Ids that have no entry in `refsById` are dropped.
	 *
	 * @param {string} surname - First author's surname as it appears in the text.
	 * @param {string} year - Year string, including any suffix letter (e.g. '2011a').
	 * @param {Object<string, Array>} citeIndex - Map of 'surname:year' to reference ids;
	 *   a missing index or a missing key yields an empty result.
	 * @param {Object<string, Object>} refsById - Reference objects keyed by id.
	 * @returns {Array<{id: *, corpusMatch: *, bibcode: *, arxiv: *}>} One trimmed-down
	 *   record per resolved reference, in the order the ids are listed in the index.
	 */
	function _lookupRefs(surname, year, citeIndex, refsById) {
	  const key = `${surname.toLowerCase()}:${year}`;
	  // Tolerate a missing index the same way a missing key is tolerated.
	  const ids = (citeIndex && citeIndex[key]) || [];
	  return ids
	    .map((id) => refsById[id])
	    .filter(Boolean)
	    .map((r) => ({
	      id: r.id,
	      corpusMatch: r.corpusMatch,
	      bibcode: r.bibcode,
	      arxiv: r.arxiv,
	    }));
	}

	/**
	 * Split paragraph text into plain-text and in-text citation segments.
	 *
	 * Narrative citations ("Surname et al. (YEAR)") become a single 'cite' segment.
	 * Parenthetical blocks ("(A et al. YEAR; B YEAR)") are expanded: the parentheses
	 * and separators are emitted as 'text' segments and every cite unit inside gets
	 * its own 'cite' segment. Concatenating the `s` of all returned segments
	 * reproduces `text`.
	 *
	 * References are looked up by the first surname of each citation and its year.
	 *
	 * @param {string} text - Raw paragraph string; a falsy value yields [].
	 * @param {Object<string, Array>} [citeIndex] - Map of 'surname:year' to reference
	 *   ids; a missing index is treated as empty.
	 * @param {Array<Object>} [references] - Reference objects, each carrying an `id`;
	 *   a missing list is treated as empty.
	 * @returns {Array<Object>} Segments in text order, each either
	 *   `{ t: 'text', s }` or `{ t: 'cite', s, refs }`, where `refs` is the result of
	 *   `_lookupRefs` for that citation.
	 */
	export function splitCitations(text, citeIndex, references) {
	  if (!text) return [];
	  const index = citeIndex || {};
	  const refsById = Object.fromEntries((references || []).map((r) => [r.id, r]));

	  // Collect all matches (narrative and parenthetical); sorted by position below.
	  // Paren matches carry a `units` array so the segment builder can expand them.
	  const matches = [];

	  // The patterns are module-level /g regexes, so lastIndex must be reset before
	  // every scan or a previous call's position would leak into this one.
	  _NARRATIVE_RE.lastIndex = 0;
	  for (let m = _NARRATIVE_RE.exec(text); m !== null; m = _NARRATIVE_RE.exec(text)) {
	    const [whole, surname, year] = m;
	    matches.push({
	      start: m.index,
	      end: m.index + whole.length,
	      s: whole,
	      refs: _lookupRefs(surname, year, index, refsById),
	    });
	  }

	  _PAREN_RE.lastIndex = 0;
	  for (let m = _PAREN_RE.exec(text); m !== null; m = _PAREN_RE.exec(text)) {
	    // Skip if the opening parenthesis already lies inside a recorded match.
	    const parenStart = m.index;
	    const covered = matches.some((mx) => mx.start <= parenStart && parenStart < mx.end);
	    if (covered) continue;

	    const inner = m[1];
	    const units = [];
	    _CITE_UNIT.lastIndex = 0;
	    for (let unit = _CITE_UNIT.exec(inner); unit !== null; unit = _CITE_UNIT.exec(inner)) {
	      units.push({
	        s: unit[0],
	        refs: _lookupRefs(unit[1], unit[2], index, refsById),
	        // Offsets are relative to `inner`, not to `text`.
	        iStart: unit.index,
	        iEnd: unit.index + unit[0].length,
	      });
	    }
	    matches.push({ start: parenStart, end: parenStart + m[0].length, inner, units });
	  }

	  matches.sort((a, b) => a.start - b.start);

	  // Build segments. Parenthetical matches are expanded into individual cite spans
	  // separated by their original punctuation (; etc.) so each is independently clickable.
	  const segments = [];
	  let lastIndex = 0;
	  for (const mx of matches) {
	    if (mx.start > lastIndex) {
	      segments.push({ t: 'text', s: text.slice(lastIndex, mx.start) });
	    }
	    if (mx.units) {
	      segments.push({ t: 'text', s: '(' });
	      let innerLast = 0;
	      for (const u of mx.units) {
	        if (u.iStart > innerLast) {
	          segments.push({ t: 'text', s: mx.inner.slice(innerLast, u.iStart) });
	        }
	        segments.push({ t: 'cite', s: u.s, refs: u.refs });
	        innerLast = u.iEnd;
	      }
	      if (innerLast < mx.inner.length) {
	        segments.push({ t: 'text', s: mx.inner.slice(innerLast) });
	      }
	      segments.push({ t: 'text', s: ')' });
	    } else {
	      segments.push({ t: 'cite', s: mx.s, refs: mx.refs });
	    }
	    lastIndex = mx.end;
	  }
	  if (lastIndex < text.length) {
	    segments.push({ t: 'text', s: text.slice(lastIndex) });
	  }
	  return segments;
	}