dsleo's picture
better fetch paper's metadata
fe3898a verified
// === Configuration ========================================================
// Base URL that per-paper JSON file names are appended to (see buildFileUrl).
const HF_DATASET_BASE_URL = "https://huggingface.co/datasets/turnstilelabs/mathxiv/resolve/main/data";
// Highest version suffix (v1..vN) generated by buildCandidateIds when the
// user-supplied ID carries no explicit version.
const MAX_VERSION_TO_TRY = 9;
// Prefix for the localStorage keys used by getCachedMetadata/setCachedMetadata.
const CACHE_KEY_PREFIX = "arxigraph_meta_v3_"; // Incremented to force fresh cache logic
// --- DOM references (any of these may be null if the element is missing) ---
const inputEl = document.getElementById("arxiv-input");
const loadBtn = document.getElementById("load-btn");
const samplePapersEl = document.getElementById("sample-papers");
const heroCarouselEl = document.querySelector(".hero-carousel");
const homeTitleEl = document.getElementById("home-title");
const carouselPrevBtn = document.getElementById("carousel-prev");
const carouselNextBtn = document.getElementById("carousel-next");
const paperTitleEl = document.getElementById("paper-title");
const paperAuthorsEl = document.getElementById("paper-authors");
const paperAbstractEl = document.getElementById("paper-abstract");
const paperStatusTextEl = document.getElementById("paper-status-text");
const paperErrorTextEl = document.getElementById("paper-error-text");
const paperLinkEl = document.getElementById("paper-link");
const definitionBankEl = document.getElementById("definition-bank");
const paperAreaEl = document.getElementById("paper-area");
const paperHeaderEl = document.getElementById("paper-header");
const graphSectionEl = document.getElementById("graph-section");
const emptyStateEl = document.getElementById("empty-state");
const infoPanelEl = document.getElementById("info-panel");
const infoTitleEl = document.getElementById("info-title");
const infoBodyEl = document.getElementById("info-body");
const closeInfoPanelBtn = document.getElementById("close-info-panel");
// D3 selections for the graph <svg> and the hover tooltip.
const svg = d3.select("#graph");
const tooltip = d3.select("#tooltip");
// The running D3 force simulation, or null when no graph is rendered.
let currentSimulation = null;
// Sample papers carousel state
// samplePapersData: entries loaded by initSamplePapers.
// sampleCarouselIndex: index of the first card currently shown.
let samplePapersData = [];
let sampleCarouselIndex = 0;
// Queue State
// metadataQueue holds pending { arxivId, resolve } tasks; isProcessingQueue
// is true while processQueue is draining it.
const metadataQueue = [];
let isProcessingQueue = false;
// Lightweight graph UI state
// hiddenTypes: lower-cased node types toggled off in the legend.
// selectedNodeElement: DOM element of the node whose info panel is open.
const graphState = {
hiddenTypes: new Set(),
selectedNodeElement: null,
};
// D3 handles for the rendered graph, read by applyLegendVisibility.
const graphRefs = {
nodeGroups: null,
links: null,
simulation: null,
};
// Closing the info panel hides it and drops the current node selection.
if (closeInfoPanelBtn && infoPanelEl) {
  const dismissInfoPanel = () => {
    infoPanelEl.classList.remove("visible");
    const selected = graphState.selectedNodeElement;
    if (!selected) return;
    selected.classList.remove("selected");
    graphState.selectedNodeElement = null;
  };
  closeInfoPanelBtn.addEventListener("click", dismissInfoPanel);
}
// === Helpers ==============================================================
/**
 * Tidies a LaTeX snippet for display.
 *
 * Doubled backslashes become single ones, every whitespace run collapses to
 * one space, `\label{...}` commands are removed, and the result is trimmed.
 *
 * @param {*} content - Raw snippet; any falsy value yields "".
 * @returns {string} The cleaned snippet.
 */
function cleanLatex(content) {
  if (!content) {
    return "";
  }
  const singleBackslashes = String(content).split("\\\\").join("\\");
  const collapsed = singleBackslashes.replace(/\s+/g, ' ');
  const withoutLabels = collapsed.replace(/\\label\{[^}]*\}/g, "");
  return withoutLabels.trim();
}
/**
 * Asks MathJax (when present on `window`) to typeset the given elements.
 *
 * Falsy entries are dropped. The call is a no-op outside a browser, when
 * nothing is left to typeset, or when `MathJax.typesetPromise` is missing.
 * Failures are logged as warnings and never propagate.
 *
 * @param {Array<*>} elements - Elements to typeset; may be null/undefined.
 * @returns {Promise<void>}
 */
async function typesetMath(elements) {
  try {
    if (typeof window === "undefined") return;

    const targets = (elements || []).filter(Boolean);
    if (targets.length === 0) return;

    const mathJax = window.MathJax;
    const canTypeset = mathJax && typeof mathJax.typesetPromise === "function";
    if (!canTypeset) return;

    // Wait for MathJax's own startup before the first typeset request.
    const startupPromise = mathJax.startup && mathJax.startup.promise;
    if (startupPromise) {
      await startupPromise;
    }
    await mathJax.typesetPromise(targets);
  } catch (error) {
    console.warn("MathJax typesetting failed", error);
  }
}
/**
 * Shows a message in either the status line or the error line and blanks
 * the other one. Does nothing if either element is missing.
 *
 * @param {string} msg - Text to display.
 * @param {string} [kind="info"] - "error" targets the error line; anything
 *   else targets the status line.
 */
function setStatus(msg, kind = "info") {
  if (!paperStatusTextEl || !paperErrorTextEl) return;

  const isError = kind === "error";
  const activeEl = isError ? paperErrorTextEl : paperStatusTextEl;
  const clearedEl = isError ? paperStatusTextEl : paperErrorTextEl;

  clearedEl.textContent = "";
  activeEl.textContent = msg;
}
/**
 * Normalizes user input into a bare arXiv identifier.
 *
 * Trims the input, then strips a leading "arxiv:" and afterwards a leading
 * "arxiv_" (both case-insensitive).
 *
 * @param {*} raw - Raw input value.
 * @returns {string} The identifier, or "" when the input is empty.
 */
function normalizeInputId(raw) {
  const trimmed = String(raw || "").trim();
  if (trimmed === "") return "";

  const prefixes = [/^arxiv:/i, /^arxiv_/i];
  return prefixes.reduce((id, prefix) => id.replace(prefix, ""), trimmed);
}
/**
 * Tells whether an identifier ends in a version suffix such as "v2".
 *
 * @param {string} id - arXiv identifier.
 * @returns {boolean} True when the ID ends with "v" followed by digits.
 */
function hasExplicitVersion(id) {
  const versionSuffix = /v\d+$/i;
  return versionSuffix.test(id);
}
/**
 * Removes a trailing version suffix ("v" + digits, case-insensitive).
 *
 * @param {*} id - arXiv identifier; falsy values are treated as "".
 * @returns {string} The identifier without its version suffix.
 */
function stripArxivVersion(id) {
  const text = String(id || "");
  const suffix = /v\d+$/i.exec(text);
  return suffix ? text.slice(0, suffix.index) : text;
}
/**
 * Lists the versioned identifiers to probe for a paper.
 *
 * @param {string} baseId - Normalized arXiv identifier.
 * @returns {string[]} [] for an empty ID, [baseId] when it already carries a
 *   version, otherwise `${baseId}v1` through `${baseId}v${MAX_VERSION_TO_TRY}`.
 */
function buildCandidateIds(baseId) {
  if (!baseId) return [];
  if (hasExplicitVersion(baseId)) return [baseId];

  return Array.from(
    { length: MAX_VERSION_TO_TRY },
    (_, index) => `${baseId}v${index + 1}`,
  );
}
/**
 * Builds the dataset URL of a paper's JSON file.
 *
 * The first "/" in the identifier is replaced by "_" and the result is
 * wrapped as `arxiv_<id>.json` under HF_DATASET_BASE_URL.
 *
 * @param {string} arxivIdWithVersion - Versioned arXiv identifier.
 * @returns {string} Absolute URL of the JSON file.
 */
function buildFileUrl(arxivIdWithVersion) {
  const safeId = arxivIdWithVersion.replace("/", "_");
  return [HF_DATASET_BASE_URL, `arxiv_${safeId}.json`].join("/");
}
/**
 * Probes candidate identifiers in order and returns the first one whose
 * dataset file can be fetched and parsed as JSON.
 *
 * A non-OK response (e.g. a missing version) is skipped quietly. Network
 * failures and invalid JSON are logged as warnings and then skipped, so one
 * bad candidate never aborts the search.
 *
 * @param {string[]} candidates - Versioned arXiv identifiers to try, in order.
 * @returns {Promise<{arxivId: string, data: *, url: string} | null>} The first
 *   hit, or null when no candidate could be loaded.
 */
async function fetchFirstExisting(candidates) {
  for (const id of candidates || []) {
    const url = buildFileUrl(id);
    try {
      const res = await fetch(url, { cache: "no-store" });
      if (!res.ok) continue;
      const data = await res.json();
      return { arxivId: id, data, url };
    } catch (err) {
      // Best-effort lookup: keep going, but leave a trace for debugging.
      console.warn(`Dataset lookup failed for ${id}; trying next candidate`, err);
    }
  }
  return null;
}
/**
 * Shortens text to at most `max` characters and appends "...".
 *
 * The cut moves back to the last space inside the kept part, but only when
 * that space lies beyond index 40; otherwise the text is cut at `max`.
 *
 * @param {string} text - Text to shorten; falsy values yield "".
 * @param {number} [max=160] - Maximum number of characters kept.
 * @returns {string} The original text if short enough, else the shortened form.
 */
function truncate(text, max = 160) {
  if (!text) return "";
  if (text.length <= max) return text;

  const head = text.slice(0, max);
  const cut = head.lastIndexOf(" ");
  const kept = cut > 40 ? head.slice(0, cut) : head;
  return `${kept}...`;
}
/**
 * Removes the rendered graph and resets all state tied to it.
 *
 * Besides emptying the SVG and stopping the simulation, this drops the D3
 * handles in `graphRefs`, the legend's hidden-type set and the selected node.
 * Without that reset, hidden types from a previous paper would linger while
 * the next graph's legend is rebuilt fully active, and a late legend click
 * could restart a simulation whose nodes no longer exist.
 */
function clearGraph() {
  svg.selectAll("*").remove();
  if (currentSimulation) {
    currentSimulation.stop();
    currentSimulation = null;
  }

  // The selections and simulation belong to the graph that was just removed.
  graphRefs.nodeGroups = null;
  graphRefs.links = null;
  graphRefs.simulation = null;

  graphState.hiddenTypes.clear();
  graphState.selectedNodeElement = null;
}
/**
 * Returns the page to its landing state: carousel and empty state visible,
 * input and status cleared, graph and definition bank removed, paper
 * sections and info panel hidden, and the window scrolled to the top.
 */
function resetToLandingView() {
  const reveal = (el) => {
    if (el) el.classList.remove("hidden");
  };
  const conceal = (el) => {
    if (el) el.classList.add("hidden");
  };

  reveal(heroCarouselEl);
  if (inputEl) {
    inputEl.value = "";
  }
  setStatus("", "info");
  clearGraph();
  renderDefinitionBank(null);

  conceal(paperAreaEl);
  conceal(paperHeaderEl);
  conceal(graphSectionEl);
  reveal(emptyStateEl);
  if (infoPanelEl) {
    infoPanelEl.classList.remove("visible");
  }

  window.scrollTo({ top: 0, behavior: "smooth" });
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
// === Caching & Queuing System =============================================
/**
 * Reads cached metadata for a paper from localStorage.
 *
 * @param {string} arxivId - Identifier appended to CACHE_KEY_PREFIX.
 * @returns {*} The parsed entry, or null when nothing is stored, storage is
 *   unavailable, or the stored value is not valid JSON.
 */
function getCachedMetadata(arxivId) {
  let parsed = null;
  try {
    const stored = localStorage.getItem(CACHE_KEY_PREFIX + arxivId);
    if (stored) {
      parsed = JSON.parse(stored);
    }
  } catch (e) {
    // Unreadable cache entries count as a miss.
    parsed = null;
  }
  return parsed;
}
/**
 * Stores metadata for a paper in localStorage, but only when it is useful.
 *
 * An entry is cached when it has a title or at least one author. An empty
 * `authors` array is truthy in JavaScript, so it is checked by length;
 * otherwise a record with no title and no authors would be cached and later
 * served as a hit that contains nothing.
 *
 * @param {string} arxivId - Identifier appended to CACHE_KEY_PREFIX.
 * @param {{title?: string, abstract?: string, authors?: string[]}} data -
 *   Metadata to cache; nullish values are ignored.
 */
function setCachedMetadata(arxivId, data) {
  if (!data) return;

  const hasTitle = Boolean(data.title);
  const hasAuthors = Array.isArray(data.authors)
    ? data.authors.length > 0
    : Boolean(data.authors);
  if (!hasTitle && !hasAuthors) return;

  try {
    localStorage.setItem(CACHE_KEY_PREFIX + arxivId, JSON.stringify(data));
  } catch (e) {
    // Caching is best-effort: storage may be full, disabled or unavailable.
  }
}
/**
 * Requests metadata for a paper through the cache and the fetch queue.
 *
 * A cache hit resolves immediately. Otherwise a task is queued (at the front
 * when `urgent`, else at the back) and queue processing is started unless it
 * is already running.
 *
 * @param {string} arxivId - arXiv identifier.
 * @param {boolean} [urgent=false] - Put the request ahead of queued ones.
 * @returns {Promise<*>} Resolves with the metadata, or with whatever the
 *   queue worker passes to the task's `resolve`.
 */
function requestMetadata(arxivId, urgent = false) {
  return new Promise((resolve) => {
    // Serve from cache when possible.
    const cached = getCachedMetadata(arxivId);
    if (cached) {
      resolve(cached);
      return;
    }

    // Otherwise enqueue; urgent requests jump the line.
    const task = { arxivId, resolve };
    const enqueue = urgent ? "unshift" : "push";
    metadataQueue[enqueue](task);

    // Start the worker unless one is already draining the queue.
    if (!isProcessingQueue) {
      processQueue();
    }
  });
}
/**
 * Drains the metadata queue one task at a time.
 *
 * Each task is fetched via fetchMetadataWithStrategies, a truthy result is
 * cached, and the task is resolved with the result (null when the fetch
 * threw). While tasks remain, the worker waits 1200 ms between requests.
 * Returns immediately when another run is already in progress.
 *
 * @returns {Promise<void>}
 */
async function processQueue() {
  if (isProcessingQueue) return;
  isProcessingQueue = true;

  while (metadataQueue.length > 0) {
    const { arxivId: paperId, resolve: settle } = metadataQueue.shift();

    let metadata = null;
    try {
      metadata = await fetchMetadataWithStrategies(paperId);
    } catch (error) {
      console.warn(`Failed to load meta for ${paperId}`, error);
    }

    if (metadata) {
      setCachedMetadata(paperId, metadata);
    }
    settle(metadata);

    // Throttle: pause only when more work is waiting.
    const hasMore = metadataQueue.length > 0;
    if (hasMore) {
      await delay(1200);
    }
  }

  isProcessingQueue = false;
}
// === Metadata Strategies ================================================
/**
 * Fetches title, abstract and authors for a paper.
 *
 * Strategy 1 queries Semantic Scholar with the version-less ID under a
 * 6-second abort timeout. Any failure there (timeout, network error, HTTP
 * 429, other non-OK status, unreadable body) falls through to strategy 2,
 * the arXiv proxy lookup.
 *
 * The abort timer is cleared in a `finally`, so it is released on every
 * path, including a rejected fetch, and it also bounds the body read.
 *
 * @param {string} arxivId - arXiv identifier, with or without version.
 * @returns {Promise<{title: *, abstract: *, authors: string[]} | null>}
 *   Metadata from Semantic Scholar, or whatever fetchArxivProxy returns.
 */
async function fetchMetadataWithStrategies(arxivId) {
  // Strategy 1: Semantic Scholar
  try {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 6000);
    try {
      const baseId = stripArxivVersion(arxivId);
      const url = `https://api.semanticscholar.org/graph/v1/paper/ARXIV:${baseId}?fields=title,abstract,authors`;
      const res = await fetch(url, { signal: controller.signal });
      if (res.status === 429) throw new Error("Rate Limited");
      if (!res.ok) throw new Error("S2 Failed");
      const data = await res.json();
      return {
        title: data.title,
        // Handle rare case where S2 returns title but null abstract
        abstract: data.abstract || null,
        authors: (data.authors || []).map((a) => a.name),
      };
    } finally {
      clearTimeout(timeout);
    }
  } catch (err) {
    // Strategy 2: ArXiv Proxy
    return await fetchArxivProxy(arxivId);
  }
}
/**
 * Looks a paper up in the arXiv export API through corsproxy.io and parses
 * the first Atom `<entry>` of the response.
 *
 * The 8-second abort timer is cleared in a `finally`, so it never outlives
 * the request. Author names come from each `<author>`'s `<name>` child;
 * reading the whole `<author>` text would also pick up sibling elements such
 * as an affiliation. If no `<name>` exists, the author's full text is used.
 *
 * @param {string} arxivId - arXiv identifier used in the `id:` search query.
 * @returns {Promise<{title: *, abstract: *, authors: string[]} | null>}
 *   Parsed metadata, or null on a non-OK response, a missing entry, or any
 *   error (which is logged as a warning).
 */
async function fetchArxivProxy(arxivId) {
  try {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 8000);
    let text;
    try {
      const target = `https://export.arxiv.org/api/query?search_query=id:${arxivId}&max_results=1`;
      const proxyUrl = `https://corsproxy.io/?${encodeURIComponent(target)}`;
      const res = await fetch(proxyUrl, { signal: controller.signal });
      if (!res.ok) return null;
      text = await res.text();
    } finally {
      clearTimeout(timeout);
    }

    const parser = new DOMParser();
    const doc = parser.parseFromString(text, "application/xml");
    const entry = doc.getElementsByTagName("entry")[0];
    if (!entry) return null;

    // Trimmed text of the first <tag> descendant, or undefined when absent.
    const firstText = (parent, tag) =>
      parent.getElementsByTagName(tag)[0]?.textContent?.trim();

    return {
      title: firstText(entry, "title"),
      abstract: firstText(entry, "summary"),
      authors: Array.from(entry.getElementsByTagName("author")).map(
        (author) => firstText(author, "name") || author.textContent.trim(),
      ),
    };
  } catch (e) {
    console.warn(`arXiv metadata lookup failed for ${arxivId}`, e);
    return null;
  }
}
// === Sample Papers Carousel =============================================
/**
 * Formats an author list for compact display.
 *
 * @param {string[]} authors - Author names.
 * @param {number} [maxVisible=2] - Names shown before "et al." is appended.
 * @returns {string} "" for a non-array or empty list; all names joined by
 *   ", " when they fit; otherwise the first `maxVisible` names + " et al.".
 */
function formatAuthorList(authors, maxVisible = 2) {
  const isUsable = Array.isArray(authors) && authors.length > 0;
  if (!isUsable) return "";

  return authors.length <= maxVisible
    ? authors.join(", ")
    : `${authors.slice(0, maxVisible).join(", ")} et al.`;
}
/**
 * Fills a sample-paper card's author line once metadata arrives.
 *
 * The authors element is looked up after the metadata request resolves
 * rather than on entry: an async function runs synchronously up to its first
 * `await`, so an early lookup returns null whenever the caller appends the
 * element just after calling this function. The existing placeholder is kept
 * when no author names are available.
 *
 * @param {Element} card - Card containing a `.sample-paper-authors` element.
 * @param {string} arxivId - Paper to request metadata for (non-urgent).
 * @returns {Promise<void>}
 */
async function updateCardMetadata(card, arxivId) {
  const meta = await requestMetadata(arxivId, false);
  if (!meta) return;

  const label = formatAuthorList(meta.authors);
  if (!label) return;

  const authorsEl = card.querySelector(".sample-paper-authors");
  if (authorsEl) {
    authorsEl.textContent = label;
  }
}
/**
 * Renders up to three sample-paper cards starting at `sampleCarouselIndex`,
 * wrapping around the end of `samplePapersData`.
 *
 * Each card shows a truncated title (or the arXiv ID) and an author line.
 * When an entry has no authors, the arXiv ID is shown as a placeholder and a
 * metadata lookup is started. The author element is appended to the card
 * before that lookup begins, because updateCardMetadata finds it by querying
 * the card. Clicking a card loads that paper.
 */
function renderSamplePapers() {
  if (!samplePapersEl || !Array.isArray(samplePapersData) || !samplePapersData.length) return;
  samplePapersEl.innerHTML = "";

  const total = samplePapersData.length;
  const visibleCount = Math.min(3, total);

  for (let i = 0; i < visibleCount; i++) {
    const paper = samplePapersData[(sampleCarouselIndex + i) % total];
    if (!paper || !paper.arxiv_id) continue;

    const card = document.createElement("button");
    card.type = "button";
    card.className = "sample-paper-card";

    const titleEl = document.createElement("p");
    titleEl.className = "sample-paper-title";
    titleEl.textContent = truncate(paper.title || paper.arxiv_id, 80);
    card.appendChild(titleEl);

    const authorsEl = document.createElement("p");
    authorsEl.className = "sample-paper-authors";
    const needsLookup = !paper.authors;
    if (needsLookup) {
      authorsEl.textContent = paper.arxiv_id;
    } else {
      authorsEl.textContent = Array.isArray(paper.authors)
        ? formatAuthorList(paper.authors)
        : paper.authors;
    }
    card.appendChild(authorsEl);

    if (needsLookup) {
      // Fire-and-forget: the card updates itself when metadata arrives.
      void updateCardMetadata(card, paper.arxiv_id);
    }

    card.addEventListener("click", () => {
      inputEl.value = paper.arxiv_id;
      handleLoadClick();
    });
    samplePapersEl.appendChild(card);
  }
}
/**
 * Loads the sample-paper list from ./data/data.json, renders the carousel
 * and wires the previous/next buttons (only when both exist) to rotate it.
 * Does nothing without the carousel container or on a non-OK response;
 * failures are logged as warnings.
 *
 * @returns {Promise<void>}
 */
async function initSamplePapers() {
  if (!samplePapersEl) return;

  try {
    const response = await fetch("./data/data.json", { cache: "no-store" });
    if (!response.ok) return;

    const loaded = await response.json();
    samplePapersData = Array.isArray(loaded) ? loaded : [];
    sampleCarouselIndex = 0;
    renderSamplePapers();

    if (!carouselPrevBtn || !carouselNextBtn) return;

    const showPrevious = () => {
      const count = samplePapersData.length;
      sampleCarouselIndex = (sampleCarouselIndex - 1 + count) % count;
      renderSamplePapers();
    };
    const showNext = () => {
      const count = samplePapersData.length;
      sampleCarouselIndex = (sampleCarouselIndex + 1) % count;
      renderSamplePapers();
    };
    carouselPrevBtn.onclick = showPrevious;
    carouselNextBtn.onclick = showNext;
  } catch (err) {
    console.warn("Failed to load sample papers", err);
  }
}
// === Metadata Display on Main View ======================================
/**
 * Puts the paper header into its loading state: a "Loading <id>…" title,
 * empty authors and abstract, and a link to the paper's arXiv abstract page
 * (or to arxiv.org when no ID is given).
 *
 * @param {string} arxivId - Identifier shown in the title and used in the link.
 */
function renderMetadataSkeleton(arxivId) {
  paperTitleEl.textContent = `Loading ${arxivId}…`;
  for (const el of [paperAuthorsEl, paperAbstractEl]) {
    el.textContent = "";
  }

  let link = "https://arxiv.org/";
  if (arxivId) {
    link = `https://arxiv.org/abs/${arxivId}`;
  }
  paperLinkEl.href = link;
}
/**
 * Fills the paper header from the dataset record.
 *
 * Title, abstract and authors are each taken from the first populated field
 * among several spellings (under `metadata`/`meta` or at the top level). The
 * title falls back to the arXiv ID. Authors and abstract are only written
 * when found; a written abstract is then typeset with MathJax.
 *
 * @param {string} arxivId - Identifier used as the fallback title.
 * @param {Object} rawData - Parsed dataset JSON for the paper.
 */
function applyMetadataFromDataset(arxivId, rawData) {
  const meta = rawData.metadata || rawData.meta || {};

  const title =
    meta.title || rawData.title || rawData.paper_title || rawData.paperTitle || null;
  const abstract =
    meta.abstract || rawData.abstract || rawData.summary || rawData.paper_abstract || "";

  const pickAuthors = () => {
    if (Array.isArray(meta.authors)) return meta.authors;
    if (Array.isArray(rawData.authors)) return rawData.authors;
    if (typeof meta.authors === "string") {
      return meta.authors.split(",").map((name) => name.trim());
    }
    return [];
  };
  const authors = pickAuthors();

  paperTitleEl.textContent = title ? title : arxivId;
  if (authors.length > 0) {
    paperAuthorsEl.textContent = authors.join(", ");
  }
  if (abstract) {
    paperAbstractEl.textContent = abstract;
    void typesetMath([paperAbstractEl]);
  }
}
// Identifier passed to the most recent enhanceMetadata call. Responses for
// any other identifier are stale and must not touch the header.
let latestEnhanceRequestId = null;

/**
 * Replaces the header's title, authors and abstract with API metadata when
 * it arrives, preferring it over the static dataset values.
 *
 * Requests are asynchronous and may finish out of order, so a response is
 * applied only if its paper is still the most recently requested one.
 * Otherwise a slow lookup for a previous paper would overwrite the header of
 * the paper now on screen.
 *
 * @param {string} arxivId - Paper whose metadata should be upgraded.
 * @returns {Promise<void>}
 */
async function enhanceMetadata(arxivId) {
  latestEnhanceRequestId = arxivId;

  // Urgent request to queue
  const meta = await requestMetadata(arxivId, true);
  if (!meta) return;

  // The user moved on to another paper while this request was in flight.
  if (latestEnhanceRequestId !== arxivId) return;

  // Always prefer the API data over the static dataset data for the view.
  if (meta.title && meta.title.length > 0) {
    paperTitleEl.textContent = meta.title;
  }
  if (meta.authors && meta.authors.length > 0) {
    paperAuthorsEl.textContent = meta.authors.join(", ");
  }
  // Overwrite the abstract whenever the API returned one.
  if (meta.abstract && meta.abstract.length > 0) {
    paperAbstractEl.textContent = meta.abstract;
    void typesetMath([paperAbstractEl]);
  }
}
// === Graph rendering with D3 ============================================
/**
 * Resolves the fill color for a node type from the CSS custom properties on
 * the document root. Matching is case-insensitive; unknown types use
 * `--graph-node-generic`.
 *
 * @param {*} type - Node type name.
 * @returns {string} The trimmed value of the matching CSS custom property.
 */
function typeColor(type) {
  const key = String(type || "").toLowerCase();

  const families = [
    [["theorem", "proposition", "lemma"], "--graph-node-theorem"],
    [["fact", "example"], "--graph-node-example"],
    [["remark", "observation"], "--graph-node-remark"],
    [["corollary"], "--graph-node-corollary"],
    [["definition"], "--graph-node-definition"],
    [["external_reference", "external", "citation"], "--graph-node-external"],
  ];
  const family = families.find(([names]) => names.includes(key));
  const cssVariable = family ? family[1] : "--graph-node-generic";

  return getComputedStyle(document.documentElement).getPropertyValue(cssVariable).trim();
}
/**
 * Applies the legend's hidden-type set to the rendered graph: nodes of a
 * hidden type, and links touching one, get `display: none`; everything else
 * has the override removed. The simulation, if any, is then restarted at
 * alpha 0.3. No-op when no graph selections are stored.
 */
function applyLegendVisibility() {
  const { nodeGroups, links } = graphRefs;
  if (!nodeGroups || !links) return;

  const isHidden = (item) =>
    graphState.hiddenTypes.has(String(item.type || "").toLowerCase());

  nodeGroups.style("display", (d) => (isHidden(d) ? "none" : null));
  links.style("display", (d) => {
    const sourceHidden = isHidden(d.source);
    const targetHidden = isHidden(d.target);
    return sourceHidden || targetHidden ? "none" : null;
  });

  if (graphRefs.simulation) {
    graphRefs.simulation.alpha(0.3).restart();
  }
}
/**
 * Rebuilds the graph legend: writes the artifact/link counts, creates one
 * clickable entry per distinct (lower-cased) node type in sorted order, and
 * wires the collapse toggle.
 *
 * Clicking an entry toggles its type in `graphState.hiddenTypes`, mirrors
 * that with the "inactive" class, and re-applies visibility to the graph.
 *
 * @param {Array<{type: *}>} nodes - Rendered nodes.
 * @param {Array} links - Rendered links (only the count is used).
 * @param {Function} onResetView - Accepted for callers; not used here.
 */
function setupLegend(nodes, links, onResetView) {
  const nodeLegendContainer = document.getElementById("node-legend-container");
  const legendStatsEl = document.getElementById("legend-stats");
  const legendToggleBtn = document.getElementById("legend-toggle");
  const legendRoot = document.getElementById("legend-root");
  if (!nodeLegendContainer) return;

  nodeLegendContainer.innerHTML = "";
  if (legendStatsEl) {
    legendStatsEl.textContent = `${nodes.length} artifacts · ${links.length} links`;
  }

  const distinctTypes = new Set(nodes.map((node) => String(node.type || "").toLowerCase()));
  for (const type of [...distinctTypes].sort()) {
    const swatch = document.createElement("div");
    swatch.className = "legend-color";
    swatch.style.backgroundColor = typeColor(type);

    const caption = document.createElement("span");
    const rest = type.slice(1).replace(/_/g, " ");
    caption.textContent = `${type.charAt(0).toUpperCase()}${rest}`;

    const item = document.createElement("div");
    item.className = "legend-item";
    item.appendChild(swatch);
    item.appendChild(caption);
    item.onclick = () => {
      const hideNow = !graphState.hiddenTypes.has(type);
      if (hideNow) {
        graphState.hiddenTypes.add(type);
      } else {
        graphState.hiddenTypes.delete(type);
      }
      item.classList.toggle("inactive", hideNow);
      applyLegendVisibility();
    };
    nodeLegendContainer.appendChild(item);
  }

  if (legendToggleBtn && legendRoot) {
    legendToggleBtn.onclick = () => {
      const collapsed = legendRoot.classList.toggle("legend--collapsed");
      legendToggleBtn.textContent = collapsed ? "▾" : "▸";
    };
  }
}
/**
 * Renders the artifact graph as a D3 force-directed layout.
 *
 * Builds node/link data (dropping edges whose endpoints are not known
 * nodes), draws arrowed links and draggable, labelled nodes sized by degree,
 * enables zoom/pan, shows a tooltip on hover and the info panel on click,
 * starts the simulation, and rebuilds the legend.
 *
 * Dataset strings (names, types, LaTeX content) are HTML-escaped before
 * being inserted as markup. LaTeX often contains "<", ">" and "&", which
 * would otherwise be parsed as HTML and corrupt or inject content. The
 * browser decodes the entities back into text nodes, so MathJax still sees
 * the original characters.
 *
 * @param {{nodes?: Array<Object>, edges?: Array<Object>}} graph - Graph data
 *   from the dataset; nodes are copied so the input is not mutated by D3.
 */
function renderGraph(graph) {
  clearGraph();
  const container = document.querySelector(".graph-container");
  const width = container.clientWidth || 800;
  const height = container.clientHeight || 600;

  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const nodes = (graph.nodes || []).map((n) => ({ ...n }));
  const idMap = new Map(nodes.map((n) => [n.id, n]));
  // `typeof null` is "object", so endpoints are also checked for truthiness.
  const isNodeObject = (endpoint) => Boolean(endpoint) && typeof endpoint === "object";
  const links = (graph.edges || [])
    .map((e) => ({
      source: idMap.get(e.source) || e.source,
      target: idMap.get(e.target) || e.target,
      type: e.type || "edge",
    }))
    .filter((e) => isNodeObject(e.source) && isNodeObject(e.target));

  const degrees = new Map();
  links.forEach((l) => {
    degrees.set(l.source.id, (degrees.get(l.source.id) || 0) + 1);
    degrees.set(l.target.id, (degrees.get(l.target.id) || 0) + 1);
  });
  const radiusScale = d3.scaleSqrt().domain([0, 10]).range([10, 25]).clamp(true);
  const nodeRadius = (d) => radiusScale(degrees.get(d.id) || 0);

  // The zoom handler only runs on user interaction, after `g` is created.
  const zoom = d3.zoom().scaleExtent([0.1, 4]).on("zoom", (e) => g.attr("transform", e.transform));
  svg.attr("viewBox", `0 0 ${width} ${height}`).call(zoom);
  const g = svg.append("g");

  g.append("defs").append("marker")
    .attr("id", "arrow").attr("viewBox", "0 -5 10 10")
    .attr("refX", 18).attr("refY", 0).attr("markerWidth", 6).attr("markerHeight", 6)
    .attr("orient", "auto").append("path").attr("d", "M0,-5L10,0L0,5").attr("fill", "#777");

  const link = g.append("g").selectAll("line")
    .data(links).enter().append("line")
    .attr("stroke", "#555").attr("stroke-opacity", 0.6).attr("stroke-width", 1)
    .attr("marker-end", "url(#arrow)");

  const node = g.append("g").selectAll("g")
    .data(nodes).enter().append("g").attr("class", "graph-node")
    .call(d3.drag().on("start", dragStarted).on("drag", dragged).on("end", dragEnded));

  node.append("circle")
    .attr("r", nodeRadius)
    .attr("fill", (d) => typeColor(d.type))
    .attr("stroke", "#000");

  node.append("text")
    .attr("y", (d) => nodeRadius(d) + 12)
    .attr("text-anchor", "middle")
    .attr("class", "node-label")
    .text((d) => truncate(d.display_name || d.id, 20));

  node.on("mouseenter", (e, d) => {
    tooltip.style("opacity", 1)
      .html(`<b>${escapeHtml(d.display_name || d.id)}</b><br>${escapeHtml(d.type)}`)
      .style("left", (e.pageX + 10) + "px").style("top", (e.pageY + 10) + "px");
  }).on("mousemove", (e) => {
    tooltip.style("left", (e.pageX + 10) + "px").style("top", (e.pageY + 10) + "px");
  }).on("mouseleave", () => tooltip.style("opacity", 0))
    .on("click", (e, d) => {
      // Keep the click from reaching the SVG handler that closes the panel.
      e.stopPropagation();
      showNodeInfo(d, e.currentTarget);
    });

  // Clicking empty canvas closes the info panel and clears the selection.
  svg.on("click", () => {
    if (infoPanelEl) infoPanelEl.classList.remove("visible");
    if (graphState.selectedNodeElement) {
      graphState.selectedNodeElement.classList.remove("selected");
      graphState.selectedNodeElement = null;
    }
  });

  currentSimulation = d3.forceSimulation(nodes)
    .force("link", d3.forceLink(links).id((d) => d.id).distance(100))
    .force("charge", d3.forceManyBody().strength(-300))
    .force("center", d3.forceCenter(width / 2, height / 2))
    .force("collide", d3.forceCollide().radius((d) => nodeRadius(d) + 5))
    .on("tick", () => {
      link.attr("x1", (d) => d.source.x).attr("y1", (d) => d.source.y)
        .attr("x2", (d) => d.target.x).attr("y2", (d) => d.target.y);
      node.attr("transform", (d) => `translate(${d.x},${d.y})`);
    });

  graphRefs.nodeGroups = node;
  graphRefs.links = link;
  graphRefs.simulation = currentSimulation;
  setupLegend(nodes, links, () => { });

  // Pin the node to the pointer while dragging and keep the layout warm.
  function dragStarted(e, d) {
    if (!e.active) currentSimulation.alphaTarget(0.3).restart();
    d.fx = d.x; d.fy = d.y;
  }
  function dragged(e, d) { d.fx = e.x; d.fy = e.y; }
  // Release the node so the simulation positions it again.
  function dragEnded(e, d) {
    if (!e.active) currentSimulation.alphaTarget(0);
    d.fx = null; d.fy = null;
  }

  // Fill and open the info panel for a node and mark it as selected.
  function showNodeInfo(d, el) {
    infoTitleEl.textContent = d.display_name || d.id;
    let html = "";
    if (d.content) {
      html += `<h4>Content</h4><div class="math-content">${escapeHtml(cleanLatex(d.content))}</div>`;
    }
    infoBodyEl.innerHTML = html;
    infoPanelEl.classList.add("visible");
    void typesetMath([infoBodyEl]);
    if (graphState.selectedNodeElement) graphState.selectedNodeElement.classList.remove("selected");
    el.classList.add("selected");
    graphState.selectedNodeElement = el;
  }
}
/**
 * Renders the definition bank, or hides it when there is nothing to show.
 *
 * Terms and definition text come from the dataset and routinely contain
 * LaTeX with "<", ">" or "&". They are written through `textContent` so they
 * always appear as text (which MathJax then typesets) and are never parsed
 * as HTML.
 *
 * @param {Object<string, {term: string, definition_text: string}> | null} bank
 *   Definitions keyed by identifier; null or empty hides the section.
 */
function renderDefinitionBank(bank) {
  if (!definitionBankEl) return;
  if (!bank || Object.keys(bank).length === 0) {
    definitionBankEl.classList.add("hidden");
    return;
  }

  definitionBankEl.classList.remove("hidden");
  // Static markup only; this also clears any previously rendered list.
  definitionBankEl.innerHTML = "<h3>Definition Bank</h3>";

  const list = document.createElement("div");
  list.className = "definition-bank-list";

  Object.values(bank).forEach((def) => {
    const item = document.createElement("div");
    item.className = "definition-item-card";

    const termEl = document.createElement("strong");
    termEl.textContent = def.term;
    item.appendChild(termEl);

    const textEl = document.createElement("div");
    textEl.className = "math-content";
    textEl.textContent = cleanLatex(def.definition_text);
    item.appendChild(textEl);

    list.appendChild(item);
  });

  definitionBankEl.appendChild(list);
  void typesetMath([definitionBankEl]);
}
// === Main Flow ==========================================================
/**
 * Loads the paper named in the input box: looks it up in the dataset, fills
 * the header, requests better metadata in the background, and renders the
 * graph and definition bank.
 *
 * The load button is disabled for the duration and re-enabled in a
 * `finally`, so an unexpected failure (for example a dataset file with an
 * unexpected shape) cannot leave it disabled. Such failures are logged and
 * reported in the status line instead of surfacing as an unhandled
 * rejection.
 *
 * @returns {Promise<void>}
 */
async function handleLoadClick() {
  const raw = inputEl.value;
  const normalized = normalizeInputId(raw);
  if (!normalized) {
    setStatus("Please enter an arXiv ID.", "error");
    return;
  }

  loadBtn.disabled = true;
  try {
    clearGraph();
    if (heroCarouselEl) heroCarouselEl.classList.add("hidden");
    paperAreaEl.classList.add("hidden");
    paperHeaderEl.classList.add("hidden");
    graphSectionEl.classList.add("hidden");
    if (emptyStateEl) emptyStateEl.classList.add("hidden");
    setStatus("Searching HuggingFace dataset...", "info");

    const candidates = buildCandidateIds(normalized);
    const result = await fetchFirstExisting(candidates.length ? candidates : [normalized]);
    if (!result) {
      setStatus("Paper not found in dataset.", "error");
      renderMetadataSkeleton(normalized);
      paperAreaEl.classList.remove("hidden");
      if (heroCarouselEl) heroCarouselEl.classList.remove("hidden");
      return;
    }

    setStatus("", "info");
    const { arxivId, data } = result;
    paperAreaEl.classList.remove("hidden");
    paperHeaderEl.classList.remove("hidden");
    graphSectionEl.classList.remove("hidden");
    if (emptyStateEl) emptyStateEl.classList.add("hidden");
    renderMetadataSkeleton(arxivId);
    applyMetadataFromDataset(arxivId, data);

    // Try to get better metadata (Semantic Scholar/ArXiv) via the queue.
    // Runs in the background; a failure there must not break the load.
    enhanceMetadata(arxivId).catch((err) => {
      console.warn(`Failed to enhance metadata for ${arxivId}`, err);
    });

    if (data.graph) {
      // Deferred so the graph section has been un-hidden before rendering.
      setTimeout(() => renderGraph(data.graph), 50);
    } else {
      setStatus("No graph data available.", "error");
    }
    renderDefinitionBank(data.definition_bank);
  } catch (err) {
    console.error(`Failed to load ${normalized}`, err);
    setStatus("Failed to load paper.", "error");
  } finally {
    loadBtn.disabled = false;
  }
}
// Load on button click.
loadBtn.addEventListener("click", (event) => {
  event.preventDefault();
  handleLoadClick();
});
// Load when Enter is pressed in the input box.
inputEl.addEventListener("keydown", (event) => {
  if (event.key !== "Enter") return;
  event.preventDefault();
  handleLoadClick();
});
// Clicking the home title returns to the landing view.
if (homeTitleEl) {
  homeTitleEl.addEventListener("click", resetToLandingView);
}
// Initialize
initSamplePapers();