// BibTeX search utility - searches DBLP and Semantic Scholar for academic references export interface BibtexEntry { key: string; type: 'article' | 'inproceedings' | 'misc' | 'book' | 'phdthesis'; title: string; authors: string[]; year?: number; venue?: string; volume?: string; number?: string; pages?: string; doi?: string; arxiv_id?: string; url?: string; abstract?: string; citations?: number; bibtex: string; source: 'dblp' | 'semanticscholar'; } export interface BibtexSearchArgs { query: string; num?: number; year?: number; author?: string; } // Generate a citation key from title and year function generateKey(title: string, year?: number): string { const words = title.toLowerCase().split(/\s+/); const firstWord = words[0]?.replace(/[^a-z]/g, '') || 'unknown'; return `${firstWord}${year || ''}`; } // Format authors for BibTeX (Last, First and Last, First format) function formatAuthorsForBibtex(authors: string[]): string { return authors.join(' and '); } // Escape special characters for BibTeX function escapeBibtex(str: string): string { return str .replace(/&/g, '\\&') .replace(/%/g, '\\%') .replace(/_/g, '\\_') .replace(/\$/g, '\\$') .replace(/#/g, '\\#'); } // Generate BibTeX string from entry data function generateBibtexString(entry: Partial): string { const fields: string[] = []; if (entry.title) fields.push(` title = {${escapeBibtex(entry.title)}}`); if (entry.authors && entry.authors.length > 0) { fields.push(` author = {${formatAuthorsForBibtex(entry.authors)}}`); } if (entry.year) fields.push(` year = {${entry.year}}`); if (entry.venue) { const venueField = entry.type === 'inproceedings' ? 'booktitle' : 'journal'; fields.push(` ${venueField} = {${escapeBibtex(entry.venue)}}`); } if (entry.volume) fields.push(` volume = {${entry.volume}}`); if (entry.number) fields.push(` number = {${entry.number}}`); if (entry.pages) fields.push(` pages = {${entry.pages}}`); if (entry.doi) fields.push(` doi = {${entry.doi}}`); if (entry.url) fields.push(` url = {${entry.url}}`); if (entry.arxiv_id) { fields.push(` eprint = {${entry.arxiv_id}}`); fields.push(` archivePrefix = {arXiv}`); } const type = entry.type || 'misc'; const key = entry.key || generateKey(entry.title || 'unknown', entry.year); return `@${type}{${key},\n${fields.join(',\n')}\n}`; } // Search DBLP API export async function searchDblp(args: BibtexSearchArgs): Promise { const { query, num = 10, year, author } = args; // Build query string let searchQuery = query; if (author) { searchQuery = `${searchQuery} ${author}`; } const params = new URLSearchParams({ q: searchQuery, format: 'json', h: String(Math.min(num * 2, 100)), // Over-fetch for filtering }); try { const response = await fetch(`https://dblp.org/search/publ/api?${params}`, { headers: { 'Accept': 'application/json' }, signal: AbortSignal.timeout(5000), }); if (!response.ok) { return []; } const data = await response.json() as any; const hits = data?.result?.hits?.hit || []; const results: BibtexEntry[] = []; for (const hit of hits) { const info = hit.info; if (!info) continue; // Apply year filter const pubYear = info.year ? parseInt(info.year) : undefined; if (year && pubYear && pubYear < year) continue; // Parse authors let authors: string[] = []; if (info.authors?.author) { const authorList = Array.isArray(info.authors.author) ? info.authors.author : [info.authors.author]; authors = authorList.map((a: any) => typeof a === 'string' ? a : a.text || a._); } // Determine entry type let type: BibtexEntry['type'] = 'misc'; if (info.type === 'Conference and Workshop Papers') { type = 'inproceedings'; } else if (info.type === 'Journal Articles') { type = 'article'; } else if (info.type === 'Books and Theses') { type = 'book'; } const entry: Partial = { type, title: info.title?.replace(/\.$/, '') || '', // Remove trailing period authors, year: pubYear, venue: info.venue, volume: info.volume, number: info.number, pages: info.pages, doi: info.doi, url: info.ee || info.url, source: 'dblp', }; entry.key = generateKey(entry.title!, entry.year); entry.bibtex = generateBibtexString(entry); results.push(entry as BibtexEntry); if (results.length >= num) break; } return results; } catch (error) { // Timeout or network error return []; } } // Search Semantic Scholar API export async function searchSemanticScholar(args: BibtexSearchArgs): Promise { const { query, num = 10, year } = args; const params = new URLSearchParams({ query, limit: String(Math.min(num * 2, 100)), // Over-fetch for filtering fields: 'title,authors,year,venue,externalIds,abstract,citationCount,url', }); if (year) { params.set('year', `${year}-`); // >= year } try { const response = await fetch(`https://api.semanticscholar.org/graph/v1/paper/search?${params}`, { headers: { 'Accept': 'application/json' }, signal: AbortSignal.timeout(5000), }); if (!response.ok) { return []; } const data = await response.json() as any; const papers = data?.data || []; const results: BibtexEntry[] = []; for (const paper of papers) { if (!paper.title) continue; // Parse authors const authors = (paper.authors || []).map((a: any) => a.name).filter(Boolean); // Extract external IDs const externalIds = paper.externalIds || {}; const doi = externalIds.DOI; const arxivId = externalIds.ArXiv; // Determine entry type (default to article for S2) const type: BibtexEntry['type'] = paper.venue?.toLowerCase().includes('conference') ? 'inproceedings' : 'article'; const entry: Partial = { type, title: paper.title, authors, year: paper.year, venue: paper.venue, doi, arxiv_id: arxivId, url: paper.url, abstract: paper.abstract, citations: paper.citationCount, source: 'semanticscholar', }; entry.key = generateKey(entry.title!, entry.year); entry.bibtex = generateBibtexString(entry); results.push(entry as BibtexEntry); if (results.length >= num) break; } return results; } catch (error) { // Timeout or network error return []; } } // Normalize DOI for comparison function normalizeDoi(doi: string): string { return doi.toLowerCase() .replace(/^https?:\/\/doi\.org\//i, '') .replace(/^doi:/i, '') .trim(); } // Simple string similarity (Jaccard on words) function similarity(a: string, b: string): number { const wordsA = new Set(a.toLowerCase().split(/\s+/).filter(w => w.length > 2)); const wordsB = new Set(b.toLowerCase().split(/\s+/).filter(w => w.length > 2)); if (wordsA.size === 0 || wordsB.size === 0) return 0; let intersection = 0; for (const word of wordsA) { if (wordsB.has(word)) intersection++; } return intersection / Math.max(wordsA.size, wordsB.size); } // Deduplicate results from multiple sources export function deduplicateResults(results: BibtexEntry[]): BibtexEntry[] { const seen = new Map(); const seenDois = new Map(); // doi -> key const seenArxiv = new Map(); // arxiv_id -> key for (const entry of results) { // Check DOI match if (entry.doi) { const normalizedDoi = normalizeDoi(entry.doi); if (seenDois.has(normalizedDoi)) { // Merge with existing entry (prefer one with more data) const existingKey = seenDois.get(normalizedDoi)!; const existing = seen.get(existingKey)!; mergeEntries(existing, entry); continue; } seenDois.set(normalizedDoi, entry.key); } // Check arXiv ID match if (entry.arxiv_id) { const normalizedArxiv = entry.arxiv_id.replace(/v\d+$/, ''); // Remove version if (seenArxiv.has(normalizedArxiv)) { const existingKey = seenArxiv.get(normalizedArxiv)!; const existing = seen.get(existingKey)!; mergeEntries(existing, entry); continue; } seenArxiv.set(normalizedArxiv, entry.key); } // Check title similarity (for entries without DOI/arXiv) let isDuplicate = false; for (const [key, existing] of seen) { if (similarity(entry.title, existing.title) > 0.85 && entry.year === existing.year) { mergeEntries(existing, entry); isDuplicate = true; break; } } if (!isDuplicate) { seen.set(entry.key, entry); } } // Sort by year (descending) then title return Array.from(seen.values()).sort((a, b) => { if (a.year && b.year) { if (a.year !== b.year) return b.year - a.year; } return a.title.localeCompare(b.title); }); } // Merge two entries, keeping the most complete data function mergeEntries(target: BibtexEntry, source: BibtexEntry): void { // Keep longer abstract if (source.abstract && (!target.abstract || source.abstract.length > target.abstract.length)) { target.abstract = source.abstract; } // Keep higher citation count if (source.citations && (!target.citations || source.citations > target.citations)) { target.citations = source.citations; } // Fill in missing fields if (!target.doi && source.doi) target.doi = source.doi; if (!target.arxiv_id && source.arxiv_id) target.arxiv_id = source.arxiv_id; if (!target.url && source.url) target.url = source.url; if (!target.volume && source.volume) target.volume = source.volume; if (!target.pages && source.pages) target.pages = source.pages; // Regenerate bibtex with updated fields target.bibtex = generateBibtexString(target); } // Main search function - searches both providers and deduplicates export async function searchBibtex(args: BibtexSearchArgs): Promise { const { num = 10 } = args; // Search both providers in parallel const [dblpResults, s2Results] = await Promise.all([ searchDblp(args), searchSemanticScholar(args), ]); // Combine and deduplicate const combined = [...dblpResults, ...s2Results]; const deduplicated = deduplicateResults(combined); // Return requested number return deduplicated.slice(0, num); }