import { Paper, Reference } from '../types';

// ==================== Rate Limiter ====================

/**
 * Serializes asynchronous tasks and guarantees that at least `minDelayMs`
 * milliseconds elapse between the start of two consecutive tasks.
 *
 * Tasks run strictly in submission order (FIFO), one at a time.
 */
class RateLimiter {
  /** Timestamp (ms since epoch) at which the most recent task was started. */
  private lastRequestTime = 0;
  /** Pending tasks; each entry settles its caller's promise by itself. */
  private readonly queue: Array<() => Promise<void>> = [];
  /** True while a drain loop is active, so that only one loop ever runs. */
  private processing = false;
  private readonly minDelay: number;

  /**
   * @param minDelayMs Minimum spacing between task starts, in milliseconds.
   */
  constructor(minDelayMs: number = 3000) {
    this.minDelay = minDelayMs;
  }

  /**
   * Enqueues `fn` and returns a promise that settles with its outcome.
   *
   * @param fn Task to run once it reaches the head of the queue.
   * @returns The value `fn` resolves with; rejects with whatever `fn` throws.
   */
  execute<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      // The closure keeps `T` intact, so the queue itself needs no casts.
      this.queue.push(async () => {
        try {
          resolve(await fn());
        } catch (error) {
          reject(error);
        }
      });
      void this.processQueue();
    });
  }

  /** Drains the queue, sleeping between tasks to honor `minDelay`. */
  private async processQueue(): Promise<void> {
    if (this.processing) return;
    this.processing = true;
    try {
      let task = this.queue.shift();
      while (task !== undefined) {
        const elapsed = Date.now() - this.lastRequestTime;
        if (elapsed < this.minDelay) {
          const waitMs = this.minDelay - elapsed;
          await new Promise<void>((wake) => setTimeout(wake, waitMs));
        }
        // Stamp the start time before running so a slow task still counts.
        this.lastRequestTime = Date.now();
        await task();
        task = this.queue.shift();
      }
    } finally {
      this.processing = false;
    }
  }
}

const arxivLimiter = new RateLimiter(3500);
const ar5ivLimiter = new RateLimiter(2500);

// ==================== CORS Proxy ====================

/**
 * Fetches `url` as text, first directly and then through public CORS proxies.
 *
 * Each attempt is aborted if no response arrives within its timeout (8 s for
 * the direct attempt, 15 s per proxy). The timer is cleared as soon as the
 * response headers arrive, so reading the body is not subject to the timeout.
 *
 * @param url Absolute URL to retrieve.
 * @returns The response body of the first attempt that answers with a 2xx status.
 * @throws Error when the direct attempt and every proxy fail.
 */
async function fetchWithProxy(url: string): Promise<string> {
  // Returns the body on success and null on any failure (network, abort, non-2xx).
  const attempt = async (target: string, timeoutMs: number): Promise<string | null> => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
      let response: Response;
      try {
        response = await fetch(target, { signal: controller.signal });
      } finally {
        // Always release the timer, including when fetch itself rejects.
        clearTimeout(timer);
      }
      return response.ok ? await response.text() : null;
    } catch {
      return null;
    }
  };

  // Try direct fetch first.
  const direct = await attempt(url, 8000);
  if (direct !== null) return direct;

  // Direct fetch failed (typically CORS); fall back to the proxies in order.
  const proxies = [
    `https://api.allorigins.win/raw?url=${encodeURIComponent(url)}`,
    `https://corsproxy.io/?${encodeURIComponent(url)}`,
  ];
  for (const proxyUrl of proxies) {
    const proxied = await attempt(proxyUrl, 15000);
    if (proxied !== null) return proxied;
  }

  throw new Error(`Failed to fetch: ${url}`);
}

// ==================== ArXiv Search API ====================

/**
 * Searches arXiv for papers matching every whitespace-separated term of `query`.
 *
 * Requests go through the shared arXiv rate limiter. A blank query resolves to
 * an empty result without contacting the API.
 *
 * @param query Free-text query; terms are AND-ed and matched against all fields.
 * @param start Zero-based offset of the first result.
 * @param maxResults Maximum number of results to return.
 * @returns The parsed papers and the total hit count reported by the feed.
 */
export async function searchArxiv(
  query: string,
  start: number = 0,
  maxResults: number = 10
): Promise<{ papers: Paper[]; total: number }> {
  const terms = query.split(/\s+/).filter((term) => term.length > 0);
  if (terms.length === 0) return { papers: [], total: 0 };

  return arxivLimiter.execute(async () => {
    // Encode each term so characters such as '&', '#' or '+' cannot alter the URL.
    const searchQuery = terms.map((term) => `all:${encodeURIComponent(term)}`).join('+AND+');
    const url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const xml = await fetchWithProxy(url);
    return parseArxivAtom(xml);
  });
}

/**
 * Parses an arXiv Atom feed into `Paper` records.
 *
 * Entries without a title are skipped. `total` is 0 when the feed carries no
 * usable `totalResults` element.
 *
 * @param xml Raw Atom XML.
 */
function parseArxivAtom(xml: string): { papers: Paper[]; total: number } {
  const doc = new DOMParser().parseFromString(xml, 'application/xml');

  const totalEl =
    doc.querySelector('totalResults') ??
    doc.getElementsByTagNameNS('http://a9.com/-/spec/opensearch/1.1/', 'totalResults')[0];
  const parsedTotal = Number.parseInt(totalEl?.textContent ?? '0', 10);
  const total = Number.isNaN(parsedTotal) ? 0 : parsedTotal;

  const papers: Paper[] = [];
  for (const entry of Array.from(doc.getElementsByTagName('entry'))) {
    const textOf = (tag: string): string =>
      entry.getElementsByTagName(tag)[0]?.textContent?.trim() ?? '';

    const title = textOf('title').replace(/\s+/g, ' ');
    if (!title) continue;

    // The <id> is the abstract URL; strip the prefix and the version suffix.
    const arxivId = textOf('id')
      .replace(/^https?:\/\/arxiv\.org\/abs\//, '')
      .replace(/v\d+$/, '');

    const authors: string[] = [];
    for (const authorEl of Array.from(entry.getElementsByTagName('author'))) {
      const name = authorEl.getElementsByTagName('name')[0]?.textContent;
      if (name) authors.push(name);
    }

    const categories: string[] = [];
    for (const categoryEl of Array.from(entry.getElementsByTagName('category'))) {
      const term = categoryEl.getAttribute('term');
      if (term) categories.push(term);
    }

    // If several links are titled "pdf", the last one wins.
    let pdfLink = '';
    for (const linkEl of Array.from(entry.getElementsByTagName('link'))) {
      if (linkEl.getAttribute('title') === 'pdf') {
        pdfLink = linkEl.getAttribute('href') ?? '';
      }
    }

    papers.push({
      id: arxivId,
      title,
      authors,
      abstract: textOf('summary').replace(/\s+/g, ' '),
      published: textOf('published'),
      updated: textOf('updated'),
      categories,
      pdfLink,
      htmlLink: `https://ar5iv.labs.arxiv.org/html/${arxivId}`,
      sectionsLoaded: false,
      sectionsLoading: false,
    });
  }

  return { papers, total };
}

// ==================== ar5iv Section Parser ====================

/**
 * Downloads the ar5iv HTML rendering of a paper and extracts its key sections.
 *
 * Requests go through the shared ar5iv rate limiter.
 *
 * @param arxivId arXiv identifier, inserted into the ar5iv URL as given.
 * @returns Section HTML (when found) and the parsed reference list.
 * @throws Error when the page cannot be fetched directly or via any proxy.
 */
export async function fetchPaperSections(
  arxivId: string
): Promise<{
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
}> {
  return ar5ivLimiter.execute(async () => {
    const url = `https://ar5iv.labs.arxiv.org/html/${arxivId}`;
    const html = await fetchWithProxy(url);
    return parseAr5ivHtml(html);
  });
}

/**
 * Extracts introduction, related-work and methods sections plus the
 * bibliography from an ar5iv HTML page.
 *
 * Section values are `innerHTML` strings. `references` is always set and may
 * be empty.
 *
 * @param html Raw HTML of the ar5iv page.
 */
function parseAr5ivHtml(html: string): {
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
} {
  const doc = new DOMParser().parseFromString(html, 'text/html');
  doc.querySelectorAll('script, style, nav, header, footer').forEach((el) => el.remove());

  const result: ReturnType<typeof parseAr5ivHtml> = {};

  // Try multiple selectors for sections; the first one that matches anything wins.
  const sectionSelectors = [
    'section.ltx_section',
    'section.ltx_chapter',
    'div.ltx_section',
    'section[id]',
  ];
  let sections: Element[] = [];
  for (const selector of sectionSelectors) {
    const found = doc.querySelectorAll(selector);
    if (found.length > 0) {
      sections = Array.from(found);
      break;
    }
  }

  if (sections.length === 0) {
    // No structured sections: classify by heading text and take the heading's parent.
    doc.querySelectorAll('h2, h3').forEach((heading) => {
      const parent = heading.parentElement;
      if (!parent) return;
      const text = heading.textContent?.toLowerCase() ?? '';
      if (text.includes('introduction')) {
        result.introduction = parent.innerHTML;
      } else if (text.includes('related work') || text.includes('background')) {
        result.relatedWork = parent.innerHTML;
      } else if (text.includes('method') || text.includes('approach')) {
        result.methods = parent.innerHTML;
      }
    });
  } else {
    const methodKeywords = [
      'method',
      'approach',
      'proposed',
      'architecture',
      'framework',
      'model description',
    ];
    for (const section of sections) {
      const heading = section.querySelector('h1, h2, h3, h4, .ltx_title');
      if (!heading) continue;
      const headingText = heading.textContent?.toLowerCase() ?? '';

      if (headingText.includes('introduction') && !headingText.includes('related')) {
        result.introduction = section.innerHTML;
      } else if (
        headingText.includes('related work') ||
        headingText.includes('literature review') ||
        headingText.includes('background and related') ||
        (headingText.includes('background') && headingText.includes('work'))
      ) {
        result.relatedWork = section.innerHTML;
      } else if (
        // Only the first methods-like section is kept.
        !result.methods &&
        methodKeywords.some((keyword) => headingText.includes(keyword))
      ) {
        result.methods = section.innerHTML;
      }
    }
  }

  // Parse references.
  const arxivHrefPattern = /arxiv\.org\/abs\/(\d{4}\.\d{4,5})/;
  const arxivTextPattern = /arXiv[:\s]*(\d{4}\.\d{4,5})/i;
  const references: Reference[] = [];

  const bibItems = doc.querySelectorAll('.ltx_bibitem, li[id*="bib"], .ltx_biblist > li');
  for (const item of Array.from(bibItems)) {
    const tagText = item.querySelector('.ltx_tag, .ltx_tag_bibitem')?.textContent ?? '';
    const number = tagText.replace(/[\[\]]/g, '').trim();
    const key = item.id || `ref-${number}`;

    // Prefer the structured bib blocks; otherwise use the item text minus its tag.
    const blocks = Array.from(item.querySelectorAll('.ltx_bibblock'));
    const text = (
      blocks.length > 0
        ? blocks.map((block) => block.textContent ?? '').join(' ')
        : (item.textContent ?? '').replace(tagText, '')
    ).trim();

    // An arXiv id from a link takes precedence over one mentioned in the text.
    let arxivId: string | undefined;
    for (const link of Array.from(item.querySelectorAll('a[href]'))) {
      const hrefMatch = (link.getAttribute('href') ?? '').match(arxivHrefPattern);
      if (hrefMatch) arxivId = hrefMatch[1];
    }
    if (!arxivId) {
      const textMatch = text.match(arxivTextPattern);
      if (textMatch) arxivId = textMatch[1];
    }

    if (number || text) {
      references.push({ key, number, text, arxivId });
    }
  }
  result.references = references;

  return result;
}

// ==================== Translation ====================

/**
 * Translates English text (optionally containing HTML markup) into `targetLang`.
 *
 * HTML tags are stripped and whitespace collapsed before translation. Long
 * input is translated in chunks of at most 4500 characters, with a 300 ms
 * pause between chunks.
 *
 * @param text Source text or HTML.
 * @param targetLang Target language code.
 * @returns The translated plain text, or '' for blank input.
 * @throws Error when every translation backend fails for a chunk.
 */
export async function translateText(
  text: string,
  targetLang: string = 'zh-CN'
): Promise<string> {
  if (!text || text.trim().length === 0) return '';

  // Strip HTML tags for translation.
  const plainText = text.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  const chunks = splitIntoChunks(plainText, 4500);

  const results: string[] = [];
  for (const [index, chunk] of chunks.entries()) {
    // Pause between requests only; there is nothing to wait for after the last one.
    if (index > 0) {
      await new Promise<void>((wake) => setTimeout(wake, 300));
    }
    results.push(await translateChunk(chunk, targetLang));
  }
  return results.join('');
}

/**
 * Splits `text` into consecutive chunks of at most `maxLen` characters.
 *
 * A chunk ends just after a sentence boundary ('. ') when one lies in the
 * second half of the window; otherwise it is cut at exactly `maxLen`.
 * Concatenating the chunks reproduces `text`.
 *
 * @param text Text to split.
 * @param maxLen Maximum chunk length; must be at least 1.
 * @throws RangeError when `maxLen` is less than 1 or not a number.
 */
function splitIntoChunks(text: string, maxLen: number): string[] {
  if (!(maxLen >= 1)) {
    throw new RangeError(`maxLen must be at least 1, got ${maxLen}`);
  }

  const chunks: string[] = [];
  let remaining = text;
  while (remaining.length > maxLen) {
    let breakpoint = maxLen;
    // Search up to maxLen - 2 so that including the '. ' never exceeds maxLen.
    const sentenceEnd = remaining.lastIndexOf('. ', maxLen - 2);
    if (sentenceEnd > maxLen * 0.5) breakpoint = sentenceEnd + 2;
    chunks.push(remaining.substring(0, breakpoint));
    remaining = remaining.substring(breakpoint);
  }
  if (remaining.length > 0) chunks.push(remaining);
  return chunks;
}

/**
 * Translates one chunk of English text, trying Google Translate's unofficial
 * endpoint first and MyMemory as a fallback.
 *
 * @param text Plain text to translate.
 * @param targetLang Target language code.
 * @returns The translated text.
 * @throws Error when both backends fail.
 */
async function translateChunk(text: string, targetLang: string): Promise<string> {
  // Try Google Translate unofficial API.
  try {
    const url = `https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=${encodeURIComponent(targetLang)}&dt=t&q=${encodeURIComponent(text)}`;
    const response = await fetch(url);
    if (response.ok) {
      const data: unknown = await response.json();
      if (Array.isArray(data) && Array.isArray(data[0])) {
        // data[0] is a list of segments whose first element is the translated text.
        const segments: unknown[] = data[0];
        return segments
          .map((segment) =>
            Array.isArray(segment) && typeof segment[0] === 'string' ? segment[0] : ''
          )
          .join('');
      }
    }
  } catch {
    // Fall through to the next backend.
  }

  // Fallback: MyMemory. Queries are sent in pieces of at most 500 characters
  // so that the whole chunk is translated rather than only its first 500.
  try {
    const translatedPieces: string[] = [];
    for (const piece of splitIntoChunks(text, 500)) {
      const url = `https://api.mymemory.translated.net/get?q=${encodeURIComponent(piece)}&langpair=en|${encodeURIComponent(targetLang)}`;
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`MyMemory responded with HTTP ${response.status}`);
      }
      const data = (await response.json()) as {
        responseStatus?: unknown;
        responseData?: { translatedText?: unknown } | null;
      } | null;
      const translated = data?.responseData?.translatedText;
      if (data?.responseStatus !== 200 || typeof translated !== 'string') {
        throw new Error('MyMemory returned an unusable response');
      }
      translatedPieces.push(translated);
    }
    return translatedPieces.join('');
  } catch {
    // Fall through to the final error.
  }

  throw new Error('翻译失败,请稍后重试 / Translation failed');
}

// ==================== Fetch Paper By ID ====================

/**
 * Looks up a single paper by arXiv id or abstract URL.
 *
 * An `http(s)://arxiv.org/abs/` prefix and a trailing version suffix (`vN`)
 * are removed before querying. Requests go through the shared arXiv rate limiter.
 *
 * @param arxivId arXiv identifier or abstract URL.
 * @returns The paper, or null when the feed contains no titled entry.
 * @throws Error when the feed cannot be fetched directly or via any proxy.
 */
export async function fetchPaperById(arxivId: string): Promise<Paper | null> {
  return arxivLimiter.execute(async () => {
    const cleanId = arxivId
      .replace(/^https?:\/\/arxiv\.org\/abs\//, '')
      .replace(/v\d+$/, '');
    const url = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(cleanId)}`;
    const xml = await fetchWithProxy(url);
    const { papers } = parseArxivAtom(xml);
    return papers[0] ?? null;
  });
}

// ==================== Helpers ====================

/**
 * Returns the text content of an HTML fragment.
 *
 * The fragment is parsed with `DOMParser` into a separate, inert document, so
 * markup from remote pages cannot run inline event handlers (for example
 * `<img onerror=...>`) the way assigning it to `innerHTML` of an element of
 * the live document can. Only text inside the parsed `<body>` is returned.
 *
 * @param html HTML fragment, possibly from an untrusted source.
 */
export function extractPlainText(html: string): string {
  const doc = new DOMParser().parseFromString(html, 'text/html');
  return doc.body.textContent ?? '';
}