RBJin's picture
Upload 20 files
81cb6e0 verified
import { Paper, Reference } from '../types';
// ==================== Rate Limiter ====================
/**
 * Runs asynchronous tasks strictly one at a time, in submission order, and
 * keeps at least `minDelay` milliseconds between the moments two consecutive
 * tasks are started.
 *
 * A failing task rejects only its own promise; the tasks queued behind it
 * still run.
 */
class RateLimiter {
  /** Minimum gap, in milliseconds, between the start of consecutive tasks. */
  private minDelay: number;

  /** `Date.now()` timestamp at which the most recent task was started. */
  private lastRequestTime = 0;

  /** True while a drain loop is active, so only one loop consumes the queue. */
  private processing = false;

  /** Pending tasks together with the settle callbacks of their promises. */
  private queue: Array<{
    fn: () => Promise<unknown>;
    resolve: (value: unknown) => void;
    reject: (reason?: unknown) => void;
  }> = [];

  /**
   * @param minDelayMs Minimum gap between task starts, in milliseconds.
   */
  constructor(minDelayMs: number = 3000) {
    this.minDelay = minDelayMs;
  }

  /**
   * Queues `fn` and settles with its outcome once its turn has come.
   *
   * @param fn Task to run under the rate limit.
   * @returns A promise mirroring the result (or failure) of `fn`.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push({
        fn: fn as () => Promise<unknown>,
        resolve: resolve as (value: unknown) => void,
        reject,
      });
      // Fire-and-forget: the drain loop reports through the queued callbacks.
      void this.processQueue();
    });
  }

  /** Drains the queue, pausing before a task whenever the gap is too short. */
  private async processQueue() {
    if (this.processing || this.queue.length === 0) return;
    this.processing = true;

    for (let task = this.queue.shift(); task !== undefined; task = this.queue.shift()) {
      const waitMs = this.minDelay - (Date.now() - this.lastRequestTime);
      if (waitMs > 0) {
        await new Promise((wake) => setTimeout(wake, waitMs));
      }

      // The gap is measured from task start, not from task completion.
      this.lastRequestTime = Date.now();
      try {
        task.resolve(await task.fn());
      } catch (error) {
        task.reject(error);
      }
    }

    this.processing = false;
  }
}
// Shared limiter for arXiv API requests: at least 3500 ms between request starts.
const arxivLimiter = new RateLimiter(3500);
// Shared limiter for ar5iv page fetches: at least 2500 ms between request starts.
const ar5ivLimiter = new RateLimiter(2500);
// ==================== CORS Proxy ====================
/**
 * Performs one GET request bounded by a timeout and returns the body text.
 *
 * The timeout covers the whole exchange, including reading the body, and the
 * timer is always cleared so no abort timer outlives the request.
 *
 * @param url Address to request.
 * @param timeoutMs Milliseconds after which the request is aborted.
 * @returns The body text, or `null` on a network error, abort or non-2xx status.
 */
async function fetchTextWithTimeout(url: string, timeoutMs: number): Promise<string | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, { signal: controller.signal });
    return response.ok ? await response.text() : null;
  } catch {
    // Any failure simply means "this route did not work"; the caller tries the next one.
    return null;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetches `url` as text, first directly and then through public CORS proxies.
 *
 * The direct attempt is given 8 seconds; each proxy attempt is given 15
 * seconds. Routes are tried in order and the first successful body wins.
 *
 * @param url Absolute URL of the resource to download.
 * @returns The response body as text.
 * @throws Error `Failed to fetch: <url>` when every route fails.
 */
async function fetchWithProxy(url: string): Promise<string> {
  const directTimeoutMs = 8000;
  const proxyTimeoutMs = 15000;

  const direct = await fetchTextWithTimeout(url, directTimeoutMs);
  if (direct !== null) return direct;

  const proxies = [
    `https://api.allorigins.win/raw?url=${encodeURIComponent(url)}`,
    `https://corsproxy.io/?${encodeURIComponent(url)}`,
  ];
  for (const proxyUrl of proxies) {
    const proxied = await fetchTextWithTimeout(proxyUrl, proxyTimeoutMs);
    if (proxied !== null) return proxied;
  }

  throw new Error(`Failed to fetch: ${url}`);
}
// ==================== ArXiv Search API ====================
/**
 * Searches arXiv for papers matching every whitespace-separated term of `query`.
 *
 * Each term is percent-encoded before it is placed in the request URL, so
 * characters such as `+`, `&` or `#` are searched for literally instead of
 * altering the query string. The request is sent through the shared arXiv
 * rate limiter.
 *
 * @param query Free-text query; terms are combined with AND over all fields.
 * @param start Zero-based offset of the first result.
 * @param maxResults Maximum number of results to request.
 * @returns The parsed papers and the total reported by the API. A query with
 *   no terms yields an empty result without issuing a request.
 */
export async function searchArxiv(
  query: string,
  start: number = 0,
  maxResults: number = 10
): Promise<{ papers: Paper[]; total: number }> {
  const terms = query
    .trim()
    .split(/\s+/)
    .filter((term) => term.length > 0);
  if (terms.length === 0) return { papers: [], total: 0 };

  // '+' is the URL-encoded space the API expects around the AND operator.
  const searchQuery = terms.map((term) => `all:${encodeURIComponent(term)}`).join('+AND+');

  return arxivLimiter.execute(async () => {
    const url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const xml = await fetchWithProxy(url);
    return parseArxivAtom(xml);
  });
}
/**
 * Converts an arXiv Atom feed into paper records.
 *
 * @param xml Atom XML as returned by the arXiv query API.
 * @returns The papers built from every `<entry>` that has a title, plus the
 *   feed's `totalResults` value (0 when it is absent or not a number).
 */
function parseArxivAtom(xml: string): { papers: Paper[]; total: number } {
  const doc = new DOMParser().parseFromString(xml, 'application/xml');

  const totalEl =
    doc.querySelector('totalResults') ||
    doc.getElementsByTagNameNS('http://a9.com/-/spec/opensearch/1.1/', 'totalResults')[0];
  // Explicit radix, and never let NaN escape as the result count.
  const parsedTotal = Number.parseInt(totalEl?.textContent ?? '', 10);
  const total = Number.isNaN(parsedTotal) ? 0 : parsedTotal;

  const papers: Paper[] = [];
  for (const entry of Array.from(doc.getElementsByTagName('entry'))) {
    const getTag = (tag: string) =>
      entry.getElementsByTagName(tag)[0]?.textContent?.trim() || '';

    // The entry id is an abs URL; keep the bare identifier without its version suffix.
    const arxivId = getTag('id')
      .replace(/^https?:\/\/arxiv\.org\/abs\//, '')
      .replace(/v\d+$/, '');
    const title = getTag('title').replace(/\s+/g, ' ');
    const abstract = getTag('summary').replace(/\s+/g, ' ');
    const published = getTag('published');
    const updated = getTag('updated');

    const authors: string[] = [];
    for (const authorEl of Array.from(entry.getElementsByTagName('author'))) {
      const name = authorEl.getElementsByTagName('name')[0]?.textContent;
      if (name) authors.push(name);
    }

    const categories: string[] = [];
    for (const catEl of Array.from(entry.getElementsByTagName('category'))) {
      const term = catEl.getAttribute('term');
      if (term) categories.push(term);
    }

    // When several links are titled "pdf", the last one is used.
    let pdfLink = '';
    for (const linkEl of Array.from(entry.getElementsByTagName('link'))) {
      if (linkEl.getAttribute('title') === 'pdf') {
        pdfLink = linkEl.getAttribute('href') || '';
      }
    }

    if (title) {
      papers.push({
        id: arxivId,
        title,
        authors,
        abstract,
        published,
        updated,
        categories,
        pdfLink,
        htmlLink: `https://ar5iv.labs.arxiv.org/html/${arxivId}`,
        sectionsLoaded: false,
        sectionsLoading: false,
      });
    }
  }

  return { papers, total };
}
// ==================== ar5iv Section Parser ====================
/**
 * Downloads the ar5iv HTML rendering of a paper and extracts its key sections.
 *
 * The download is queued on the shared ar5iv rate limiter.
 *
 * @param arxivId arXiv identifier appended to the ar5iv HTML URL.
 * @returns Whatever sections and references the ar5iv parser could find.
 */
export async function fetchPaperSections(
  arxivId: string
): Promise<{
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
}> {
  const loadAndParse = async () => {
    const pageHtml = await fetchWithProxy(`https://ar5iv.labs.arxiv.org/html/${arxivId}`);
    return parseAr5ivHtml(pageHtml);
  };
  return ar5ivLimiter.execute(loadAndParse);
}
/**
 * Extracts the introduction, related-work and methods sections, plus the
 * bibliography, from an ar5iv HTML page.
 *
 * Sections are located through the first structural selector that matches
 * anything; if none matches, `h2`/`h3` headings are used and the heading's
 * parent element supplies the section HTML. Sections are recognised by
 * keywords in their lower-cased heading text.
 *
 * @param html Full HTML source of the page.
 * @returns The section HTML that was found; `references` is always set and
 *   may be an empty array.
 */
function parseAr5ivHtml(html: string): {
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
} {
  const doc = new DOMParser().parseFromString(html, 'text/html');
  // Drop page chrome and non-content elements before reading any section HTML.
  for (const junk of Array.from(doc.querySelectorAll('script, style, nav, header, footer'))) {
    junk.remove();
  }

  const result: ReturnType<typeof parseAr5ivHtml> = {};

  // Only the first selector with at least one hit is used.
  const sectionSelectors = [
    'section.ltx_section',
    'section.ltx_chapter',
    'div.ltx_section',
    'section[id]',
  ];
  const winningSelector = sectionSelectors.find((sel) => doc.querySelector(sel) !== null);
  const sections: Element[] = winningSelector
    ? Array.from(doc.querySelectorAll(winningSelector))
    : [];

  if (sections.length > 0) {
    const relatedMarkers = [
      'related work',
      'related works',
      'literature review',
      'background and related',
    ];
    const methodMarkers = [
      'method',
      'approach',
      'proposed',
      'architecture',
      'framework',
      'model description',
    ];

    for (const section of sections) {
      const heading = section.querySelector('h1, h2, h3, h4, .ltx_title');
      if (!heading) continue;

      const label = heading.textContent?.toLowerCase() || '';
      const mentions = (needle: string) => label.includes(needle);

      const isIntroduction = mentions('introduction') && !mentions('related');
      const isRelatedWork =
        relatedMarkers.some((marker) => mentions(marker)) ||
        (mentions('background') && mentions('work'));
      const isMethods = methodMarkers.some((marker) => mentions(marker));

      if (isIntroduction) {
        result.introduction = section.innerHTML;
      } else if (isRelatedWork) {
        result.relatedWork = section.innerHTML;
      } else if (!result.methods && isMethods) {
        // Only the first methods-like section is kept; the other two kinds keep the last match.
        result.methods = section.innerHTML;
      }
    }
  } else {
    // No structured sections: classify by heading text and take the heading's parent.
    for (const heading of Array.from(doc.querySelectorAll('h2, h3'))) {
      const container = heading.parentElement;
      if (!container) continue;

      const label = heading.textContent?.toLowerCase() || '';
      if (label.includes('introduction')) {
        result.introduction = container.innerHTML;
      } else if (label.includes('related work') || label.includes('background')) {
        result.relatedWork = container.innerHTML;
      } else if (label.includes('method') || label.includes('approach')) {
        result.methods = container.innerHTML;
      }
    }
  }

  // Bibliography entries.
  const linkIdPattern = /arxiv\.org\/abs\/(\d{4}\.\d{4,5})/;
  const textIdPattern = /arXiv[:\s]*(\d{4}\.\d{4,5})/i;
  const references: Reference[] = [];

  const bibItems = doc.querySelectorAll('.ltx_bibitem, li[id*="bib"], .ltx_biblist > li');
  for (const item of Array.from(bibItems)) {
    const tagEl = item.querySelector('.ltx_tag, .ltx_tag_bibitem');
    const number = tagEl?.textContent?.replace(/[\[\]]/g, '').trim() || '';
    const key = item.id || `ref-${number}`;

    // Prefer the dedicated bib blocks; otherwise use the item text minus its tag.
    const blocks = Array.from(item.querySelectorAll('.ltx_bibblock'));
    const rawText =
      blocks.length > 0
        ? blocks.map((block) => block.textContent + ' ').join('')
        : (item.textContent?.replace(tagEl?.textContent || '', '').trim() || '');
    const text = rawText.trim();

    // An arXiv id from a link wins (last matching link); the entry text is the fallback.
    let arxivId: string | undefined;
    for (const link of Array.from(item.querySelectorAll('a[href]'))) {
      const linkMatch = (link.getAttribute('href') || '').match(linkIdPattern);
      if (linkMatch) arxivId = linkMatch[1];
    }
    if (!arxivId) {
      const textMatch = text.match(textIdPattern);
      if (textMatch) arxivId = textMatch[1];
    }

    if (number || text) {
      references.push({ key, number, text, arxivId });
    }
  }

  result.references = references;
  return result;
}
// ==================== Translation ====================
/**
 * Translates text (optionally containing HTML markup) into `targetLang`.
 *
 * Tags are replaced by spaces and whitespace is collapsed before translation.
 * The plain text is translated in chunks of at most 4500 characters, with a
 * 300 ms pause between consecutive chunks; no pause follows the last chunk.
 *
 * @param text Source text or HTML.
 * @param targetLang Target language code.
 * @returns The translated chunks concatenated, or `''` for blank input.
 * @throws Whatever the chunk translator throws when a chunk cannot be translated.
 */
export async function translateText(
  text: string,
  targetLang: string = 'zh-CN'
): Promise<string> {
  if (!text || text.trim().length === 0) return '';

  // Strip HTML tags for translation
  const plainText = text.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  const chunks = splitIntoChunks(plainText, 4500);

  const results: string[] = [];
  for (let i = 0; i < chunks.length; i++) {
    if (i > 0) {
      // Space out successive requests; pausing after the final chunk would only delay the result.
      await new Promise((r) => setTimeout(r, 300));
    }
    results.push(await translateChunk(chunks[i], targetLang));
  }
  return results.join('');
}
/**
 * Splits `text` into consecutive chunks of at most `maxLen` UTF-16 code units.
 *
 * Concatenating the chunks always reproduces `text`. Break points are chosen
 * in this order of preference, each only if it lies in the second half of the
 * window so chunks do not become needlessly small:
 *   1. just after a sentence end (`". "`),
 *   2. just after a space,
 *   3. a hard cut at `maxLen`, moved so a surrogate pair is never split
 *      (a split pair would make `encodeURIComponent` throw later on).
 *
 * The only case where a chunk can exceed `maxLen` is `maxLen === 1` with a
 * surrogate pair at the cut, where the whole pair is emitted.
 *
 * @param text Text to split; an empty string yields an empty array.
 * @param maxLen Maximum chunk length; fractional values are rounded down.
 * @returns The chunks in order.
 * @throws RangeError when `maxLen` is not a number of at least 1.
 */
function splitIntoChunks(text: string, maxLen: number): string[] {
  // Also rejects NaN, which would otherwise never shrink `remaining`.
  if (!(maxLen >= 1)) {
    throw new RangeError(`maxLen must be at least 1, got ${maxLen}`);
  }
  const limit = Math.floor(maxLen);
  const minBreak = limit * 0.5;

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > limit) {
    let bp = limit;

    // Start the search two units early so that "sentEnd + 2" stays within the limit.
    const sentEnd = remaining.lastIndexOf('. ', limit - 2);
    const spaceAt = remaining.lastIndexOf(' ', limit - 1);

    if (sentEnd > minBreak) {
      bp = sentEnd + 2;
    } else if (spaceAt > minBreak) {
      bp = spaceAt + 1;
    } else {
      const before = remaining.charCodeAt(bp - 1);
      const after = remaining.charCodeAt(bp);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsPair) {
        // Step back before the pair, or past it when stepping back would make no progress.
        bp = bp > 1 ? bp - 1 : bp + 1;
      }
    }

    chunks.push(remaining.substring(0, bp));
    remaining = remaining.substring(bp);
  }

  if (remaining.length > 0) chunks.push(remaining);
  return chunks;
}
/**
 * Translates one chunk of English text into `targetLang`.
 *
 * The unofficial Google Translate endpoint is tried first. If it fails or
 * returns an unexpected payload, MyMemory is used as a fallback. MyMemory
 * requests are kept short, so longer input is translated piece by piece and
 * the pieces are concatenated; the fallback therefore covers the whole input
 * instead of only its beginning.
 *
 * @param text Plain text to translate.
 * @param targetLang Target language code.
 * @returns The translated text.
 * @throws Error when neither service produces a complete translation.
 */
async function translateChunk(text: string, targetLang: string): Promise<string> {
  const lang = encodeURIComponent(targetLang);

  // Try Google Translate unofficial API
  try {
    const url = `https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=${lang}&dt=t&q=${encodeURIComponent(text)}`;
    const response = await fetch(url);
    if (response.ok) {
      const data: unknown = await response.json();
      if (Array.isArray(data) && Array.isArray(data[0])) {
        // data[0] is a list of segments whose first element is the translated text.
        return data[0]
          .filter((item: unknown) => Array.isArray(item) && item[0])
          .map((item: unknown[]) => item[0])
          .join('');
      }
    }
  } catch {
    // fallthrough
  }

  // Fallback: MyMemory
  try {
    // Requests are capped at 500 characters; longer text is split with some headroom
    // because a chunk boundary may land slightly past the requested size.
    const maxRequestChars = 500;
    const pieces = text.length <= maxRequestChars ? [text] : splitIntoChunks(text, 480);

    const translated: string[] = [];
    for (const piece of pieces) {
      const url = `https://api.mymemory.translated.net/get?q=${encodeURIComponent(piece)}&langpair=en|${lang}`;
      const response = await fetch(url);
      if (!response.ok) throw new Error(`MyMemory responded with HTTP ${response.status}`);
      const data = await response.json();
      const pieceText: unknown = data?.responseData?.translatedText;
      if (data?.responseStatus !== 200 || typeof pieceText !== 'string') {
        throw new Error('MyMemory returned an unusable response');
      }
      translated.push(pieceText);
    }
    return translated.join('');
  } catch {
    // fallthrough — a partial translation is never returned
  }

  throw new Error('翻译失败,请稍后重试 / Translation failed');
}
// ==================== Fetch Paper By ID ====================
/**
 * Looks up a single paper by its arXiv identifier.
 *
 * The input is normalised first, so pasted values work as well as bare ids:
 * surrounding whitespace, an `arXiv:` prefix, an `arxiv.org/abs/` or
 * `arxiv.org/pdf/` URL (with or without scheme and `www.`), a trailing
 * `.pdf`, a query string or fragment, and a version suffix (`v2`) are all
 * removed. The request goes through the shared arXiv rate limiter.
 *
 * @param arxivId Identifier or arXiv URL of the paper.
 * @returns The paper, or `null` when the id is empty after normalisation or
 *   the API returns no entry for it.
 */
export async function fetchPaperById(arxivId: string): Promise<Paper | null> {
  const cleanId = arxivId
    .trim()
    .replace(/^arxiv:\s*/i, '')
    .replace(/^(?:https?:\/\/)?(?:www\.)?arxiv\.org\/(?:abs|pdf)\//i, '')
    .replace(/[?#].*$/, '')
    .replace(/\.pdf$/i, '')
    .replace(/v\d+$/, '');

  // Nothing to look up: skip the request and the rate-limiter slot.
  if (!cleanId) return null;

  return arxivLimiter.execute(async () => {
    const url = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(cleanId)}`;
    const xml = await fetchWithProxy(url);
    const { papers } = parseArxivAtom(xml);
    return papers.length > 0 ? papers[0] : null;
  });
}
// ==================== Helpers ====================
/**
 * Returns the text content of an HTML fragment with all markup removed.
 *
 * The fragment is parsed inside a separate document created with
 * `createHTMLDocument`, not inside the live page: markup from remote sources
 * must not be able to load resources or run inline event handlers (for
 * example `<img onerror=…>`) merely because its text is being extracted.
 *
 * @param html HTML fragment, possibly from an untrusted source.
 * @returns The concatenated text of the fragment, or `''` when it has none.
 */
export function extractPlainText(html: string): string {
  const inertDocument = document.implementation.createHTMLDocument('');
  const container = inertDocument.createElement('div');
  container.innerHTML = html;
  return container.textContent || '';
}