Spaces:

Duplicated from Elysia-Suite/ivy-rss-hub

ijohn07
/

ivy-rss-hub

Running

App Files Files Community

ivy-rss-hub / scripts /rss-parser.js

ijohn07's picture

Upload 16 files

790eee5 verified about 2 months ago

history blame contribute delete

15.5 kB

	/* ============================================
	IVY'S RSS HUB — RSS Parser Module
	Fetches and parses RSS/Atom feeds
	Uses Dexie (IndexedDB) for caching 🌿
	============================================ */

	/**
	 * RSSParser - fetches RSS/Atom feeds through CORS proxies and parses them
	 * into plain objects.
	 *
	 * Raw feed XML is cached in an in-memory Map and mirrored to IndexedDB via
	 * Dexie. The class relies on browser globals (`window`, `fetch`,
	 * `DOMParser`, `localStorage`) and on a global `Dexie` constructor.
	 */
	class RSSParser {
		constructor() {
			// Each entry is called with the feed URL and returns the proxied URL.
			this.corsProxies = window.FeedsConfig.CORS_PROXIES;
			this.currentProxyIndex = 0;
			this.cache = new Map(); // Memory cache for current session
			this.cacheTimeout = 5 * 60 * 1000; // Cached XML is served for 5 minutes
			this.maxArticlesPerFeed = 25; // Parse at most 25 articles per feed
			this.fetchTimeout = 15 * 1000; // Abort a proxy request after 15 seconds
			this.dbReady = false;

			// Keep the init promise so later writes can wait for the database
			// instead of being dropped while it is still opening.
			this.dbInit = this.initDatabase();
		}

		/**
		 * Initialize the Dexie database used for persistent caching and load
		 * its still-valid entries into the memory cache.
		 * Never rejects: failures are logged and leave `dbReady` false.
		 * @returns {Promise<void>}
		 */
		async initDatabase() {
			try {
				this.db = new Dexie("IvyRSSHubCache");
				// NOTE(review): `data` is listed in the schema string, which
				// presumably makes Dexie index the full XML payload - confirm
				// and consider dropping it in a version(2) schema.
				this.db.version(1).stores({
					feeds: "url, data, timestamp"
				});

				await this.loadPersistentCache();
				this.dbReady = true;
				console.log("📦 Dexie database ready");
			} catch (e) {
				console.warn("Failed to init Dexie:", e);
				this.dbReady = false;
			}
		}

		/**
		 * Load cache entries from IndexedDB into memory.
		 * Entries older than two hours are deleted from IndexedDB. An entry
		 * already present in memory is kept when it is at least as recent as
		 * the persisted one, so a fetch that completed while the database was
		 * opening is not overwritten by older data.
		 * @returns {Promise<void>}
		 */
		async loadPersistentCache() {
			try {
				if (!this.db) return;

				const maxAge = 2 * 60 * 60 * 1000; // 2 hours max
				const now = Date.now();
				const cached = await this.db.feeds.toArray();

				let loaded = 0;
				for (const item of cached) {
					if (now - item.timestamp < maxAge) {
						const existing = this.cache.get(item.url);
						if (!existing || existing.timestamp < item.timestamp) {
							this.cache.set(item.url, {
								data: item.data,
								timestamp: item.timestamp
							});
							loaded++;
						}
					} else {
						// Delete old entries
						await this.db.feeds.delete(item.url);
					}
				}

				if (loaded > 0) {
					console.log(`📦 Loaded ${loaded} cached feeds from IndexedDB`);
				}
			} catch (e) {
				console.warn("Failed to load cache:", e);
			}
		}

		/**
		 * Save a feed to IndexedDB. Waits for database initialisation first;
		 * does nothing when the database is unavailable. Never rejects.
		 * @param {string} url - Feed URL (primary key)
		 * @param {string} data - Raw XML content
		 * @param {number} timestamp - Fetch time in ms since the epoch
		 * @returns {Promise<void>}
		 */
		async saveToPersistentCache(url, data, timestamp) {
			try {
				await this.dbInit;
				if (!this.db || !this.dbReady) return;

				await this.db.feeds.put({ url, data, timestamp });
			} catch (e) {
				console.warn("Failed to save to cache:", e);
			}
		}

		/**
		 * Fetch feed XML, trying each CORS proxy in turn (starting with the
		 * one that worked last) until one returns something that looks like XML.
		 * @param {string} feedUrl - The RSS feed URL
		 * @returns {Promise<{data: string, fromCache: boolean}>} Raw XML and
		 *   whether it came from the memory cache
		 * @throws {Error} When every proxy fails or times out
		 */
		async fetchWithProxy(feedUrl) {
			// Check cache first
			const cached = this.cache.get(feedUrl);
			if (cached && Date.now() - cached.timestamp < this.cacheTimeout) {
				return { data: cached.data, fromCache: true };
			}

			let lastError = null;

			for (let i = 0; i < this.corsProxies.length; i++) {
				const proxyIndex = (this.currentProxyIndex + i) % this.corsProxies.length;
				const proxyUrl = this.corsProxies[proxyIndex](feedUrl);

				// Without a timeout a proxy that never answers would block the
				// caller (and the whole batch it belongs to) indefinitely.
				const controller = new AbortController();
				const timer =
					this.fetchTimeout > 0 ? setTimeout(() => controller.abort(), this.fetchTimeout) : null;

				try {
					const response = await fetch(proxyUrl, {
						headers: {
							Accept: "application/rss+xml, application/xml, text/xml, application/atom+xml"
						},
						signal: controller.signal
					});

					if (!response.ok) {
						throw new Error(`HTTP ${response.status}`);
					}

					const text = await response.text();

					// Cheap sanity check that the proxy returned a feed
					if (!text.includes("<?xml") && !text.includes("<rss") && !text.includes("<feed")) {
						throw new Error("Invalid XML response");
					}

					const timestamp = Date.now();
					this.cache.set(feedUrl, { data: text, timestamp });

					// Fire-and-forget: saveToPersistentCache handles its own errors
					this.saveToPersistentCache(feedUrl, text, timestamp);

					// Remember working proxy
					this.currentProxyIndex = proxyIndex;

					return { data: text, fromCache: false };
				} catch (error) {
					lastError = controller.signal.aborted
						? new Error(`Timed out after ${this.fetchTimeout}ms`)
						: error;
					// Use debug level for fallback attempts (less noise in console)
					console.debug(`[RSS] Proxy ${proxyIndex} failed for ${feedUrl}:`, lastError.message);
				} finally {
					clearTimeout(timer);
				}
			}

			throw new Error(`All proxies failed: ${lastError?.message ?? "no CORS proxies configured"}`);
		}

		/**
		 * Parse RSS/Atom XML to structured data.
		 * @param {string} xmlText - Raw XML content
		 * @returns {{title: string, description: string, link: string, items: Array<Object>}}
		 * @throws {Error} When the XML is malformed or has no channel/feed element
		 */
		parseXML(xmlText) {
			const doc = new DOMParser().parseFromString(xmlText, "text/xml");

			// DOMParser reports errors through a <parsererror> element
			if (doc.querySelector("parsererror")) {
				throw new Error("XML parsing failed");
			}

			// A <feed> element means Atom; anything else is handled as RSS
			const isAtom = doc.querySelector("feed") !== null;
			return isAtom ? this.parseAtom(doc) : this.parseRSS(doc);
		}

		/**
		 * Parse RSS 2.0 format. Items without a title or link are dropped.
		 * @param {Document} doc - Parsed XML document
		 * @returns {{title: string, description: string, link: string, items: Array<Object>}}
		 * @throws {Error} When there is no <channel> element
		 */
		parseRSS(doc) {
			const channel = doc.querySelector("channel");
			if (!channel) {
				throw new Error("Invalid RSS: no channel element");
			}

			// Limit to maxArticlesPerFeed for performance
			const items = Array.from(doc.querySelectorAll("item"))
				.slice(0, this.maxArticlesPerFeed)
				.map(item => ({
					title: this.getTextContent(item, "title"),
					link: this.getTextContent(item, "link"),
					description: this.cleanDescription(this.getTextContent(item, "description")),
					pubDate: this.parseDate(this.getTextContent(item, "pubDate")),
					author: this.getTextContent(item, "author") || this.getDCCreator(item)
				}));

			return {
				title: this.getTextContent(channel, "title"),
				description: this.getTextContent(channel, "description"),
				link: this.getTextContent(channel, "link"),
				items: items.filter(item => item.title && item.link)
			};
		}

		/**
		 * Get dc:creator content, trying the prefixed and the bare tag name.
		 * @param {Element} item - An RSS <item> element
		 * @returns {string} Trimmed creator text, or "" when absent
		 */
		getDCCreator(item) {
			const creator = item.getElementsByTagName("dc:creator")[0] || item.getElementsByTagName("creator")[0];
			return creator?.textContent?.trim() || "";
		}

		/**
		 * Parse Atom format. Entries without a title or link are dropped.
		 * @param {Document} doc - Parsed XML document
		 * @returns {{title: string, description: string, link: string, items: Array<Object>}}
		 * @throws {Error} When there is no <feed> element
		 */
		parseAtom(doc) {
			const feed = doc.querySelector("feed");
			if (!feed) {
				throw new Error("Invalid Atom: no feed element");
			}

			// Limit to maxArticlesPerFeed for performance
			const items = Array.from(doc.querySelectorAll("entry"))
				.slice(0, this.maxArticlesPerFeed)
				.map(entry => {
					// Atom links are usually <link href="...">; fall back to text
					const linkElement = entry.querySelector('link[rel="alternate"]') || entry.querySelector("link");
					const link = linkElement?.getAttribute("href") || this.getTextContent(entry, "link");

					// Join the first 3 authors and add "et al." when there are more
					const names = Array.from(entry.querySelectorAll("author name"))
						.map(el => el.textContent?.trim())
						.filter(Boolean);
					let author = names.slice(0, 3).join(", ");
					if (names.length > 3) author += " et al.";

					return {
						title: this.getTextContent(entry, "title"),
						link,
						description: this.cleanDescription(
							this.getTextContent(entry, "summary") || this.getTextContent(entry, "content")
						),
						pubDate: this.parseDate(
							this.getTextContent(entry, "published") || this.getTextContent(entry, "updated")
						),
						author
					};
				});

			const titleLink = feed.querySelector('link[rel="alternate"]') || feed.querySelector("link");

			return {
				title: this.getTextContent(feed, "title"),
				description: this.getTextContent(feed, "subtitle"),
				link: titleLink?.getAttribute("href") || "",
				items: items.filter(item => item.title && item.link)
			};
		}

		/**
		 * Helper: get the trimmed text of the element matching `selector`.
		 *
		 * Direct children are checked first and the first one with non-empty
		 * text wins. An unprefixed type selector matches elements of any
		 * namespace, so a plain `querySelector("link")` can return an empty
		 * <atom:link/> that precedes the real <link>, or a <title> nested in a
		 * sibling such as <image>. When no direct child qualifies, the first
		 * matching descendant is used.
		 * @param {Element} parent - Element to search in
		 * @param {string} selector - CSS selector
		 * @returns {string} Trimmed text, or "" when nothing matches
		 */
		getTextContent(parent, selector) {
			for (const child of parent.children ?? []) {
				if (child.matches(selector)) {
					const text = child.textContent?.trim();
					if (text) return text;
				}
			}

			return parent.querySelector(selector)?.textContent?.trim() || "";
		}

		/**
		 * Helper: parse a date string to a Date object.
		 * @param {string} dateStr - Date text taken from the feed
		 * @returns {Date|null} The parsed date, or null when empty or unparseable
		 */
		parseDate(dateStr) {
			if (!dateStr) return null;

			// `new Date(string)` never throws; bad input yields an invalid Date
			const date = new Date(dateStr);
			return Number.isNaN(date.getTime()) ? null : date;
		}

		/**
		 * Helper: reduce an HTML description to plain text.
		 * Whitespace is collapsed and the result is limited to 200 UTF-16 code
		 * units followed by "...". A description containing only markup
		 * (e.g. a single <img>) yields "".
		 * @param {string} html - Description markup from the feed
		 * @returns {string} Plain-text summary
		 */
		cleanDescription(html) {
			if (!html) return "";

			let text;
			try {
				// Parsing with DOMParser does not execute scripts
				const doc = new DOMParser().parseFromString(html, "text/html");
				text = doc.body.textContent || "";
			} catch {
				// DOMParser unavailable or failed: strip tags with a regex
				text = html.replace(/<[^>]*>/g, "");
			}

			// Trim and normalize whitespace
			text = text.trim().replace(/\s+/g, " ");

			if (text.length > 200) {
				let end = 200;
				// Do not cut between the two halves of a surrogate pair
				const lastUnit = text.charCodeAt(end - 1);
				if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
				text = text.substring(0, end) + "...";
			}

			return text;
		}

		/**
		 * Main method: fetch and parse a feed. Never rejects; failures are
		 * reported through `status: "error"` on the returned object.
		 * @param {Object} feedConfig - Feed configuration object (reads `url` and `name`)
		 * @returns {Promise<Object>} feedConfig extended with `feed`, `status`,
		 *   `lastFetched` and either `fromCache` or `error`
		 */
		async fetchFeed(feedConfig) {
			try {
				const { data: xmlText, fromCache } = await this.fetchWithProxy(feedConfig.url);
				const parsed = this.parseXML(xmlText);

				return {
					...feedConfig,
					feed: parsed,
					status: "success",
					fromCache,
					lastFetched: new Date()
				};
			} catch (error) {
				console.error(`Failed to fetch ${feedConfig.name}:`, error);

				return {
					...feedConfig,
					feed: null,
					status: "error",
					error: error.message,
					lastFetched: new Date()
				};
			}
		}

		/**
		 * Helper: convert one Promise.allSettled outcome into a feed result,
		 * giving rejections the same shape fetchFeed uses for errors.
		 * @param {{status: string, value?: Object, reason?: any}} settled - Settled outcome
		 * @param {Object} config - Feed configuration the outcome belongs to
		 * @returns {Object} Feed result
		 */
		settledToResult(settled, config) {
			if (settled.status === "fulfilled") {
				return settled.value;
			}

			return {
				...config,
				feed: null,
				status: "error",
				error: settled.reason?.message || "Unknown error",
				lastFetched: new Date()
			};
		}

		/**
		 * Fetch multiple feeds in batches, reporting progress after each batch.
		 * Feeds inside a batch are fetched in parallel; batches are separated
		 * by a pause to be gentle with rate-limited APIs. The default pause is
		 * 1 second - pass a larger `batchDelay` (e.g. 3000) for stricter limits.
		 * @param {Array} feedConfigs - Array of feed configurations
		 * @param {Function} [onProgress] - Called after each batch with
		 *   (resultsSoFar, completedCount, totalCount)
		 * @param {Object} [options]
		 * @param {number} [options.batchSize=5] - Feeds fetched in parallel per batch
		 * @param {number} [options.batchDelay=1000] - Pause between batches in ms
		 * @returns {Promise<Array>} - Array of parsed feeds, in input order
		 */
		async fetchAllFeedsProgressive(feedConfigs, onProgress, { batchSize = 5, batchDelay = 1000 } = {}) {
			const results = [];
			// Guard against 0/NaN/negative sizes, which would loop forever
			const size = Math.max(1, Math.floor(batchSize) || 1);

			for (let i = 0; i < feedConfigs.length; i += size) {
				const batch = feedConfigs.slice(i, i + size);
				const batchResults = await Promise.allSettled(batch.map(config => this.fetchFeed(config)));

				batchResults.forEach((result, idx) => {
					results.push(this.settledToResult(result, batch[idx]));
				});

				if (onProgress) {
					onProgress(results, i + batch.length, feedConfigs.length);
				}

				// Only pause when there are more batches to process
				if (i + size < feedConfigs.length && batchDelay > 0) {
					await new Promise(resolve => setTimeout(resolve, batchDelay));
				}
			}

			return results;
		}

		/**
		 * Fetch multiple feeds in parallel (legacy).
		 * @param {Array} feedConfigs - Array of feed configurations
		 * @returns {Promise<Array>} - Array of parsed feeds, in input order
		 */
		async fetchAllFeeds(feedConfigs) {
			const results = await Promise.allSettled(feedConfigs.map(config => this.fetchFeed(config)));
			return results.map((result, index) => this.settledToResult(result, feedConfigs[index]));
		}

		/**
		 * Clear the cache (memory only, keeps IndexedDB).
		 */
		clearCache() {
			this.cache.clear();
		}

		/**
		 * Clear all cache including IndexedDB, and remove the legacy
		 * "ivy-rss-cache" localStorage entry. Never rejects.
		 * @returns {Promise<void>}
		 */
		async clearAllCache() {
			this.cache.clear();
			try {
				if (this.db) {
					await this.db.feeds.clear();
					console.log("🗑️ IndexedDB cache cleared");
				}
				// Also clean up old localStorage if it exists
				localStorage.removeItem("ivy-rss-cache");
			} catch (e) {
				console.warn("Failed to clear cache:", e);
			}
		}
	}

	// Expose the class on the global `window` object so other scripts in the
	// app can reach it as `window.RSSParser`.
	window.RSSParser = RSSParser;