/* ============================================
   IVY'S RSS HUB — RSS Parser Module
   Fetches and parses RSS/Atom feeds
   Uses Dexie (IndexedDB) for caching 🌿
   ============================================ */
/**
 * RSSParser — fetches RSS/Atom feeds through a list of CORS proxies and
 * parses them into a normalized `{ title, description, link, items }` shape.
 *
 * Caching is two-tier:
 *  - an in-memory Map for the current session (5-minute freshness window)
 *  - a Dexie/IndexedDB table for persistence across reloads (2-hour max age)
 */
class RSSParser {
  constructor() {
    // Each entry in CORS_PROXIES is a function: (feedUrl) => proxiedUrl.
    this.corsProxies = window.FeedsConfig.CORS_PROXIES;
    this.currentProxyIndex = 0; // last proxy known to work; tried first next time
    this.cache = new Map(); // memory cache for the current session
    this.cacheTimeout = 5 * 60 * 1000; // serve memory-cached data for 5 minutes
    this.maxArticlesPerFeed = 25; // limit parsing to 25 articles per feed
    this.fetchTimeout = 15 * 1000; // abort a proxy request that hangs past 15s
    this.dbReady = false;
    // Fire-and-forget: DB setup runs in the background and flips dbReady
    // when finished; every DB consumer guards on that flag.
    this.initDatabase();
  }

  /**
   * Initialize the Dexie (IndexedDB) database used for persistent caching
   * and warm the in-memory cache from it.
   */
  async initDatabase() {
    try {
      this.db = new Dexie("IvyRSSHubCache");
      this.db.version(1).stores({
        feeds: "url, data, timestamp" // primary key: url
      });
      // Load cached feeds into memory before declaring the DB ready.
      await this.loadPersistentCache();
      this.dbReady = true;
      console.log("📦 Dexie database ready");
    } catch (e) {
      // Persistent caching is best-effort; the app still works without it.
      console.warn("Failed to init Dexie:", e);
      this.dbReady = false;
    }
  }

  /**
   * Load cached feeds from IndexedDB into the in-memory cache,
   * discarding (and deleting) entries older than two hours.
   */
  async loadPersistentCache() {
    try {
      if (!this.db) return;
      const maxAge = 2 * 60 * 60 * 1000; // 2 hours max
      const now = Date.now();
      const cached = await this.db.feeds.toArray();
      const staleUrls = [];
      let loaded = 0;
      for (const item of cached) {
        if (now - item.timestamp < maxAge) {
          this.cache.set(item.url, {
            data: item.data,
            timestamp: item.timestamp
          });
          loaded++;
        } else {
          staleUrls.push(item.url);
        }
      }
      // One bulk delete instead of an awaited delete per stale row.
      if (staleUrls.length > 0) {
        await this.db.feeds.bulkDelete(staleUrls);
      }
      if (loaded > 0) {
        console.log(`📦 Loaded ${loaded} cached feeds from IndexedDB`);
      }
    } catch (e) {
      console.warn("Failed to load cache:", e);
    }
  }

  /**
   * Persist one feed's raw XML to IndexedDB. No-op until the DB is ready.
   * @param {string} url - Feed URL (primary key)
   * @param {string} data - Raw XML text
   * @param {number} timestamp - Fetch time (ms since epoch)
   */
  async saveToPersistentCache(url, data, timestamp) {
    try {
      if (!this.db || !this.dbReady) return;
      await this.db.feeds.put({
        url: url,
        data: data,
        timestamp: timestamp
      });
    } catch (e) {
      console.warn("Failed to save to cache:", e);
    }
  }

  /**
   * Fetch a feed's raw XML, trying each CORS proxy in turn starting from
   * the last one that worked. Serves from the memory cache while fresh.
   * @param {string} feedUrl - The RSS feed URL
   * @returns {Promise<{data: string, fromCache: boolean}>} raw XML + cache flag
   * @throws {Error} when every proxy fails
   */
  async fetchWithProxy(feedUrl) {
    // Check the memory cache first.
    const cached = this.cache.get(feedUrl);
    if (cached && Date.now() - cached.timestamp < this.cacheTimeout) {
      return { data: cached.data, fromCache: true };
    }
    let lastError = null;
    // Try each proxy in sequence, rotating from the last known-good one.
    for (let i = 0; i < this.corsProxies.length; i++) {
      const proxyIndex = (this.currentProxyIndex + i) % this.corsProxies.length;
      const proxyUrl = this.corsProxies[proxyIndex](feedUrl);
      // Abort requests that hang so a dead proxy can't stall the rotation.
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), this.fetchTimeout);
      try {
        const response = await fetch(proxyUrl, {
          signal: controller.signal,
          headers: {
            Accept: "application/rss+xml, application/xml, text/xml, application/atom+xml"
          }
        });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}`);
        }
        const text = await response.text();
        // Cheap sanity check that we got XML, not a proxy error page.
        if (!text.includes("<?xml") && !text.includes("<rss") && !text.includes("<feed")) {
          throw new Error("Invalid XML response");
        }
        const timestamp = Date.now();
        // Cache the successful response in memory.
        this.cache.set(feedUrl, {
          data: text,
          timestamp: timestamp
        });
        // Persist asynchronously; deliberately not awaited.
        void this.saveToPersistentCache(feedUrl, text, timestamp);
        // Remember the working proxy for subsequent requests.
        this.currentProxyIndex = proxyIndex;
        return { data: text, fromCache: false };
      } catch (error) {
        lastError = error;
        // Use debug level for fallback attempts (less noise in console)
        console.debug(`[RSS] Proxy ${proxyIndex} failed for ${feedUrl}:`, error.message);
      } finally {
        clearTimeout(timer);
      }
    }
    throw new Error(`All proxies failed: ${lastError?.message}`);
  }

  /**
   * Parse RSS/Atom XML into structured data.
   * @param {string} xmlText - Raw XML content
   * @returns {Object} - Parsed feed data
   * @throws {Error} on malformed XML
   */
  parseXML(xmlText) {
    const parser = new DOMParser();
    const doc = parser.parseFromString(xmlText, "text/xml");
    // DOMParser reports XML errors via a <parsererror> node, not by throwing.
    const parseError = doc.querySelector("parsererror");
    if (parseError) {
      throw new Error("XML parsing failed");
    }
    // Detect feed type from the ROOT element. The previous check,
    // querySelector("feed"), could false-positive on an RSS document that
    // merely contains a nested <feed> element somewhere.
    const rootName = doc.documentElement?.localName?.toLowerCase() || "";
    if (rootName === "feed") {
      return this.parseAtom(doc);
    }
    return this.parseRSS(doc);
  }

  /**
   * Parse an RSS 2.0 document (RSS 1.0/RDF layouts also work, since <item>
   * elements are queried on the whole document — in RSS 1.0 they sit
   * outside <channel>).
   */
  parseRSS(doc) {
    const channel = doc.querySelector("channel");
    if (!channel) {
      throw new Error("Invalid RSS: no channel element");
    }
    // Limit to maxArticlesPerFeed for performance.
    const items = Array.from(doc.querySelectorAll("item"))
      .slice(0, this.maxArticlesPerFeed)
      .map(item => ({
        title: this.getTextContent(item, "title"),
        link: this.getTextContent(item, "link"),
        description: this.cleanDescription(this.getTextContent(item, "description")),
        pubDate: this.parseDate(this.getTextContent(item, "pubDate")),
        author: this.getTextContent(item, "author") || this.getDCCreator(item)
      }));
    return {
      title: this.getTextContent(channel, "title"),
      description: this.getTextContent(channel, "description"),
      link: this.getTextContent(channel, "link"),
      // Drop items that lack the minimum needed to render a headline.
      items: items.filter(item => item.title && item.link)
    };
  }

  /**
   * Get dc:creator content. querySelector cannot match namespaced tag names,
   * so getElementsByTagName is used with both prefixed and bare spellings.
   */
  getDCCreator(item) {
    const creator = item.getElementsByTagName("dc:creator")[0] || item.getElementsByTagName("creator")[0];
    return creator?.textContent?.trim() || "";
  }

  /**
   * Parse an Atom document (supports the arXiv API format, which uses
   * multiple <author><name> entries per <entry>).
   */
  parseAtom(doc) {
    const feed = doc.querySelector("feed");
    if (!feed) {
      throw new Error("Invalid Atom: no feed element");
    }
    // Limit to maxArticlesPerFeed for performance.
    const items = Array.from(doc.querySelectorAll("entry"))
      .slice(0, this.maxArticlesPerFeed)
      .map(entry => {
        // Atom links are usually <link href="..."> attributes; prefer
        // rel="alternate", then any <link>, then element text as last resort.
        const linkElement = entry.querySelector('link[rel="alternate"]') || entry.querySelector("link");
        const link = linkElement?.getAttribute("href") || this.getTextContent(entry, "link");
        // Join up to three author names; append "et al." beyond that.
        const authorElements = entry.querySelectorAll("author name");
        let author = "";
        if (authorElements.length > 0) {
          const names = Array.from(authorElements)
            .map(el => el.textContent?.trim())
            .filter(Boolean);
          author = names.slice(0, 3).join(", ");
          if (names.length > 3) author += " et al.";
        }
        return {
          title: this.getTextContent(entry, "title"),
          link: link,
          description: this.cleanDescription(
            this.getTextContent(entry, "summary") || this.getTextContent(entry, "content")
          ),
          pubDate: this.parseDate(
            this.getTextContent(entry, "published") || this.getTextContent(entry, "updated")
          ),
          author: author
        };
      });
    const titleLink = feed.querySelector('link[rel="alternate"]') || feed.querySelector("link");
    return {
      title: this.getTextContent(feed, "title"),
      description: this.getTextContent(feed, "subtitle"),
      link: titleLink?.getAttribute("href") || "",
      items: items.filter(item => item.title && item.link)
    };
  }

  /**
   * Helper: trimmed text content of the first child matching `selector`,
   * or "" when absent.
   */
  getTextContent(parent, selector) {
    const element = parent.querySelector(selector);
    return element?.textContent?.trim() || "";
  }

  /**
   * Helper: parse a date string into a Date, or null when missing/invalid.
   * (new Date never throws; invalid input yields an invalid Date instead.)
   */
  parseDate(dateStr) {
    if (!dateStr) return null;
    const date = new Date(dateStr);
    return Number.isNaN(date.getTime()) ? null : date;
  }

  /**
   * Helper: strip HTML from a description and truncate to 200 characters.
   * DOMParser never executes scripts, so this is safe for untrusted markup.
   */
  cleanDescription(html) {
    if (!html) return "";
    try {
      const doc = new DOMParser().parseFromString(html, "text/html");
      let text = doc.body?.textContent || "";
      // Fallback for malformed markup that parses to an empty body.
      // (The old fallback set the raw HTML via textContent and read it
      // straight back — a no-op that kept the tags.)
      if (!text) {
        text = html.replace(/<[^>]*>/g, "");
      }
      // Trim and collapse runs of whitespace.
      text = text.trim().replace(/\s+/g, " ");
      // Limit length for display.
      if (text.length > 200) {
        text = text.substring(0, 200) + "...";
      }
      return text;
    } catch {
      // Ultimate fallback (e.g. no DOMParser): strip tags with a regex.
      return html.replace(/<[^>]*>/g, "").substring(0, 200);
    }
  }

  /**
   * Main method: fetch and parse a single feed.
   * Never rejects — failures are reported via `status: "error"`.
   * @param {Object} feedConfig - Feed configuration object
   * @returns {Promise<Object>} - feedConfig plus { feed, status, lastFetched, ... }
   */
  async fetchFeed(feedConfig) {
    try {
      const { data: xmlText, fromCache } = await this.fetchWithProxy(feedConfig.url);
      const parsed = this.parseXML(xmlText);
      return {
        ...feedConfig,
        feed: parsed,
        status: "success",
        fromCache: fromCache,
        lastFetched: new Date()
      };
    } catch (error) {
      console.error(`Failed to fetch ${feedConfig.name}:`, error);
      return {
        ...feedConfig,
        feed: null,
        status: "error",
        error: error.message,
        lastFetched: new Date()
      };
    }
  }

  /**
   * Fetch multiple feeds in batches with a progressive callback, pausing
   * between batches to respect API rate limits (e.g. arXiv's 3-second rule).
   * @param {Array} feedConfigs - Array of feed configurations
   * @param {Function} onProgress - Called as (results, done, total) after each batch
   * @returns {Promise<Array>} - Array of parsed feeds
   */
  async fetchAllFeedsProgressive(feedConfigs, onProgress) {
    const results = [];
    const batchSize = 5; // fetch 5 at a time for balance
    const batchDelay = 1000; // 1s between batches to be nice to APIs
    for (let i = 0; i < feedConfigs.length; i += batchSize) {
      const batch = feedConfigs.slice(i, i + batchSize);
      const batchResults = await Promise.allSettled(batch.map(config => this.fetchFeed(config)));
      batchResults.forEach((result, idx) => {
        if (result.status === "fulfilled") {
          results.push(result.value);
        } else {
          // fetchFeed shouldn't reject, but guard anyway.
          results.push({
            ...batch[idx],
            feed: null,
            status: "error",
            error: result.reason?.message || "Unknown error"
          });
        }
      });
      if (onProgress) {
        onProgress(results, i + batch.length, feedConfigs.length);
      }
      // Only delay when more batches remain.
      if (i + batchSize < feedConfigs.length) {
        await new Promise(resolve => setTimeout(resolve, batchDelay));
      }
    }
    return results;
  }

  /**
   * Fetch multiple feeds fully in parallel (legacy, no batching/rate limit).
   * @param {Array} feedConfigs - Array of feed configurations
   * @returns {Promise<Array>} - Array of parsed feeds
   */
  async fetchAllFeeds(feedConfigs) {
    const results = await Promise.allSettled(feedConfigs.map(config => this.fetchFeed(config)));
    return results.map((result, index) => {
      if (result.status === "fulfilled") {
        return result.value;
      } else {
        return {
          ...feedConfigs[index],
          feed: null,
          status: "error",
          error: result.reason?.message || "Unknown error"
        };
      }
    });
  }

  /**
   * Clear the in-memory cache only (IndexedDB is kept).
   */
  clearCache() {
    this.cache.clear();
  }

  /**
   * Clear all caches, including IndexedDB and a legacy localStorage key.
   */
  async clearAllCache() {
    this.cache.clear();
    try {
      if (this.db) {
        await this.db.feeds.clear();
        console.log("🗑️ IndexedDB cache cleared");
      }
      // Also clean up the old localStorage cache if present.
      localStorage.removeItem("ivy-rss-cache");
    } catch (e) {
      console.warn("Failed to clear cache:", e);
    }
  }
}
// Export as a browser global for use in the app.
// NOTE(review): the constructor reads window.FeedsConfig, so the feeds
// config script must be loaded before RSSParser is instantiated — confirm
// script ordering in the host page.
window.RSSParser = RSSParser;