ivy-rss-hub

Running

File size: 15,511 Bytes

/* ============================================

   IVY'S RSS HUB — RSS Parser Module

   Fetches and parses RSS/Atom feeds

   Uses Dexie (IndexedDB) for caching 🌿

   ============================================ */

/**

 * RSSParser - Handles fetching and parsing RSS/Atom feeds

 */
class RSSParser {
    /**
     * Set up in-memory state and start opening the IndexedDB cache.
     *
     * Reads the proxy list from window.FeedsConfig.CORS_PROXIES; each entry is
     * called as a function that maps a feed URL to a proxied URL.
     */
    constructor() {
        this.corsProxies = window.FeedsConfig.CORS_PROXIES;
        this.currentProxyIndex = 0;
        this.cache = new Map(); // Memory cache for current session
        this.cacheTimeout = 5 * 60 * 1000; // Entries younger than 5 minutes are served from memory
        this.maxArticlesPerFeed = 25; // Only the first 25 items/entries of a feed are parsed
        this.db = null;
        this.dbReady = false;

        // Not awaited on purpose (constructors cannot be async); initDatabase
        // handles its own errors, so this promise never rejects.
        this.initDatabase();
    }

    /**
     * Open the Dexie database used for persistent caching and preload it
     * into the memory cache. Failures are logged and leave dbReady false.
     *
     * @returns {Promise<void>}
     */
    async initDatabase() {
        try {
            this.db = new Dexie("IvyRSSHubCache");
            // NOTE(review): this schema string also indexes `data` (the full XML
            // text). Dexie schemas normally list only the primary key and the
            // properties that need an index; dropping it requires a version
            // bump, so it is left unchanged here.
            this.db.version(1).stores({
                feeds: "url, data, timestamp"
            });

            // Load cached feeds into memory
            await this.loadPersistentCache();
            this.dbReady = true;
            console.log("📦 Dexie database ready");
        } catch (e) {
            console.warn("Failed to init Dexie:", e);
            this.dbReady = false;
        }
    }

    /**
     * Copy persisted feeds younger than two hours into the memory cache and
     * delete the older ones from IndexedDB.
     *
     * An in-memory entry that is at least as new as the persisted one is kept,
     * so a feed fetched while the database was still opening is not replaced
     * by older data. Valid rows are loaded before any deletion is attempted so
     * a failing delete cannot prevent them from being loaded.
     *
     * @returns {Promise<void>}
     */
    async loadPersistentCache() {
        try {
            if (!this.db) return;

            const maxAge = 2 * 60 * 60 * 1000; // 2 hours max
            const now = Date.now();
            const cached = await this.db.feeds.toArray();

            const expiredUrls = [];
            let loaded = 0;
            for (const item of cached) {
                if (now - item.timestamp < maxAge) {
                    const existing = this.cache.get(item.url);
                    if (existing && existing.timestamp >= item.timestamp) {
                        continue; // memory already holds this or newer data
                    }
                    this.cache.set(item.url, {
                        data: item.data,
                        timestamp: item.timestamp
                    });
                    loaded++;
                } else {
                    expiredUrls.push(item.url);
                }
            }

            if (loaded > 0) {
                console.log(`📦 Loaded ${loaded} cached feeds from IndexedDB`);
            }

            // Delete old entries
            for (const url of expiredUrls) {
                await this.db.feeds.delete(url);
            }
        } catch (e) {
            console.warn("Failed to load cache:", e);
        }
    }

    /**
     * Save a feed to IndexedDB. Does nothing until the database is ready;
     * write errors are logged, never thrown.
     *
     * @param {string} url - Feed URL (primary key)
     * @param {string} data - Raw XML text
     * @param {number} timestamp - Fetch time in milliseconds since the epoch
     * @returns {Promise<void>}
     */
    async saveToPersistentCache(url, data, timestamp) {
        try {
            if (!this.db || !this.dbReady) return;

            await this.db.feeds.put({ url, data, timestamp });
        } catch (e) {
            console.warn("Failed to save to cache:", e);
        }
    }

    /**
     * Fetch a feed through the configured CORS proxies, trying each in turn.
     *
     * A memory-cache entry younger than cacheTimeout is returned without any
     * network request. The rotation starts at the proxy that last succeeded.
     *
     * @param {string} feedUrl - The RSS feed URL
     * @returns {Promise<{data: string, fromCache: boolean}>} Raw XML text and
     *     whether it came from the memory cache
     * @throws {Error} "All proxies failed: ..." when no proxy returns usable XML
     */
    async fetchWithProxy(feedUrl) {
        // Check cache first
        const cached = this.cache.get(feedUrl);
        if (cached && Date.now() - cached.timestamp < this.cacheTimeout) {
            return { data: cached.data, fromCache: true };
        }

        const proxies = this.corsProxies ?? [];
        if (proxies.length === 0) {
            throw new Error("All proxies failed: no CORS proxies configured");
        }

        // Snapshot the starting proxy: other fetches run concurrently and may
        // change currentProxyIndex while this loop is awaiting, which would
        // otherwise make the rotation skip or repeat proxies.
        const startIndex = this.currentProxyIndex;
        let lastError = null;

        for (let i = 0; i < proxies.length; i++) {
            const proxyIndex = (startIndex + i) % proxies.length;

            try {
                // Built inside the try so a throwing builder only skips this proxy
                const proxyUrl = proxies[proxyIndex](feedUrl);

                const response = await fetch(proxyUrl, {
                    headers: {
                        Accept: "application/rss+xml, application/xml, text/xml, application/atom+xml"
                    }
                });

                if (!response.ok) {
                    throw new Error(`HTTP ${response.status}`);
                }

                const text = await response.text();

                // Cheap sanity check that the body looks like XML
                if (!text.includes("<?xml") && !text.includes("<rss") && !text.includes("<feed")) {
                    throw new Error("Invalid XML response");
                }

                const timestamp = Date.now();

                // Cache successful response in memory
                this.cache.set(feedUrl, { data: text, timestamp });

                // Persist in the background; saveToPersistentCache never rejects
                this.saveToPersistentCache(feedUrl, text, timestamp);

                // Remember working proxy
                this.currentProxyIndex = proxyIndex;

                return { data: text, fromCache: false };
            } catch (error) {
                lastError = error;
                // Use debug level for fallback attempts (less noise in console)
                console.debug(`[RSS] Proxy ${proxyIndex} failed for ${feedUrl}:`, error.message);
            }
        }

        throw new Error(`All proxies failed: ${lastError?.message}`, { cause: lastError });
    }

    /**
     * Parse RSS/Atom XML to structured data.
     *
     * @param {string} xmlText - Raw XML content
     * @returns {Object} Parsed feed: { title, description, link, items }
     * @throws {Error} When the XML cannot be parsed or has no channel/feed element
     */
    parseXML(xmlText) {
        const doc = new DOMParser().parseFromString(xmlText, "text/xml");

        // DOMParser reports XML errors through a <parsererror> element
        if (doc.querySelector("parsererror")) {
            throw new Error("XML parsing failed");
        }

        // A <feed> element means Atom; everything else is treated as RSS
        const isAtom = doc.querySelector("feed") !== null;
        return isAtom ? this.parseAtom(doc) : this.parseRSS(doc);
    }

    /**
     * Parse an RSS document (anything with a <channel> element).
     *
     * Channel and item fields are read from direct children first, so
     * namespaced or nested look-alikes (<atom:link>, <image><title>,
     * <media:title>) are not mistaken for the real field.
     *
     * @param {Document} doc - Parsed XML document
     * @returns {Object} { title, description, link, items }
     * @throws {Error} When there is no <channel> element
     */
    parseRSS(doc) {
        const channel = doc.querySelector("channel");
        if (!channel) {
            throw new Error("Invalid RSS: no channel element");
        }

        // Limit to maxArticlesPerFeed for performance
        const items = Array.from(doc.querySelectorAll("item"))
            .slice(0, this.maxArticlesPerFeed)
            .map(item => ({
                title: this.getChildText(item, "title"),
                link: this.getChildText(item, "link"),
                description: this.cleanDescription(this.getChildText(item, "description")),
                pubDate: this.parseDate(this.getChildText(item, "pubDate")),
                author: this.getChildText(item, "author") || this.getDCCreator(item)
            }));

        return {
            title: this.getChildText(channel, "title"),
            description: this.getChildText(channel, "description"),
            link: this.getChildText(channel, "link"),
            // Items without both a title and a link are dropped
            items: items.filter(item => item.title && item.link)
        };
    }

    /**
     * Get dc:creator content, trying the prefixed and the bare tag name.
     *
     * @param {Element} item - RSS <item> element
     * @returns {string} Trimmed creator text, or "" when absent
     */
    getDCCreator(item) {
        const creator = item.getElementsByTagName("dc:creator")[0] || item.getElementsByTagName("creator")[0];
        return creator?.textContent?.trim() || "";
    }

    /**
     * Parse Atom format (supports arXiv API format).
     *
     * @param {Document} doc - Parsed XML document
     * @returns {Object} { title, description, link, items }
     * @throws {Error} When there is no <feed> element
     */
    parseAtom(doc) {
        const feed = doc.querySelector("feed");
        if (!feed) {
            throw new Error("Invalid Atom: no feed element");
        }

        // Limit to maxArticlesPerFeed for performance
        const items = Array.from(doc.querySelectorAll("entry"))
            .slice(0, this.maxArticlesPerFeed)
            .map(entry => {
                // Prefer the entry's own <link href>; fall back to any nested
                // link, then to link text, as the previous implementation did.
                const link =
                    this.getAtomLink(entry) ||
                    entry.querySelector("link")?.getAttribute("href") ||
                    this.getTextContent(entry, "link");

                // Handle multiple authors (common in arXiv API):
                // join first 3 authors, add "et al." if more
                const names = Array.from(entry.querySelectorAll("author name"))
                    .map(el => el.textContent?.trim())
                    .filter(Boolean);
                let author = names.slice(0, 3).join(", ");
                if (names.length > 3) author += " et al.";

                return {
                    title: this.getChildText(entry, "title"),
                    link,
                    description: this.cleanDescription(
                        this.getChildText(entry, "summary") || this.getChildText(entry, "content")
                    ),
                    pubDate: this.parseDate(
                        this.getChildText(entry, "published") || this.getChildText(entry, "updated")
                    ),
                    author
                };
            });

        return {
            title: this.getChildText(feed, "title"),
            description: this.getChildText(feed, "subtitle"),
            // Only the feed's own links count here; a descendant search would
            // return the first entry's link when the feed has no alternate link.
            link: this.getAtomLink(feed),
            items: items.filter(item => item.title && item.link)
        };
    }

    /**
     * Helper: pick the href of the best direct-child <link> of an Atom element.
     *
     * Order of preference: rel="alternate", then a link with no rel (which
     * Atom defines as alternate), then the first link of any kind.
     *
     * @param {Element} parent - <feed> or <entry> element
     * @returns {string} The href, or "" when there is no usable direct link
     */
    getAtomLink(parent) {
        const links = Array.from(parent.children ?? []).filter(el => el.localName === "link");
        const chosen =
            links.find(el => el.getAttribute("rel") === "alternate") ||
            links.find(el => !el.getAttribute("rel")) ||
            links[0];
        return chosen?.getAttribute("href") || "";
    }

    /**
     * Helper: get the trimmed text of a direct child element.
     *
     * Un-prefixed children (tagName equals the name) are tried before
     * namespaced ones with the same local name. When no direct child yields
     * text, falls back to the descendant search done by getTextContent.
     *
     * @param {Element} parent - Element whose children are searched
     * @param {string} name - Element name without namespace prefix
     * @returns {string} Trimmed text, or "" when nothing matches
     */
    getChildText(parent, name) {
        const children = Array.from(parent.children ?? []);
        const candidates = [
            ...children.filter(el => el.tagName === name),
            ...children.filter(el => el.tagName !== name && el.localName === name)
        ];

        for (const el of candidates) {
            const text = el.textContent?.trim();
            if (text) return text;
        }

        return this.getTextContent(parent, name);
    }

    /**
     * Helper: get the trimmed text of the first descendant matching a selector.
     *
     * @param {Element|Document} parent - Node to search in
     * @param {string} selector - CSS selector
     * @returns {string} Trimmed text, or "" when nothing matches
     */
    getTextContent(parent, selector) {
        const element = parent.querySelector(selector);
        return element?.textContent?.trim() || "";
    }

    /**
     * Helper: parse a date string to a Date object.
     *
     * @param {string} dateStr - Date text taken from the feed
     * @returns {Date|null} The date, or null when empty or unparseable
     */
    parseDate(dateStr) {
        if (!dateStr) return null;

        try {
            const date = new Date(dateStr);
            return Number.isNaN(date.getTime()) ? null : date;
        } catch {
            return null;
        }
    }

    /**
     * Helper: reduce an HTML description to a short plain-text snippet.
     *
     * Markup is removed with DOMParser (which does not execute scripts), or
     * with a tag-stripping regex when DOMParser is unavailable or throws.
     * Whitespace is collapsed and the result is capped at 200 characters plus
     * "...". Markup with no text (e.g. a lone <img>) yields "" rather than
     * the raw markup.
     *
     * @param {string} html - Description as found in the feed
     * @returns {string} Plain-text snippet
     */
    cleanDescription(html) {
        if (!html) return "";

        let text;
        try {
            const doc = new DOMParser().parseFromString(html, "text/html");
            text = doc.body.textContent || "";
        } catch {
            // Ultimate fallback: just strip tags with regex
            text = html.replace(/<[^>]*>/g, "");
        }

        // Trim and normalize whitespace
        text = text.trim().replace(/\s+/g, " ");

        // Limit length
        return text.length > 200 ? `${text.substring(0, 200)}...` : text;
    }

    /**
     * Main method: fetch and parse a feed. Never rejects; failures are
     * reported through the returned object's status and error fields.
     *
     * @param {Object} feedConfig - Feed configuration object (url and name are read)
     * @returns {Promise<Object>} feedConfig plus feed, status, lastFetched and
     *     either fromCache (success) or error (failure)
     */
    async fetchFeed(feedConfig) {
        try {
            const { data: xmlText, fromCache } = await this.fetchWithProxy(feedConfig.url);
            const parsed = this.parseXML(xmlText);

            return {
                ...feedConfig,
                feed: parsed,
                status: "success",
                fromCache,
                lastFetched: new Date()
            };
        } catch (error) {
            // Optional chaining keeps a null/undefined config from throwing
            // again inside the handler.
            console.error(`Failed to fetch ${feedConfig?.name}:`, error);

            return {
                ...feedConfig,
                feed: null,
                status: "error",
                error: error.message,
                lastFetched: new Date()
            };
        }
    }

    /**
     * Helper: build the error result used when a fetchFeed promise rejects.
     *
     * @param {Object} feedConfig - Configuration of the feed that failed
     * @param {*} reason - Rejection reason
     * @returns {Object} Result in the same shape fetchFeed returns on failure
     */
    buildErrorResult(feedConfig, reason) {
        return {
            ...feedConfig,
            feed: null,
            status: "error",
            error: reason?.message || "Unknown error",
            lastFetched: new Date()
        };
    }

    /**
     * Fetch multiple feeds in batches, reporting progress after each batch.
     *
     * Feeds inside a batch are fetched in parallel and batches are separated
     * by a pause. The default pause is 1 second, which is shorter than the
     * 3 second spacing the original comments attribute to arXiv; pass
     * { batchDelay: 3000 } when that spacing is needed.
     *
     * @param {Array} feedConfigs - Array of feed configurations
     * @param {Function} [onProgress] - Called after each batch as
     *     onProgress(resultsSoFar, processedCount, totalCount)
     * @param {Object} [options]
     * @param {number} [options.batchSize=5] - Feeds fetched in parallel (minimum 1)
     * @param {number} [options.batchDelay=1000] - Pause between batches in ms; 0 disables it
     * @returns {Promise<Array>} One result per feed, in input order
     */
    async fetchAllFeedsProgressive(feedConfigs, onProgress, options) {
        const { batchSize = 5, batchDelay = 1000 } = options ?? {};
        // Guard against 0/NaN/negative sizes, which would never advance the loop
        const size = Math.max(1, Math.floor(Number(batchSize)) || 1);
        const results = [];

        for (let i = 0; i < feedConfigs.length; i += size) {
            const batch = feedConfigs.slice(i, i + size);
            const settled = await Promise.allSettled(batch.map(config => this.fetchFeed(config)));

            settled.forEach((outcome, idx) => {
                results.push(
                    outcome.status === "fulfilled"
                        ? outcome.value
                        : this.buildErrorResult(batch[idx], outcome.reason)
                );
            });

            // Call progress callback
            if (onProgress) {
                onProgress(results, i + batch.length, feedConfigs.length);
            }

            // Only delay if there are more batches to process
            if (i + size < feedConfigs.length && Number(batchDelay) > 0) {
                await new Promise(resolve => setTimeout(resolve, batchDelay));
            }
        }

        return results;
    }

    /**
     * Fetch multiple feeds in parallel (legacy).
     *
     * @param {Array} feedConfigs - Array of feed configurations
     * @returns {Promise<Array>} One result per feed, in input order
     */
    async fetchAllFeeds(feedConfigs) {
        const settled = await Promise.allSettled(feedConfigs.map(config => this.fetchFeed(config)));

        return settled.map((outcome, index) =>
            outcome.status === "fulfilled"
                ? outcome.value
                : this.buildErrorResult(feedConfigs[index], outcome.reason)
        );
    }

    /**
     * Clear the cache (memory only, keeps IndexedDB).
     */
    clearCache() {
        this.cache.clear();
    }

    /**
     * Clear all cache including IndexedDB, and remove the "ivy-rss-cache"
     * localStorage key. Errors are logged, never thrown.
     *
     * @returns {Promise<void>}
     */
    async clearAllCache() {
        this.cache.clear();
        try {
            if (this.db) {
                await this.db.feeds.clear();
                console.log("🗑️ IndexedDB cache cleared");
            }
            // Also clean up old localStorage if it exists
            localStorage.removeItem("ivy-rss-cache");
        } catch (e) {
            console.warn("Failed to clear cache:", e);
        }
    }
}

// Publish the class as window.RSSParser so the rest of the app can reach it
Object.assign(window, { RSSParser });