File size: 15,511 Bytes
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790eee5
98cd950
 
 
 
 
 
 
 
 
 
 
790eee5
 
 
 
 
 
 
 
 
 
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790eee5
 
 
 
 
 
 
 
 
 
 
 
98cd950
 
 
 
 
 
 
 
 
790eee5
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790eee5
98cd950
 
 
 
790eee5
 
 
 
 
 
 
 
 
 
 
98cd950
790eee5
 
98cd950
790eee5
 
 
 
98cd950
790eee5
 
 
 
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790eee5
98cd950
 
 
 
 
 
 
790eee5
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790eee5
 
 
 
 
 
98cd950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/* ============================================

   IVY'S RSS HUB — RSS Parser Module

   Fetches and parses RSS/Atom feeds

   Uses Dexie (IndexedDB) for caching 🌿

   ============================================ */

/**

 * RSSParser - Handles fetching and parsing RSS/Atom feeds

 */
class RSSParser {
    /**
     * Sets up proxy rotation state, the in-memory cache and tuning knobs,
     * then kicks off (without blocking) the IndexedDB initialisation.
     */
    constructor() {
        this.corsProxies = window.FeedsConfig.CORS_PROXIES;
        this.currentProxyIndex = 0;
        this.cache = new Map(); // Memory cache for current session: url -> { data, timestamp }
        this.cacheTimeout = 5 * 60 * 1000; // Entries younger than 5 minutes are served without a network call
        this.staleCacheMaxAge = 2 * 60 * 60 * 1000; // Entries up to 2 hours old are kept as a fallback
        this.fetchTimeout = 15 * 1000; // Abort a single proxy request after 15 seconds
        this.maxArticlesPerFeed = 25; // Limit parsing to 25 articles per feed
        this.dbReady = false;

        // Keep the promise so later writes can wait for initialisation
        // instead of being dropped. initDatabase() never rejects.
        this.dbInit = this.initDatabase();
    }

    /**
     * Initialize Dexie database for persistent caching.
     * Failures are logged and leave `dbReady` false; they never throw.
     * @returns {Promise<void>}
     */
    async initDatabase() {
        try {
            this.db = new Dexie("IvyRSSHubCache");
            // NOTE(review): `data` holds the full feed XML and is declared as an
            // index here; consider dropping it from the schema in a future
            // version bump. Left unchanged to stay compatible with existing DBs.
            this.db.version(1).stores({
                feeds: "url, data, timestamp"
            });

            // Load cached feeds into memory
            await this.loadPersistentCache();
            this.dbReady = true;
            console.log("📦 Dexie database ready");
        } catch (e) {
            console.warn("Failed to init Dexie:", e);
            this.dbReady = false;
        }
    }

    /**
     * Load cache from IndexedDB into memory.
     * Entries older than `staleCacheMaxAge` are purged from IndexedDB.
     * An in-memory entry that is newer than the stored one is left alone,
     * because a feed may have been fetched while the database was opening.
     * @returns {Promise<void>}
     */
    async loadPersistentCache() {
        try {
            if (!this.db) return;

            const now = Date.now();
            const stored = await this.db.feeds.toArray();
            const expiredUrls = [];
            let loaded = 0;

            for (const item of stored) {
                if (now - item.timestamp < this.staleCacheMaxAge) {
                    const existing = this.cache.get(item.url);
                    if (!existing || existing.timestamp < item.timestamp) {
                        this.cache.set(item.url, {
                            data: item.data,
                            timestamp: item.timestamp
                        });
                        loaded++;
                    }
                } else {
                    expiredUrls.push(item.url);
                }
            }

            // Delete old entries in a single transaction
            if (expiredUrls.length > 0) {
                await this.db.feeds.bulkDelete(expiredUrls);
            }

            if (loaded > 0) {
                console.log(`📦 Loaded ${loaded} cached feeds from IndexedDB`);
            }
        } catch (e) {
            console.warn("Failed to load cache:", e);
        }
    }

    /**
     * Save a feed to IndexedDB (best effort; errors are logged, not thrown).
     * Waits for database initialisation so early fetches are still persisted.
     * @param {string} url - Feed URL, used as the primary key
     * @param {string} data - Raw XML text
     * @param {number} timestamp - Fetch time in ms since the epoch
     * @returns {Promise<void>}
     */
    async saveToPersistentCache(url, data, timestamp) {
        try {
            await this.dbInit;
            if (!this.db || !this.dbReady) return;

            await this.db.feeds.put({
                url: url,
                data: data,
                timestamp: timestamp
            });
        } catch (e) {
            console.warn("Failed to save to cache:", e);
        }
    }

    /**
     * Download a URL as text, aborting after `fetchTimeout` milliseconds.
     * @private
     * @param {string} url - Fully built (proxied) URL
     * @returns {Promise<string>} Response body
     * @throws {Error} On non-2xx status, network failure or timeout
     */
    async fetchText(url) {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), this.fetchTimeout);

        try {
            const response = await fetch(url, {
                headers: {
                    Accept: "application/rss+xml, application/xml, text/xml, application/atom+xml"
                },
                signal: controller.signal
            });

            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }

            // The timer stays armed while the body streams in
            return await response.text();
        } catch (error) {
            if (controller.signal.aborted) {
                throw new Error(`Timeout after ${this.fetchTimeout}ms`, { cause: error });
            }
            throw error;
        } finally {
            clearTimeout(timer);
        }
    }

    /**
     * Fetch feed with CORS proxy fallback.
     *
     * Order of preference: fresh memory cache, then each proxy starting with
     * the last one that worked, then a stale cached copy (younger than
     * `staleCacheMaxAge`) if every proxy failed.
     *
     * @param {string} feedUrl - The RSS feed URL
     * @returns {Promise<{data: string, fromCache: boolean, stale?: boolean}>}
     *          Raw XML plus where it came from; `stale` is only set when an
     *          expired cache entry was served because the network failed
     * @throws {Error} When all proxies fail and no usable cached copy exists
     */
    async fetchWithProxy(feedUrl) {
        // Check cache first
        const cached = this.cache.get(feedUrl);
        if (cached && Date.now() - cached.timestamp < this.cacheTimeout) {
            return { data: cached.data, fromCache: true };
        }

        let lastError = null;

        // Try each proxy in sequence
        for (let i = 0; i < this.corsProxies.length; i++) {
            const proxyIndex = (this.currentProxyIndex + i) % this.corsProxies.length;

            try {
                const proxyUrl = this.corsProxies[proxyIndex](feedUrl);
                const text = await this.fetchText(proxyUrl);

                // Cheap sanity check: proxies sometimes answer 200 with an HTML error page
                const looksLikeFeed =
                    text.includes("<?xml") ||
                    text.includes("<rss") ||
                    text.includes("<feed") ||
                    text.includes("<rdf:RDF");
                if (!looksLikeFeed) {
                    throw new Error("Invalid XML response");
                }

                const timestamp = Date.now();

                // Cache successful response in memory
                this.cache.set(feedUrl, {
                    data: text,
                    timestamp: timestamp
                });

                // Save to IndexedDB in the background; it handles its own errors
                void this.saveToPersistentCache(feedUrl, text, timestamp);

                // Remember working proxy
                this.currentProxyIndex = proxyIndex;

                return { data: text, fromCache: false };
            } catch (error) {
                lastError = error;
                // Use debug level for fallback attempts (less noise in console)
                console.debug(`[RSS] Proxy ${proxyIndex} failed for ${feedUrl}:`, error.message);
            }
        }

        // Re-read the cache: a concurrent call may have refreshed it meanwhile
        const fallback = this.cache.get(feedUrl);
        if (fallback && Date.now() - fallback.timestamp < this.staleCacheMaxAge) {
            console.warn(`[RSS] All proxies failed for ${feedUrl}; serving stale cached copy`);
            return { data: fallback.data, fromCache: true, stale: true };
        }

        const reason = lastError?.message ?? "no CORS proxies configured";
        throw new Error(`All proxies failed: ${reason}`, lastError ? { cause: lastError } : undefined);
    }

    /**
     * Parse RSS/Atom XML to structured data.
     * @param {string} xmlText - Raw XML content
     * @returns {Object} Parsed feed data: { title, description, link, items }
     * @throws {Error} When the XML is malformed or is not a recognised feed
     */
    parseXML(xmlText) {
        const doc = new DOMParser().parseFromString(xmlText, "text/xml");

        // Check for parsing errors
        if (doc.querySelector("parsererror")) {
            throw new Error("XML parsing failed");
        }

        // Detect feed type from the root element; only fall back to a
        // descendant search when there is no RSS <channel> at all.
        const rootName = doc.documentElement?.localName;
        const isAtom =
            rootName === "feed" ||
            (doc.querySelector("channel") === null && doc.querySelector("feed") !== null);

        return isAtom ? this.parseAtom(doc) : this.parseRSS(doc);
    }

    /**
     * Parse RSS 2.0 format (also copes with RSS 1.0 / RDF documents that
     * expose <channel> and <item> elements).
     * @param {Document} doc - Parsed XML document
     * @returns {Object} { title, description, link, items }
     * @throws {Error} When no <channel> element exists
     */
    parseRSS(doc) {
        const channel = doc.querySelector("channel");
        if (!channel) {
            throw new Error("Invalid RSS: no channel element");
        }

        // Limit to maxArticlesPerFeed for performance
        const items = Array.from(doc.querySelectorAll("item"))
            .slice(0, this.maxArticlesPerFeed)
            .map(item => ({
                title: this.getOwnText(item, "title"),
                link: this.getOwnText(item, "link"),
                description: this.cleanDescription(this.getOwnText(item, "description")),
                pubDate: this.parseDate(this.getOwnText(item, "pubDate")),
                author: this.getOwnText(item, "author") || this.getDCCreator(item)
            }));

        return {
            title: this.getOwnText(channel, "title"),
            description: this.getOwnText(channel, "description"),
            link: this.getOwnText(channel, "link"),
            items: items.filter(item => item.title && item.link)
        };
    }

    /**
     * Get dc:creator content.
     * Tries the prefixed tag name first, then an unprefixed <creator>.
     * @param {Element} item - RSS <item> element
     * @returns {string} Trimmed creator text, or "" when absent
     */
    getDCCreator(item) {
        const creator = item.getElementsByTagName("dc:creator")[0] || item.getElementsByTagName("creator")[0];
        return creator?.textContent?.trim() || "";
    }

    /**
     * Parse Atom format (supports arXiv API format).
     * @param {Document} doc - Parsed XML document
     * @returns {Object} { title, description, link, items }
     * @throws {Error} When no <feed> element exists
     */
    parseAtom(doc) {
        const feed = doc.querySelector("feed");
        if (!feed) {
            throw new Error("Invalid Atom: no feed element");
        }

        // Limit to maxArticlesPerFeed for performance
        const items = Array.from(doc.querySelectorAll("entry"))
            .slice(0, this.maxArticlesPerFeed)
            .map(entry => {
                // Atom links live in <link href="...">; keep the text fallback
                // for malformed feeds that put the URL in the element body.
                const link = this.getAtomLink(entry) || this.getTextContent(entry, "link");

                // Handle multiple authors (common in arXiv API):
                // join first 3 authors, add "et al." if more
                const names = Array.from(entry.querySelectorAll("author name"))
                    .map(el => el.textContent?.trim())
                    .filter(Boolean);
                let author = names.slice(0, 3).join(", ");
                if (names.length > 3) author += " et al.";

                return {
                    title: this.getOwnText(entry, "title"),
                    link: link,
                    description: this.cleanDescription(
                        this.getOwnText(entry, "summary") || this.getOwnText(entry, "content")
                    ),
                    pubDate: this.parseDate(
                        this.getOwnText(entry, "published") || this.getOwnText(entry, "updated")
                    ),
                    author: author
                };
            });

        return {
            title: this.getOwnText(feed, "title"),
            description: this.getOwnText(feed, "subtitle"),
            link: this.getAtomLink(feed),
            items: items.filter(item => item.title && item.link)
        };
    }

    /**
     * Pick the best href among the <link> elements that are direct children
     * of an Atom <feed> or <entry>.
     * Preference: rel="alternate", then no rel attribute (which Atom treats
     * as "alternate"), then any link with an href. Only direct children are
     * inspected so a feed never inherits a link from one of its entries.
     * @private
     * @param {Element} parent - <feed> or <entry> element
     * @returns {string} The chosen href, or "" when none is found
     */
    getAtomLink(parent) {
        const links = Array.from(parent.children ?? []).filter(
            el => el.localName === "link" && el.getAttribute("href")
        );
        const chosen =
            links.find(el => el.getAttribute("rel") === "alternate") ||
            links.find(el => !el.getAttribute("rel")) ||
            links[0];
        return chosen?.getAttribute("href") || "";
    }

    /**
     * Text of the first unprefixed direct child called `localName` that has
     * non-empty text. Skipping prefixed elements avoids mistaking
     * <atom:link> or <media:title> for the feed's own <link>/<title>.
     * Falls back to the descendant search of getTextContent() when no such
     * child exists (e.g. feeds that prefix every element).
     * @private
     * @param {Element} parent - Element whose children are inspected
     * @param {string} localName - Local tag name to look for
     * @returns {string} Trimmed text, or "" when nothing matches
     */
    getOwnText(parent, localName) {
        for (const child of Array.from(parent.children ?? [])) {
            if (child.localName === localName && !child.prefix) {
                const text = child.textContent?.trim();
                if (text) return text;
            }
        }
        return this.getTextContent(parent, localName);
    }

    /**
     * Helper: Get text content of the first descendant matching a selector.
     * @param {Element|Document} parent - Search root
     * @param {string} selector - CSS selector
     * @returns {string} Trimmed text, or "" when nothing matches
     */
    getTextContent(parent, selector) {
        const element = parent.querySelector(selector);
        return element?.textContent?.trim() || "";
    }

    /**
     * Helper: Parse date string to Date object.
     * @param {string} dateStr - Date text taken from the feed
     * @returns {Date|null} A valid Date, or null for empty/unparseable input
     */
    parseDate(dateStr) {
        if (!dateStr) return null;

        try {
            const date = new Date(dateStr);
            return Number.isNaN(date.getTime()) ? null : date;
        } catch {
            return null;
        }
    }

    /**
     * Helper: Reduce an HTML description to a short plain-text snippet.
     * Uses DOMParser (whose documents do not run scripts) and falls back to
     * regex tag stripping when DOMParser is unavailable or throws. Both
     * paths collapse whitespace and cap the result at 200 characters plus
     * "...". Markup with no text (e.g. a lone <img>) yields "".
     * @param {string} html - Description markup from the feed
     * @returns {string} Plain-text snippet
     */
    cleanDescription(html) {
        if (!html) return "";

        let text;
        try {
            const doc = new DOMParser().parseFromString(html, "text/html");
            text = doc.body?.textContent || "";
        } catch {
            // Ultimate fallback: just strip tags with regex
            text = String(html).replace(/<[^>]*>/g, "");
        }

        // Trim and normalize whitespace
        text = text.trim().replace(/\s+/g, " ");

        // Limit length
        if (text.length > 200) {
            text = text.substring(0, 200) + "...";
        }

        return text;
    }

    /**
     * Main method: Fetch and parse a feed. Never rejects; failures are
     * reported through `status: "error"` on the returned object.
     * @param {Object} feedConfig - Feed configuration object (needs `url`; `name` is used for logging)
     * @returns {Promise<Object>} feedConfig merged with { feed, status, lastFetched }
     *          plus { fromCache, stale } on success or { error } on failure
     */
    async fetchFeed(feedConfig) {
        try {
            const { data: xmlText, fromCache, stale } = await this.fetchWithProxy(feedConfig.url);
            const parsed = this.parseXML(xmlText);

            return {
                ...feedConfig,
                feed: parsed,
                status: "success",
                fromCache: fromCache,
                stale: stale === true,
                lastFetched: new Date()
            };
        } catch (error) {
            // Optional chaining keeps a missing config from throwing inside the handler
            console.error(`Failed to fetch ${feedConfig?.name}:`, error);

            return {
                ...feedConfig,
                feed: null,
                status: "error",
                error: error.message,
                lastFetched: new Date()
            };
        }
    }

    /**
     * Convert one Promise.allSettled() outcome into a feed result object,
     * using the same error shape fetchFeed() produces.
     * @private
     * @param {PromiseSettledResult<Object>} settled - Outcome for one feed
     * @param {Object} config - The feed configuration that was fetched
     * @returns {Object} Feed result
     */
    settledToResult(settled, config) {
        if (settled.status === "fulfilled") {
            return settled.value;
        }
        return {
            ...config,
            feed: null,
            status: "error",
            error: settled.reason?.message || "Unknown error",
            lastFetched: new Date()
        };
    }

    /**
     * Fetch multiple feeds with progressive callback.
     * Feeds are fetched 5 at a time. A 1 second pause separates batches to
     * go easy on upstream APIs; the pause is skipped when the whole batch
     * was answered from fresh cache, since no request was sent.
     * @param {Array} feedConfigs - Array of feed configurations
     * @param {Function} [onProgress] - Called after each batch as
     *        (resultsSoFar, loadedCount, totalCount)
     * @returns {Promise<Array>} Array of feed results, in input order
     */
    async fetchAllFeedsProgressive(feedConfigs, onProgress) {
        const results = [];
        const batchSize = 5; // Fetch 5 at a time for balance
        const batchDelay = 1000; // 1 second delay between batches to be nice to APIs

        for (let i = 0; i < feedConfigs.length; i += batchSize) {
            const batch = feedConfigs.slice(i, i + batchSize);
            const settled = await Promise.allSettled(batch.map(config => this.fetchFeed(config)));
            const batchResults = settled.map((outcome, idx) => this.settledToResult(outcome, batch[idx]));

            results.push(...batchResults);

            // Call progress callback
            if (typeof onProgress === "function") {
                onProgress(results, i + batch.length, feedConfigs.length);
            }

            // Errors and stale fallbacks both mean the network was contacted
            const hitNetwork = batchResults.some(result => !result.fromCache || result.stale);
            const hasMoreBatches = i + batchSize < feedConfigs.length;
            if (hitNetwork && hasMoreBatches) {
                await new Promise(resolve => setTimeout(resolve, batchDelay));
            }
        }

        return results;
    }

    /**
     * Fetch multiple feeds in parallel (legacy).
     * @param {Array} feedConfigs - Array of feed configurations
     * @returns {Promise<Array>} Array of feed results, in input order
     */
    async fetchAllFeeds(feedConfigs) {
        const settled = await Promise.allSettled(feedConfigs.map(config => this.fetchFeed(config)));
        return settled.map((outcome, index) => this.settledToResult(outcome, feedConfigs[index]));
    }

    /**
     * Clear the cache (memory only, keeps IndexedDB).
     */
    clearCache() {
        this.cache.clear();
    }

    /**
     * Clear all cache including IndexedDB.
     * Errors are logged, not thrown.
     * @returns {Promise<void>}
     */
    async clearAllCache() {
        this.cache.clear();
        try {
            if (this.db) {
                await this.db.feeds.clear();
                console.log("🗑️ IndexedDB cache cleared");
            }
            // Also clean up old localStorage if it exists
            localStorage.removeItem("ivy-rss-cache");
        } catch (e) {
            console.warn("Failed to clear cache:", e);
        }
    }
}

// Expose the class on the global `window` object so other scripts
// in the app can reach it as `window.RSSParser`.
window.RSSParser = RSSParser;