// ivy-rss-hub/scripts/rss-parser.js
// Uploaded by ijohn07 — commit 790eee5 ("Upload 16 files", verified)
/* ============================================
IVY'S RSS HUB — RSS Parser Module
Fetches and parses RSS/Atom feeds
Uses Dexie (IndexedDB) for caching 🌿
============================================ */
/**
 * RSSParser - Fetches RSS/Atom feeds through CORS proxies and parses them
 * into plain objects.
 *
 * Raw XML responses are kept in an in-memory Map and mirrored into IndexedDB
 * (via Dexie) so that a recent response can be reused after a page reload.
 */
class RSSParser {
  constructor() {
    // Array of functions, each mapping a feed URL to a proxied URL
    this.corsProxies = window.FeedsConfig.CORS_PROXIES;
    this.currentProxyIndex = 0;
    this.cache = new Map(); // Memory cache for current session
    this.cacheTimeout = 5 * 60 * 1000; // 5 minutes cache
    this.maxArticlesPerFeed = 25; // Limit parsing to 25 articles per feed
    this.fetchTimeout = 15 * 1000; // Give up on a single proxy request after 15 s
    this.dbReady = false;
    // Keep the init promise: cache reads/writes wait on it so that nothing is
    // missed or dropped while IndexedDB is still loading. It never rejects
    // because initDatabase() handles its own errors.
    this.dbInit = this.initDatabase();
  }

  /**
   * Initialize Dexie database for persistent caching.
   * Failures are logged and leave `dbReady` false; this method never rejects.
   * @returns {Promise<void>}
   */
  async initDatabase() {
    try {
      this.db = new Dexie("IvyRSSHubCache");
      // NOTE(review): Dexie only needs the primary key and indexed properties
      // listed here; indexing `data` (the raw XML) looks unintended. Changing
      // it requires a schema version bump, so it is left untouched.
      this.db.version(1).stores({
        feeds: "url, data, timestamp"
      });
      // Load cached feeds into memory
      await this.loadPersistentCache();
      this.dbReady = true;
      console.log("📦 Dexie database ready");
    } catch (e) {
      console.warn("Failed to init Dexie:", e);
      this.dbReady = false;
    }
  }

  /**
   * Load cache from IndexedDB into memory.
   * Entries older than two hours are deleted from IndexedDB instead.
   * @returns {Promise<void>}
   */
  async loadPersistentCache() {
    try {
      if (!this.db) return;
      const maxAge = 2 * 60 * 60 * 1000; // 2 hours max
      const now = Date.now();
      // Get all cached feeds
      const cached = await this.db.feeds.toArray();
      let loaded = 0;
      for (const item of cached) {
        if (now - item.timestamp >= maxAge) {
          // Delete old entries
          await this.db.feeds.delete(item.url);
          continue;
        }
        // Never replace a fresher in-memory entry with an older persisted one
        const inMemory = this.cache.get(item.url);
        if (inMemory && inMemory.timestamp >= item.timestamp) {
          continue;
        }
        this.cache.set(item.url, {
          data: item.data,
          timestamp: item.timestamp
        });
        loaded++;
      }
      if (loaded > 0) {
        console.log(`📦 Loaded ${loaded} cached feeds from IndexedDB`);
      }
    } catch (e) {
      console.warn("Failed to load cache:", e);
    }
  }

  /**
   * Save a feed to IndexedDB (best effort: failures are logged, not thrown).
   * @param {string} url - Feed URL, used as the primary key
   * @param {string} data - Raw XML content
   * @param {number} timestamp - Fetch time in milliseconds since the epoch
   * @returns {Promise<void>}
   */
  async saveToPersistentCache(url, data, timestamp) {
    try {
      // Wait for initialization so early writes are not silently dropped
      await this.dbInit;
      if (!this.db || !this.dbReady) return;
      await this.db.feeds.put({ url, data, timestamp });
    } catch (e) {
      console.warn("Failed to save to cache:", e);
    }
  }

  /**
   * Fetch feed with CORS proxy fallback
   * @param {string} feedUrl - The RSS feed URL
   * @returns {Promise<{data: string, fromCache: boolean}>} - Raw XML content
   *   and whether it was served from the cache
   * @throws {Error} When every configured proxy fails (or none is configured)
   */
  async fetchWithProxy(feedUrl) {
    // Make sure the persisted cache has been loaded before consulting it
    await this.dbInit;

    // Check cache first
    const cached = this.cache.get(feedUrl);
    if (cached && Date.now() - cached.timestamp < this.cacheTimeout) {
      return { data: cached.data, fromCache: true };
    }

    let lastError = null;
    const proxyCount = this.corsProxies.length;
    // Try each proxy in sequence, starting with the last one that worked
    for (let i = 0; i < proxyCount; i++) {
      const proxyIndex = (this.currentProxyIndex + i) % proxyCount;
      const proxyUrl = this.corsProxies[proxyIndex](feedUrl);
      try {
        const text = await this.fetchText(proxyUrl);
        // Validate it's XML
        if (!this.looksLikeXml(text)) {
          throw new Error("Invalid XML response");
        }
        const timestamp = Date.now();
        // Cache successful response in memory
        this.cache.set(feedUrl, { data: text, timestamp });
        // Save to IndexedDB (intentionally not awaited; it handles its own errors)
        this.saveToPersistentCache(feedUrl, text, timestamp);
        // Remember working proxy
        this.currentProxyIndex = proxyIndex;
        return { data: text, fromCache: false };
      } catch (error) {
        lastError = error;
        // Use debug level for fallback attempts (less noise in console)
        console.debug(`[RSS] Proxy ${proxyIndex} failed for ${feedUrl}:`, error.message);
      }
    }
    const reason = lastError?.message ?? "no CORS proxies configured";
    throw new Error(`All proxies failed: ${reason}`, { cause: lastError });
  }

  /**
   * Helper: GET a URL and return the response body as text.
   * The request is aborted after `fetchTimeout` milliseconds so a hung proxy
   * cannot block the fallback loop.
   * @param {string} url - Fully built (proxied) URL
   * @returns {Promise<string>} - Response body
   * @throws {Error} On a non-2xx status, network failure or timeout
   */
  async fetchText(url) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.fetchTimeout);
    try {
      const response = await fetch(url, {
        headers: {
          Accept: "application/rss+xml, application/xml, text/xml, application/atom+xml"
        },
        signal: controller.signal
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      // The timer stays armed while the body is read
      return await response.text();
    } catch (error) {
      if (error?.name === "AbortError") {
        throw new Error(`Request timed out after ${this.fetchTimeout}ms`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Helper: Cheap sanity check that a response body is an XML feed
   * @param {string} text - Response body
   * @returns {boolean}
   */
  looksLikeXml(text) {
    return text.includes("<?xml") || text.includes("<rss") || text.includes("<feed");
  }

  /**
   * Parse RSS/Atom XML to structured data
   * @param {string} xmlText - Raw XML content
   * @returns {Object} - Parsed feed data ({ title, description, link, items })
   * @throws {Error} When the XML cannot be parsed or has no channel/feed element
   */
  parseXML(xmlText) {
    const doc = new DOMParser().parseFromString(xmlText, "text/xml");
    // DOMParser reports errors through a <parsererror> element, not by throwing
    if (doc.querySelector("parsererror")) {
      throw new Error("XML parsing failed");
    }
    // Detect feed type (RSS or Atom)
    const isAtom = doc.querySelector("feed") !== null;
    return isAtom ? this.parseAtom(doc) : this.parseRSS(doc);
  }

  /**
   * Parse RSS 2.0 format
   * @param {Document} doc - Parsed XML document
   * @returns {Object} - { title, description, link, items }
   */
  parseRSS(doc) {
    const channel = doc.querySelector("channel");
    if (!channel) {
      throw new Error("Invalid RSS: no channel element");
    }
    // Limit to maxArticlesPerFeed for performance
    const items = Array.from(doc.querySelectorAll("item"))
      .slice(0, this.maxArticlesPerFeed)
      .map(item => ({
        title: this.getChildText(item, "title"),
        link: this.getChildText(item, "link"),
        description: this.cleanDescription(this.getChildText(item, "description")),
        pubDate: this.parseDate(this.getChildText(item, "pubDate")),
        author: this.getChildText(item, "author") || this.getDCCreator(item)
      }));
    return {
      title: this.getChildText(channel, "title"),
      description: this.getChildText(channel, "description"),
      link: this.getChildText(channel, "link"),
      items: items.filter(item => item.title && item.link)
    };
  }

  /**
   * Get dc:creator content (handles namespace properly)
   * @param {Element} item - RSS <item> element
   * @returns {string} - Trimmed creator text, or "" when absent
   */
  getDCCreator(item) {
    // Try different approaches for dc:creator namespace
    const creator = item.getElementsByTagName("dc:creator")[0] || item.getElementsByTagName("creator")[0];
    return creator?.textContent?.trim() || "";
  }

  /**
   * Parse Atom format (supports arXiv API format)
   * @param {Document} doc - Parsed XML document
   * @returns {Object} - { title, description, link, items }
   */
  parseAtom(doc) {
    const feed = doc.querySelector("feed");
    if (!feed) {
      throw new Error("Invalid Atom: no feed element");
    }
    // Limit to maxArticlesPerFeed for performance
    const items = Array.from(doc.querySelectorAll("entry"))
      .slice(0, this.maxArticlesPerFeed)
      .map(entry => {
        // Atom links can be in <link href="..."> format
        const linkElement = entry.querySelector('link[rel="alternate"]') || entry.querySelector("link");
        const link = linkElement?.getAttribute("href") || this.getTextContent(entry, "link");
        // Handle multiple authors (common in arXiv API):
        // join first 3 authors, add "et al." if more
        const names = Array.from(entry.querySelectorAll("author name"))
          .map(el => el.textContent?.trim())
          .filter(Boolean);
        let author = names.slice(0, 3).join(", ");
        if (names.length > 3) author += " et al.";
        return {
          title: this.getChildText(entry, "title"),
          link,
          description: this.cleanDescription(
            this.getChildText(entry, "summary") || this.getChildText(entry, "content")
          ),
          pubDate: this.parseDate(
            this.getChildText(entry, "published") || this.getChildText(entry, "updated")
          ),
          author
        };
      });
    const titleLink = feed.querySelector('link[rel="alternate"]') || feed.querySelector("link");
    return {
      title: this.getChildText(feed, "title"),
      description: this.getChildText(feed, "subtitle"),
      link: titleLink?.getAttribute("href") || "",
      items: items.filter(item => item.title && item.link)
    };
  }

  /**
   * Helper: Get text content of the first descendant matching a selector
   * @param {Element} parent - Element to search in
   * @param {string} selector - CSS selector
   * @returns {string} - Trimmed text, or "" when nothing matches
   */
  getTextContent(parent, selector) {
    const element = parent.querySelector(selector);
    return element?.textContent?.trim() || "";
  }

  /**
   * Helper: Get text of a direct, un-prefixed child element.
   * A plain descendant lookup can hit a namespaced or nested element first
   * (e.g. an <atom:link> placed before <link>, or a nested <title>), so the
   * direct child is preferred. Falls back to getTextContent() when no direct
   * child yields text.
   * @param {Element} parent - Element whose children are inspected
   * @param {string} tagName - Exact tag name, without namespace prefix
   * @returns {string} - Trimmed text, or "" when nothing matches
   */
  getChildText(parent, tagName) {
    const child = Array.from(parent.children || []).find(el => el.tagName === tagName);
    const text = child?.textContent?.trim();
    return text || this.getTextContent(parent, tagName);
  }

  /**
   * Helper: Parse date string to Date object
   * @param {string} dateStr - Date string from the feed
   * @returns {Date|null} - Parsed date, or null when empty or unparseable
   */
  parseDate(dateStr) {
    if (!dateStr) return null;
    const date = new Date(dateStr);
    return Number.isNaN(date.getTime()) ? null : date;
  }

  /**
   * Helper: Clean HTML from description
   * Uses DOMParser for safer HTML stripping (avoids script execution).
   * Markup-only input (e.g. a lone <img>) yields "" rather than raw tags.
   * @param {string} html - Description markup from the feed
   * @returns {string} - Plain text, whitespace-normalized, at most 200
   *   characters followed by "..." when truncated
   */
  cleanDescription(html) {
    if (!html) return "";
    let text;
    try {
      const doc = new DOMParser().parseFromString(html, "text/html");
      text = doc.body.textContent || "";
    } catch {
      // Ultimate fallback: just strip tags with regex
      text = html.replace(/<[^>]*>/g, "");
    }
    // Trim and normalize whitespace, then limit length
    return this.truncateText(text.trim().replace(/\s+/g, " "), 200);
  }

  /**
   * Helper: Shorten text to a maximum length, appending "..." when cut.
   * @param {string} text - Text to shorten
   * @param {number} maxLength - Maximum number of UTF-16 code units to keep
   * @returns {string}
   */
  truncateText(text, maxLength) {
    if (text.length <= maxLength) return text;
    let end = maxLength;
    // Do not cut between the two halves of a surrogate pair (e.g. an emoji)
    const lastUnit = text.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      end -= 1;
    }
    return `${text.substring(0, end)}...`;
  }

  /**
   * Main method: Fetch and parse a feed.
   * Never rejects: failures are reported through `status: "error"`.
   * @param {Object} feedConfig - Feed configuration object (needs `url`; `name` is used for logging)
   * @returns {Promise<Object>} - feedConfig plus feed, status, lastFetched and
   *   either fromCache (success) or error (failure)
   */
  async fetchFeed(feedConfig) {
    try {
      const { data: xmlText, fromCache } = await this.fetchWithProxy(feedConfig.url);
      const parsed = this.parseXML(xmlText);
      return {
        ...feedConfig,
        feed: parsed,
        status: "success",
        fromCache,
        lastFetched: new Date()
      };
    } catch (error) {
      console.error(`Failed to fetch ${feedConfig?.name}:`, error);
      return {
        ...feedConfig,
        feed: null,
        status: "error",
        error: error.message,
        lastFetched: new Date()
      };
    }
  }

  /**
   * Helper: Turn a Promise.allSettled outcome into a feed result, using the
   * same error shape as fetchFeed().
   * @param {Object} outcome - Entry from Promise.allSettled
   * @param {Object} feedConfig - Configuration the outcome belongs to
   * @returns {Object} - Feed result
   */
  toFeedResult(outcome, feedConfig) {
    if (outcome.status === "fulfilled") {
      return outcome.value;
    }
    return {
      ...feedConfig,
      feed: null,
      status: "error",
      error: outcome.reason?.message || "Unknown error",
      lastFetched: new Date()
    };
  }

  /**
   * Fetch multiple feeds with progressive callback.
   * Feeds are fetched in batches of 5 with a 1 second pause between batches.
   * @param {Array} feedConfigs - Array of feed configurations
   * @param {Function} onProgress - Called after each batch with
   *   (resultsSoFar, loadedCount, totalCount)
   * @returns {Promise<Array>} - Array of parsed feeds
   */
  async fetchAllFeedsProgressive(feedConfigs, onProgress) {
    const results = [];
    const batchSize = 5; // Fetch 5 at a time for balance
    // 1 second delay between batches to be nice to APIs.
    // NOTE(review): the original comment cites arXiv's 3s rule, but the
    // configured delay is 1s; confirm which value is intended.
    const batchDelay = 1000;
    for (let i = 0; i < feedConfigs.length; i += batchSize) {
      const batch = feedConfigs.slice(i, i + batchSize);
      const outcomes = await Promise.allSettled(batch.map(config => this.fetchFeed(config)));
      outcomes.forEach((outcome, idx) => {
        results.push(this.toFeedResult(outcome, batch[idx]));
      });
      // Call progress callback
      if (onProgress) {
        onProgress(results, i + batch.length, feedConfigs.length);
      }
      // Only delay if there are more batches to process
      if (i + batchSize < feedConfigs.length) {
        await new Promise(resolve => setTimeout(resolve, batchDelay));
      }
    }
    return results;
  }

  /**
   * Fetch multiple feeds in parallel (legacy)
   * @param {Array} feedConfigs - Array of feed configurations
   * @returns {Promise<Array>} - Array of parsed feeds
   */
  async fetchAllFeeds(feedConfigs) {
    const outcomes = await Promise.allSettled(feedConfigs.map(config => this.fetchFeed(config)));
    return outcomes.map((outcome, index) => this.toFeedResult(outcome, feedConfigs[index]));
  }

  /**
   * Clear the cache (memory only, keeps IndexedDB)
   */
  clearCache() {
    this.cache.clear();
  }

  /**
   * Clear all cache including IndexedDB (best effort: failures are logged)
   * @returns {Promise<void>}
   */
  async clearAllCache() {
    // Let a pending initial load finish first so it cannot repopulate the
    // memory cache after it has been cleared
    await this.dbInit;
    this.cache.clear();
    try {
      if (this.db) {
        await this.db.feeds.clear();
        console.log("🗑️ IndexedDB cache cleared");
      }
    } catch (e) {
      console.warn("Failed to clear cache:", e);
    }
    try {
      // Also clean up old localStorage if it exists
      localStorage.removeItem("ivy-rss-cache");
    } catch (e) {
      console.warn("Failed to clear cache:", e);
    }
  }
}
// Export for use in app: expose the class as a property of the global
// `window` object so it can be referenced as `window.RSSParser`.
window.RSSParser = RSSParser;