Spaces:
Paused
Paused
| import * as cheerio from 'cheerio'; | |
| import { URL } from 'url'; | |
| /** | |
| * Content Cleaner Service | |
| * | |
| * Strips noise (ads, nav, footer) from HTML content to prepare it for Vector DB ingestion. | |
| * Inspired by 'css-stripper-pro-hardened' robust crawling logic. | |
| */ | |
| export class ContentCleaner { | |
| private noiseSelectors = [ | |
| 'nav', 'header', 'footer', 'aside', | |
| '.ads', '.ad-container', '.advertisement', | |
| '#cookie-banner', '.cookie-consent', | |
| '.social-share', '.comments', | |
| 'script', 'style', 'noscript', 'iframe' | |
| ]; | |
| /** | |
| * Clean HTML content and return plain text | |
| */ | |
| clean(html: string, baseUrl?: string): string { | |
| if (!html) return ''; | |
| const $ = cheerio.load(html); | |
| // Remove noise | |
| this.noiseSelectors.forEach(selector => { | |
| $(selector).remove(); | |
| }); | |
| // Resolve relative URLs if base URL provided | |
| if (baseUrl) { | |
| $('a').each((_, el) => { | |
| const href = $(el).attr('href'); | |
| if (href) { | |
| try { | |
| $(el).attr('href', new URL(href, baseUrl).toString()); | |
| } catch { | |
| // Ignore invalid URLs | |
| } | |
| } | |
| }); | |
| $('img').each((_, el) => { | |
| const src = $(el).attr('src'); | |
| if (src) { | |
| try { | |
| $(el).attr('src', new URL(src, baseUrl).toString()); | |
| } catch { | |
| // Ignore | |
| } | |
| } | |
| }); | |
| } | |
| // Extract main text | |
| // We prefer 'main', 'article', or '#content' if available | |
| let mainContent = $('main, article, #content, .content, .post-body').first(); | |
| if (mainContent.length === 0) { | |
| // Fallback to body | |
| mainContent = $('body'); | |
| } | |
| // Convert to text, collapsing whitespace | |
| let text = mainContent.text() | |
| .replace(/\s+/g, ' ') | |
| .trim(); | |
| return text; | |
| } | |
| /** | |
| * Extract metadata (title, description, og:image) | |
| */ | |
| extractMetadata(html: string): Record<string, string> { | |
| const $ = cheerio.load(html); | |
| return { | |
| title: $('title').text() || $('meta[property="og:title"]').attr('content') || '', | |
| description: $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '', | |
| image: $('meta[property="og:image"]').attr('content') || '' | |
| }; | |
| } | |
| } | |
| // Singleton | |
| export const contentCleaner = new ContentCleaner(); | |