// Initial deployment - WidgeTDC Cortex Backend v2.1.0
// Commit 529090e (author: Kraft102)
import * as cheerio from 'cheerio';
import { URL } from 'url';
/**
 * Content Cleaner Service
 *
 * Strips noise (ads, navigation, footers, scripts, ...) from HTML and reduces
 * what is left to plain text so it can be ingested into a Vector DB.
 * Inspired by 'css-stripper-pro-hardened' robust crawling logic.
 */
export class ContentCleaner {
  /** Selectors whose matches are removed wholesale before text extraction. */
  private readonly noiseSelectors = [
    'nav', 'header', 'footer', 'aside',
    '.ads', '.ad-container', '.advertisement',
    '#cookie-banner', '.cookie-consent',
    '.social-share', '.comments',
    'script', 'style', 'noscript', 'iframe'
  ];

  /**
   * Candidate containers for the main content. The first match in document
   * order wins; `<body>` is used when nothing matches.
   */
  private readonly contentSelector = 'main, article, #content, .content, .post-body';

  /**
   * Elements that visually start/end a line or a cell. Their text must be
   * separated from neighbouring text, otherwise words from adjacent
   * paragraphs, list items or table cells are glued together.
   */
  private readonly blockSelector = [
    'address', 'article', 'blockquote', 'caption', 'dd', 'details', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'li', 'main', 'ol', 'p', 'pre', 'section', 'summary',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul'
  ].join(', ');

  /** (element selector, URL-bearing attribute) pairs that get absolutized. */
  private readonly urlAttributes: ReadonlyArray<readonly [string, string]> = [
    ['a', 'href'],
    ['img', 'src']
  ];

  /**
   * Clean HTML content and return plain text.
   *
   * @param html - Raw HTML markup. Empty input yields an empty string.
   * @param baseUrl - Optional base against which relative `href`/`src`
   *   attributes are resolved in the parsed document. Attribute values are not
   *   part of the returned text, so this does not change the result.
   * @returns The text of the main content area with every whitespace run
   *   collapsed to a single space and the ends trimmed.
   */
  clean(html: string, baseUrl?: string): string {
    if (!html) return '';

    const $ = cheerio.load(html);

    // Remove noise
    for (const selector of this.noiseSelectors) {
      $(selector).remove();
    }

    // Resolve relative URLs if base URL provided
    if (baseUrl) {
      for (const [selector, attribute] of this.urlAttributes) {
        $(selector).each((_, el) => {
          const node = $(el);
          const value = node.attr(attribute);
          if (!value) return;
          try {
            node.attr(attribute, new URL(value, baseUrl).toString());
          } catch {
            // Best effort: an unparsable URL is left exactly as it was.
          }
        });
      }
    }

    // Prefer a dedicated content container; fall back to the whole body.
    const preferred = $(this.contentSelector).first();
    const root = preferred.length > 0 ? preferred : $('body');

    // text() joins text nodes without any separator, so put whitespace at
    // every line break and block boundary before extracting. The surplus
    // whitespace is collapsed again below.
    root.find('br').replaceWith(' ');
    root.find(this.blockSelector).before(' ').after(' ');

    // Convert to text, collapsing whitespace
    return root.text().replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract metadata (title, description, og:image).
   *
   * @param html - Raw HTML markup. Empty input yields all-empty fields.
   * @returns An object with `title`, `description` and `image` keys. Values
   *   are trimmed; a missing or blank value is reported as `''`. `title` falls
   *   back to `og:title` and `description` to `og:description`.
   */
  extractMetadata(html: string): Record<string, string> {
    if (!html) {
      return { title: '', description: '', image: '' };
    }

    const $ = cheerio.load(html);

    // Trimmed `content` attribute of the first match, or '' when absent.
    const metaContent = (selector: string): string =>
      $(selector).attr('content')?.trim() ?? '';

    // Only the first <title> counts: inline SVGs may carry their own <title>
    // elements, and text() on the full match set would concatenate them all.
    const documentTitle = $('title').first().text().trim();

    // `||` (not `??`) is deliberate: an empty string must trigger the fallback.
    return {
      title: documentTitle || metaContent('meta[property="og:title"]'),
      description:
        metaContent('meta[name="description"]') ||
        metaContent('meta[property="og:description"]'),
      image: metaContent('meta[property="og:image"]')
    };
  }
}
// Shared module-level ContentCleaner instance (singleton export).
export const contentCleaner: ContentCleaner = new ContentCleaner();