""" HTML → Markdown conversion with noise removal for Janus crawler. Strips ads, nav, footer, scripts, styles — keeps the content. """ import re import logging from typing import List, Dict, Any from urllib.parse import urljoin logger = logging.getLogger(__name__) NOISE_SELECTORS = [ "script", "style", "noscript", "iframe", "svg", "canvas", "nav", "footer", "header", "aside", ".ad", ".ads", ".advertisement", ".ad-container", ".sidebar", ".navigation", ".menu", ".breadcrumb", ".cookie", ".cookie-banner", ".gdpr", ".social-share", ".share-buttons", ".comments", ".comment-section", ".newsletter", ".subscribe", ".popup", ".modal", ".overlay", "#ad", "#ads", "#sidebar", "#navigation", "#footer", ".footer", ".header", ] CONTENT_SELECTORS = [ "article", "main", ".content", ".article", ".post", ".entry", "#content", "#main", "#article", ".post-content", ".article-body", ".story-body", ] class ContentProcessor: """Convert HTML to clean Markdown with noise removal.""" def process(self, html: str) -> str: """Convert HTML to clean Markdown.""" if not html: return "" html = self._strip_noise(html) markdown = self._html_to_markdown(html) markdown = self._clean_markdown(markdown) return markdown def _strip_noise(self, html: str) -> str: """Remove noise elements from HTML.""" for selector in NOISE_SELECTORS: if selector.startswith("."): pattern = rf'<[^>]*class="[^"]*{re.escape(selector[1:])}[^"]*"[^>]*>.*?]+>' html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE) elif selector.startswith("#"): pattern = ( rf'<[^>]*id="[^"]*{re.escape(selector[1:])}[^"]*"[^>]*>.*?]+>' ) html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE) else: pattern = rf"<{selector}[^>]*>.*?" html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE) html = re.sub(r"", "", html, flags=re.DOTALL) html = re.sub(r"\s+", " ", html) return html def _html_to_markdown(self, html: str) -> str: """Convert HTML to Markdown.""" try: from markdownify import markdownify as md return md(html, heading_style="ATX", bullets="-", strip=["img"]) except ImportError: return self._fallback_html_to_text(html) def _fallback_html_to_text(self, html: str) -> str: """Fallback HTML to text conversion without markdownify.""" text = re.sub(r"", "\n", html) text = re.sub(r"", "\n\n", text, flags=re.IGNORECASE) text = re.sub( r"]*>", lambda m: f"\n\n{'#' * int(m.group(1))} ", text, flags=re.IGNORECASE, ) text = re.sub(r"", "**", text, flags=re.IGNORECASE) text = re.sub(r"", "*", text, flags=re.IGNORECASE) text = re.sub( r']*href="([^"]*)"[^>]*>(.*?)', r"\2 (\1)", text, flags=re.IGNORECASE, ) text = re.sub(r"<[^>]+>", "", text) text = re.sub(r" ", " ", text) text = re.sub(r"&", "&", text) text = re.sub(r"<", "<", text) text = re.sub(r">", ">", text) text = re.sub(r""", '"', text) text = re.sub(r"'", "'", text) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def _clean_markdown(self, markdown: str) -> str: """Clean up the markdown output.""" lines = markdown.split("\n") cleaned = [] prev_blank = False for line in lines: line = line.strip() if not line: if not prev_blank: cleaned.append("") prev_blank = True else: cleaned.append(line) prev_blank = False result = "\n".join(cleaned).strip() if len(result) > 50000: result = ( result[:50000] + "\n\n[Content truncated — too long for full extraction]" ) return result def extract_links(self, html: str, base_url: str = "") -> List[str]: """Extract all links from HTML.""" links = [] pattern = r']*href=["\']([^"\']+)["\'][^>]*>' for match in re.finditer(pattern, html, re.IGNORECASE): url = match.group(1) if url and not url.startswith(("#", "javascript:", "mailto:")): if base_url and not url.startswith(("http://", "https://")): url = urljoin(base_url, url) links.append(url) return list(set(links)) def extract_metadata(self, html: str, title: str = "") -> Dict[str, Any]: """Extract metadata from HTML.""" metadata = {} if title: metadata["title"] = title og_title = re.search( r']*property="og:title"[^>]*content="([^"]*)"', html, re.IGNORECASE ) if og_title: metadata["og_title"] = og_title.group(1) og_desc = re.search( r']*property="og:description"[^>]*content="([^"]*)"', html, re.IGNORECASE, ) if og_desc: metadata["og_description"] = og_desc.group(1) og_image = re.search( r']*property="og:image"[^>]*content="([^"]*)"', html, re.IGNORECASE ) if og_image: metadata["og_image"] = og_image.group(1) author = re.search( r']*name="author"[^>]*content="([^"]*)"', html, re.IGNORECASE ) if author: metadata["author"] = author.group(1) pub_date = re.search( r']*property="article:published_time"[^>]*content="([^"]*)"', html, re.IGNORECASE, ) if pub_date: metadata["published_at"] = pub_date.group(1) return metadata