Spaces:
Sleeping
Sleeping
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment
class DataExtractor:
    """Extract content, metadata, links and images from an HTML page.

    The page is parsed with BeautifulSoup, stripped of elements matching
    ``ignore_selectors`` (ads, navigation, scripts, forms, ...), and then
    summarised into a plain dictionary by :meth:`extract_structured_data`.
    """

    def __init__(
        self,
        min_text_length: int = 200,
        max_links: int = 50,
        max_images: int = 20,
        max_summary_chars: int = 5000,
    ):
        """Configure selectors and output limits.

        Args:
            min_text_length: Minimum stripped-text length an element needs
                to be reported as a content block.
            max_links: Maximum number of links returned (``None`` or ``0``
                disables the cap).
            max_images: Maximum number of images returned (``None`` or ``0``
                disables the cap).
            max_summary_chars: Maximum length of the text summary (``None``
                or ``0`` disables the cap).
        """
        # CSS selectors whose matches are removed before extraction.
        # NOTE(review): '[class*="ad-"]' / '[id*="ad-"]' are substring
        # matches, so they also hit values such as "head-line" or
        # "load-more" - confirm this is acceptable for the target sites.
        self.ignore_selectors = [
            '.advertisement',
            '.ad',
            '.banner',
            '.popup',
            '#footer',
            '.footer',
            '.sidebar',
            'nav',
            '.navbar',
            '.menu',
            'header',
            '#header',
            'script',
            'style',
            'noscript',
            'iframe',
            'meta',
            'link',
            '[class*="ad-"]',
            '[id*="ad-"]',
            '.cookie-notice',
            '.modal',
            'form',
            'input',
            'button',
            '.social-media',
            '.comments-section',
            '.widget',
        ]
        # CSS selectors tried, in order, when collecting content blocks.
        self.content_selectors = [
            '.main-content',
            'article',
            'p',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'div.content',
            '.post',
            '.article-body',
            '.content-body',
            'section',
            'main',
            'ul',
            'ol',
            'li',
            'table',
            'td',
            'th',
            'blockquote',
            'pre',
            '.text',
            '[class*="content"]',
            '[class*="post"]',
            '[class*="article"]',
            'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])',
        ]
        self.min_text_length = min_text_length
        self.max_links = max_links
        self.max_images = max_images
        self.max_summary_chars = max_summary_chars

    def extract_structured_data(self, html: str, url: str) -> Dict[str, Any]:
        """Extract structured data from HTML for LLM consumption.

        Args:
            html: Raw HTML markup of the page.
            url: URL the page was fetched from; used to resolve relative
                links and to classify links as internal or external.

        Returns:
            A dict with the keys ``content``, ``metadata``, ``structure``,
            ``links``, ``images`` and ``text_summary``.
        """
        soup = BeautifulSoup(html, 'lxml')

        # 'meta' and 'form' are in ignore_selectors, so anything derived
        # from them has to be read before the tree is cleaned; otherwise
        # the description is always empty and the form count always zero.
        description = self._find_description(soup)
        form_count = len(soup.find_all('form'))

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url, description=description),
            "structure": self._extract_structure(soup, form_count=form_count),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup),
        }

    @staticmethod
    def _find_description(soup: BeautifulSoup) -> str:
        """Return the ``<meta name="description">`` content, or ``""``."""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc is None:
            return ""
        return meta_desc.get('content', '')

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse every whitespace run to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_html(self, soup: BeautifulSoup) -> None:
        """Remove unwanted elements in place for cleaner extraction."""
        for selector in self.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove HTML comments (script/style tags were handled above).
        for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
            comment.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        """Extract main content blocks.

        Every element matching one of ``content_selectors`` whose stripped
        text is at least ``min_text_length`` characters long is reported
        once, in the order it is first matched.

        Returns:
            A list of dicts with the keys ``tag``, ``text``, ``html`` and
            ``attributes``.
        """
        content_blocks = []
        # The selectors overlap heavily (e.g. 'article' and
        # '[class*="article"]'), so track elements already emitted to avoid
        # returning the very same node several times.
        seen = set()
        for selector in self.content_selectors:
            for elem in soup.select(selector):
                if id(elem) in seen:
                    continue
                text = elem.get_text(strip=True)
                if len(text) < self.min_text_length:
                    continue
                seen.add(id(elem))
                content_blocks.append({
                    "tag": elem.name,
                    "text": text,
                    "html": str(elem),
                    "attributes": dict(elem.attrs) if elem.attrs else {},
                })
        return content_blocks

    def _extract_metadata(
        self,
        soup: BeautifulSoup,
        url: str,
        description: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Extract page metadata.

        Args:
            soup: Parsed document.
            url: URL of the page.
            description: Pre-computed meta description. When ``None`` the
                description is looked up in ``soup``, which only works if
                the ``<meta>`` tags have not been removed yet.

        Returns:
            A dict with the keys ``title``, ``description``, ``url``,
            ``domain`` and ``headings``.
        """
        if description is None:
            description = self._find_description(soup)
        title = soup.find('title')
        return {
            "title": title.get_text().strip() if title is not None else "",
            "description": description,
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup),
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        """Extract headings, grouped by level (all h1 first, then h2, ...)."""
        return [
            {
                "level": level,
                "text": heading.get_text().strip(),
                "id": heading.get('id', ''),
            }
            for level in range(1, 7)
            for heading in soup.find_all(f'h{level}')
        ]

    def _extract_structure(
        self,
        soup: BeautifulSoup,
        form_count: Optional[int] = None,
    ) -> Dict[str, int]:
        """Count structural elements of the document.

        Args:
            soup: Parsed document.
            form_count: Pre-computed number of ``<form>`` elements. When
                ``None`` the forms are counted in ``soup``.

        Returns:
            A dict with the keys ``sections``, ``paragraphs``, ``lists``,
            ``tables`` and ``forms``.
        """
        if form_count is None:
            form_count = len(soup.find_all('form'))
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": form_count,
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
        """Extract the first ``max_links`` links, resolved against ``base_url``.

        Returns:
            A list of dicts with the keys ``url``, ``text`` and ``internal``
            (``True`` when the link's host equals the host of ``base_url``).
        """
        base_netloc = urlparse(base_url).netloc
        links = []
        # Let the parser stop at the cap instead of collecting every link
        # and slicing afterwards.
        for link in soup.find_all('a', href=True, limit=self.max_links):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_netloc,
            })
        return links

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
        """Extract the first ``max_images`` images with their alt/title text.

        Returns:
            A list of dicts with the keys ``src`` (absolute URL), ``alt``
            and ``caption`` (taken from the ``title`` attribute).
        """
        return [
            {
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', ''),
            }
            for img in soup.find_all('img', src=True, limit=self.max_images)
        ]

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Return whitespace-normalised page text, capped for token efficiency."""
        text = self._normalize_whitespace(soup.get_text())
        # ``or None`` turns a 0/None cap into "no truncation".
        return text[:self.max_summary_chars or None]