Spaces:

WD101
/

OneServerToRuleThemAll

Sleeping

File size: 5,996 Bytes

feea636

from bs4 import BeautifulSoup, Comment
from typing import Dict, List, Optional
import re
from urllib.parse import urljoin, urlparse

class DataExtractor:
    """Convert an HTML document into a dictionary of content and page facts.

    The entry point is :meth:`extract_structured_data`. It parses the HTML,
    removes every element matched by ``ignore_selectors`` and then collects
    content blocks, metadata, element counts, links, images and a plain-text
    summary from what is left.

    Attributes:
        ignore_selectors: CSS selectors whose matches are deleted from the
            tree before extraction.
        content_selectors: CSS selectors whose matches are candidates for
            content blocks.
        min_text_length: Minimum number of text characters an element needs
            in order to be reported as a content block.
    """

    # Upper bounds on the size of the returned collections / summary.
    MAX_LINKS = 50
    MAX_IMAGES = 20
    MAX_SUMMARY_CHARS = 5000

    def __init__(self):
        # NOTE(review): the '[class*="ad-"]' / '[id*="ad-"]' entries are
        # substring matches, so they also hit values such as "head-line" or
        # "load-more" -- confirm that this is intended.
        self.ignore_selectors = [
            '.advertisement',
            '.ad',
            '.banner',
            '.popup',
            '#footer',
            '.footer',
            '.sidebar',
            'nav',
            '.navbar',
            '.menu',
            'header',
            '#header',
            'script',
            'style',
            'noscript',
            'iframe',
            'meta',
            'link',
            '[class*="ad-"]',
            '[id*="ad-"]',
            '.cookie-notice',
            '.modal',
            'form',
            'input',
            'button',
            '.social-media',
            '.comments-section',
            '.widget'
        ]
        self.content_selectors = [
            '.main-content',
            'article',
            'p',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'div.content',
            '.post',
            '.article-body',
            '.content-body',
            'section',
            'main',
            'ul',
            'ol',
            'li',
            'table',
            'td',
            'th',
            'blockquote',
            'pre',
            '.text',
            '[class*="content"]',
            '[class*="post"]',
            '[class*="article"]',
            'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])'
        ]
        self.min_text_length = 200

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption.

        Args:
            html: Markup of the page.
            url: Address of the page; used to resolve relative links and to
                fill the ``url`` / ``domain`` metadata fields.

        Returns:
            A dict with the keys ``content``, ``metadata``, ``structure``,
            ``links``, ``images`` and ``text_summary``.
        """
        soup = BeautifulSoup(html, 'lxml')

        # <meta> and <form> are both listed in ignore_selectors, so anything
        # that depends on them has to be read before the tree is cleaned;
        # afterwards the description would always be "" and the form count 0.
        description = self._find_description(soup)
        form_count = len(soup.find_all('form'))

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url, description),
            "structure": self._extract_structure(soup, form_count),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    @staticmethod
    def _find_description(soup: BeautifulSoup) -> str:
        """Return the ``content`` of ``<meta name="description">`` or ``""``."""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return meta_desc.get('content', '') if meta_desc is not None else ""

    def _clean_html(self, soup: BeautifulSoup):
        """Delete ignored elements and HTML comments from ``soup`` in place."""
        for selector in self.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # ``string=`` is the current name of the deprecated ``text=`` filter.
        for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
            comment.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Collect elements matching ``content_selectors`` that hold enough text.

        Each element is reported at most once, in the order in which it is
        first matched, even when several selectors match it.

        Returns:
            A list of dicts with the keys ``tag``, ``text``, ``html`` and
            ``attributes``.
        """
        content_blocks = []
        seen = set()  # id() of every element already examined

        for selector in self.content_selectors:
            for elem in soup.select(selector):
                if id(elem) in seen:
                    continue
                seen.add(id(elem))

                # A separator is required: with strip=True alone the pieces
                # are joined directly and "Hello <b>world</b>" turns into
                # "Helloworld".
                text = elem.get_text(" ", strip=True)
                if len(text) >= self.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str,
                          description: Optional[str] = None) -> Dict:
        """Extract page metadata.

        Args:
            soup: Parsed document.
            url: Address of the page.
            description: Meta description captured by the caller. When
                ``None`` it is looked up in ``soup`` instead.

        Returns:
            A dict with the keys ``title``, ``description``, ``url``,
            ``domain`` and ``headings``.
        """
        title = soup.find('title')
        if description is None:
            description = self._find_description(soup)

        return {
            "title": title.get_text().strip() if title is not None else "",
            "description": description,
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """List all ``h1``-``h6`` headings with their level, text and id.

        The result is grouped by level (all ``h1`` first, then all ``h2``,
        and so on), not ordered by position in the document.
        """
        headings = []
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup,
                           form_count: Optional[int] = None) -> Dict:
        """Count structural elements in ``soup``.

        Args:
            soup: Parsed document.
            form_count: Number of forms counted by the caller. When ``None``
                the forms are counted in ``soup`` instead.

        Returns:
            A dict with the keys ``sections``, ``paragraphs``, ``lists``,
            ``tables`` and ``forms``.
        """
        if form_count is None:
            form_count = len(soup.find_all('form'))

        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": form_count
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Return up to ``MAX_LINKS`` anchors resolved against ``base_url``.

        Each entry has the keys ``url``, ``text`` and ``internal``;
        ``internal`` is true when the link's host equals the host of
        ``base_url``. Anchors whose ``href`` cannot be parsed are skipped.
        """
        base_netloc = urlparse(base_url).netloc
        links = []
        for anchor in soup.find_all('a', href=True):
            if len(links) >= self.MAX_LINKS:  # Limit for performance
                break
            try:
                href = urljoin(base_url, anchor['href'])
                internal = urlparse(href).netloc == base_netloc
            except ValueError:
                # urllib rejects some malformed URLs (e.g. an unbalanced
                # "["); one bad link must not abort the whole extraction.
                continue
            links.append({
                "url": href,
                "text": anchor.get_text().strip(),
                "internal": internal
            })
        return links

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Return up to ``MAX_IMAGES`` images resolved against ``base_url``.

        Each entry has the keys ``src``, ``alt`` and ``caption`` (taken from
        the ``title`` attribute). Images whose ``src`` cannot be parsed are
        skipped.
        """
        images = []
        for img in soup.find_all('img', src=True):
            if len(images) >= self.MAX_IMAGES:  # Limit for performance
                break
            try:
                src = urljoin(base_url, img['src'])
            except ValueError:
                # Same malformed-URL case as for links.
                continue
            images.append({
                "src": src,
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Return the document text with whitespace collapsed, truncated
        to ``MAX_SUMMARY_CHARS`` characters."""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:self.MAX_SUMMARY_CHARS]  # Limit for token efficiency