Spaces:

WD101
/

OneServerToRuleThemAll

Sleeping

App Files Files Community

OneServerToRuleThemAll / scraper /data_extractor.py

etukurudinesh

add files

feea636 8 months ago

raw

history blame contribute delete

6 kB

	from bs4 import BeautifulSoup, Comment
	from typing import Dict, List, Optional
	import re
	from urllib.parse import urljoin, urlparse

	class DataExtractor:
	    """Convert raw HTML into a structured, size-bounded dict for LLM consumption.

	    The extractor parses the page, strips boilerplate (ads, navigation,
	    scripts, forms, ...) and then collects content blocks, metadata, DOM
	    statistics, links, images and a plain-text summary.
	    """

	    # Output caps that keep the payload small for downstream consumers.
	    MAX_LINKS = 50
	    MAX_IMAGES = 20
	    MAX_SUMMARY_CHARS = 5000

	    def __init__(self):
	        # CSS selectors whose matches are removed from the tree before
	        # any content extraction happens.
	        self.ignore_selectors = [
	            '.advertisement',
	            '.ad',
	            '.banner',
	            '.popup',
	            '#footer',
	            '.footer',
	            '.sidebar',
	            'nav',
	            '.navbar',
	            '.menu',
	            'header',
	            '#header',
	            'script',
	            'style',
	            'noscript',
	            'iframe',
	            'meta',
	            'link',
	            '[class*="ad-"]',
	            '[id*="ad-"]',
	            '.cookie-notice',
	            '.modal',
	            'form',
	            'input',
	            'button',
	            '.social-media',
	            '.comments-section',
	            '.widget'
	        ]
	        # CSS selectors that are probed, in this order, for content blocks.
	        self.content_selectors = [
	            '.main-content',
	            'article',
	            'p',
	            'h1',
	            'h2',
	            'h3',
	            'h4',
	            'h5',
	            'h6',
	            'div.content',
	            '.post',
	            '.article-body',
	            '.content-body',
	            'section',
	            'main',
	            'ul',
	            'ol',
	            'li',
	            'table',
	            'td',
	            'th',
	            'blockquote',
	            'pre',
	            '.text',
	            '[class*="content"]',
	            '[class*="post"]',
	            '[class*="article"]',
	            'div:not([class="ad"]):not([class="banner"]):not([class*="sidebar"])'
	        ]
	        # Elements whose text is shorter than this are not reported as content.
	        self.min_text_length = 200

	    def extract_structured_data(self, html: str, url: str) -> Dict:
	        """Extract structured data from HTML for LLM consumption.

	        Args:
	            html: Markup of the page; a falsy value is parsed as an empty
	                document.
	            url: Address the page was fetched from; used to resolve relative
	                links and to fill the metadata.

	        Returns:
	            A dict with the keys "content", "metadata", "structure", "links",
	            "images" and "text_summary".
	        """
	        soup = BeautifulSoup(html or "", 'lxml')

	        # 'meta' and 'form' are both listed in ignore_selectors, so whatever
	        # is derived from them must be captured before the tree is cleaned;
	        # afterwards the description would always be empty and the form
	        # count always zero.
	        description = self._extract_description(soup)
	        form_count = len(soup.find_all('form'))

	        # Remove unwanted elements
	        self._clean_html(soup)

	        metadata = self._extract_metadata(soup, url)
	        if not metadata["description"]:
	            metadata["description"] = description

	        structure = self._extract_structure(soup)
	        structure["forms"] = form_count

	        return {
	            "content": self._extract_content(soup),
	            "metadata": metadata,
	            "structure": structure,
	            "links": self._extract_links(soup, url),
	            "images": self._extract_images(soup, url),
	            "text_summary": self._extract_text_summary(soup)
	        }

	    @staticmethod
	    def _safe_urljoin(base_url: str, reference: str) -> Optional[str]:
	        """Resolve *reference* against *base_url*; return None if it is malformed.

	        urljoin() raises ValueError for some broken references (for example
	        an unbalanced IPv6 bracket); one bad attribute on a page should not
	        abort the whole extraction.
	        """
	        try:
	            return urljoin(base_url, reference)
	        except ValueError:
	            return None

	    @staticmethod
	    def _extract_description(soup: BeautifulSoup) -> str:
	        """Return the content of <meta name="description">, or "" if absent."""
	        meta_desc = soup.find('meta', attrs={'name': 'description'})
	        if meta_desc is None:
	            return ""
	        return meta_desc.get('content', '')

	    def _clean_html(self, soup: BeautifulSoup):
	        """Remove unwanted elements in place for cleaner extraction."""
	        for selector in self.ignore_selectors:
	            for element in soup.select(selector):
	                element.decompose()

	        # Remove HTML comments. `string=` replaces the deprecated `text=`
	        # keyword of the find_all() family.
	        for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
	            comment.extract()

	    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
	        """Extract main content blocks.

	        Every element matched by one of `content_selectors` whose text has
	        at least `min_text_length` characters is reported once, with its tag
	        name, text, markup and attributes.
	        """
	        content_blocks = []
	        # The selectors overlap heavily (a <div class="content"> matches
	        # several of them), so remember which nodes were already handled.
	        # Identity is used rather than equality so that distinct elements
	        # with identical markup are still reported separately.
	        seen = set()

	        for selector in self.content_selectors:
	            for elem in soup.select(selector):
	                if id(elem) in seen:
	                    continue
	                seen.add(id(elem))

	                # With strip=True and no separator the stripped fragments
	                # are concatenated directly, gluing neighbouring words.
	                text = elem.get_text(separator=' ', strip=True)
	                if len(text) < self.min_text_length:
	                    continue

	                content_blocks.append({
	                    "tag": elem.name,
	                    "text": text,
	                    "html": str(elem),
	                    "attributes": dict(elem.attrs) if elem.attrs else {}
	                })

	        return content_blocks

	    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
	        """Extract page metadata: title, description, url, domain, headings.

	        The description is empty when the <meta> tags have already been
	        removed from *soup* by _clean_html().
	        """
	        title = soup.find('title')

	        return {
	            "title": title.get_text().strip() if title is not None else "",
	            "description": self._extract_description(soup),
	            "url": url,
	            "domain": urlparse(url).netloc,
	            "headings": self._extract_headings(soup)
	        }

	    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
	        """Extract the heading hierarchy (h1-h6) in document order.

	        Document order is what allows a consumer to rebuild the nesting;
	        grouping the headings by level would lose it.
	        """
	        heading_tags = [f'h{level}' for level in range(1, 7)]
	        return [
	            {
	                "level": int(heading.name[1]),
	                "text": heading.get_text().strip(),
	                "id": heading.get('id', '')
	            }
	            for heading in soup.find_all(heading_tags)
	        ]

	    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
	        """Count structural elements of *soup* as it is passed in.

	        On a tree that went through _clean_html() the "forms" count is 0,
	        because <form> elements are among the removed selectors.
	        """
	        return {
	            "sections": len(soup.find_all(['section', 'article', 'div'])),
	            "paragraphs": len(soup.find_all('p')),
	            "lists": len(soup.find_all(['ul', 'ol'])),
	            "tables": len(soup.find_all('table')),
	            "forms": len(soup.find_all('form'))
	        }

	    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
	        """Extract up to MAX_LINKS links for relationship mapping.

	        Each entry holds the absolute url, the anchor text and whether the
	        link points to the same network location as *base_url*. Links whose
	        href cannot be resolved are skipped.
	        """
	        base_netloc = urlparse(base_url).netloc  # invariant, parse it once
	        links = []
	        for link in soup.find_all('a', href=True):
	            if len(links) >= self.MAX_LINKS:
	                break  # Limit for performance
	            href = self._safe_urljoin(base_url, link['href'])
	            if href is None:
	                continue
	            links.append({
	                "url": href,
	                "text": link.get_text().strip(),
	                "internal": urlparse(href).netloc == base_netloc
	            })
	        return links

	    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
	        """Extract up to MAX_IMAGES images with their alt text and title.

	        Images whose src cannot be resolved against *base_url* are skipped.
	        """
	        images = []
	        for img in soup.find_all('img', src=True):
	            if len(images) >= self.MAX_IMAGES:
	                break  # Limit for performance
	            src = self._safe_urljoin(base_url, img['src'])
	            if src is None:
	                continue
	            images.append({
	                "src": src,
	                "alt": img.get('alt', ''),
	                "caption": img.get('title', '')
	            })
	        return images

	    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
	        """Extract clean, whitespace-normalised text capped at MAX_SUMMARY_CHARS."""
	        # A separator keeps the text of adjacent elements from being fused
	        # when the markup has no whitespace between them.
	        text = soup.get_text(separator=' ')
	        # Clean whitespace and normalize
	        text = re.sub(r'\s+', ' ', text).strip()
	        return text[:self.MAX_SUMMARY_CHARS]  # Limit for token efficiency