Spaces:

NeerajCodz
/

scrapeRL

Sleeping

App Files Files Community

scrapeRL / backend /app /utils /html.py

NeerajCodz

feat: add API routes and utility modules

27cde0c 2 months ago

raw

history blame

8.08 kB

	"""HTML processing utilities for ScrapeRL backend."""

	import re
	from typing import Any, Optional
	from bs4 import BeautifulSoup, Tag, NavigableString

	from app.utils.logging import get_logger

	# Logger for this module, obtained from the project's logging helper
	# using the module's import name.
	logger = get_logger(__name__)


	def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
	    """
	    Build a BeautifulSoup document tree from an HTML string.

	    Args:
	        html: Raw HTML markup to parse
	        parser: Name of the parser backend handed to BeautifulSoup
	            (html.parser, lxml, html5lib)

	    Returns:
	        The BeautifulSoup object created for the markup
	    """
	    document = BeautifulSoup(html, features=parser)
	    return document


	def clean_html(
	    html: str,
	    remove_scripts: bool = True,
	    remove_styles: bool = True,
	    remove_comments: bool = True,
	    remove_tags: Optional[list[str]] = None,
	) -> str:
	    """
	    Strip unwanted elements out of an HTML document.

	    Args:
	        html: Raw HTML string
	        remove_scripts: Drop every <script> element
	        remove_styles: Drop every <style> element
	        remove_comments: Drop HTML comment nodes
	        remove_tags: Names of further tags whose elements are dropped

	    Returns:
	        The remaining document serialized back to a string
	    """
	    document = parse_html(html)

	    def discard_elements(name: str) -> None:
	        # Destroy every element with this tag name, subtree included.
	        for node in document.find_all(name):
	            node.decompose()

	    if remove_scripts:
	        discard_elements("script")

	    if remove_styles:
	        discard_elements("style")

	    if remove_comments:
	        from bs4 import Comment

	        def is_comment(text):
	            return isinstance(text, Comment)

	        # Comments are string nodes, so they are detached rather than decomposed.
	        for node in document.find_all(string=is_comment):
	            node.extract()

	    if remove_tags:
	        for name in remove_tags:
	            discard_elements(name)

	    return str(document)


	def extract_text(
	    html: str,
	    separator: str = " ",
	    strip: bool = True,
	) -> str:
	    """
	    Return the visible text of an HTML document.

	    Args:
	        html: Raw HTML string
	        separator: String placed between adjacent text segments
	        strip: When true, collapse every run of whitespace to a single
	            space and trim both ends of the result

	    Returns:
	        Text of the document with <script>, <style> and <noscript>
	        content left out
	    """
	    document = parse_html(html)

	    # These elements carry no readable text, so drop them before extraction.
	    for node in document.find_all(["script", "style", "noscript"]):
	        node.decompose()

	    raw_text = document.get_text(separator=separator)
	    if not strip:
	        return raw_text

	    collapsed = re.sub(r"\s+", " ", raw_text)
	    return collapsed.strip()


	def semantic_chunk(
	    html: str,
	    max_chunk_size: int = 4000,
	    overlap: int = 200,
	) -> list[dict[str, Any]]:
	    """
	    Split HTML content into semantic chunks based on structure.

	    Text is gathered block by block in document order. Every text node is
	    credited exactly once, to its nearest enclosing semantic tag, so a
	    container such as <div> does not repeat the text of the <p> elements
	    nested inside it. Text outside every semantic tag is only picked up by
	    the plain-text fallback, which runs when no semantic block yielded text.

	    Args:
	        html: Raw HTML string
	        max_chunk_size: Target maximum characters per chunk. A single block
	            longer than this is kept whole and the overlap prefix is added
	            on top, so a chunk can exceed it.
	        overlap: Number of trailing characters of the previous chunk that
	            are repeated at the start of the next one

	    Returns:
	        List of chunk dictionaries with "text", "metadata" (the contributing
	        "tags" and the first 100 characters of each heading under
	        "headings") and "char_count" (length of "text")
	    """
	    soup = parse_html(html)
	    chunks: list[dict[str, Any]] = []

	    # Remove non-content elements
	    for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
	        element.decompose()

	    # Tags that act as semantic boundaries
	    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")
	    semantic_tags = ["article", "section", "div", "p", *heading_tags]

	    def iter_blocks():
	        """Yield (tag_name, text) for each run of text owned by one semantic element."""
	        owner = None
	        parts: list[str] = []
	        for node in soup.strings:
	            piece = node.strip()
	            if not piece:
	                continue
	            # Nearest semantic ancestor owns the text; this is what prevents
	            # a container from re-emitting its nested blocks' text.
	            parent = node.find_parent(semantic_tags)
	            if parent is None:
	                continue
	            if parent is not owner and parts:
	                yield owner.name, " ".join(parts)
	                parts = []
	            owner = parent
	            parts.append(piece)
	        if parts:
	            yield owner.name, " ".join(parts)

	    def make_chunk(raw: str, metadata: dict[str, Any]) -> dict[str, Any]:
	        """Build a chunk record whose char_count matches the stored text."""
	        stored = raw.strip()
	        return {
	            "text": stored,
	            "metadata": metadata,
	            "char_count": len(stored),
	        }

	    current_chunk = ""
	    current_metadata: dict[str, Any] = {"tags": [], "headings": []}

	    for tag_name, text in iter_blocks():
	        # Close the running chunk if adding this block would exceed max size
	        if current_chunk and len(current_chunk) + len(text) + 1 > max_chunk_size:
	            chunks.append(make_chunk(current_chunk, current_metadata))
	            # Start new chunk with overlap
	            tail = current_chunk[-overlap:] if overlap > 0 else ""
	            current_chunk = f"{tail} {text}" if tail else text
	            current_metadata = {"tags": [tag_name], "headings": []}
	        else:
	            current_chunk = f"{current_chunk} {text}" if current_chunk else text
	            current_metadata["tags"].append(tag_name)

	        # Track headings
	        if tag_name in heading_tags:
	            current_metadata["headings"].append(text[:100])

	    # Add remaining content
	    if current_chunk.strip():
	        chunks.append(make_chunk(current_chunk, current_metadata))

	    # If no semantic chunks found, fall back to simple chunking
	    if not chunks:
	        text = extract_text(html)
	        window = max(max_chunk_size, 1)
	        # An overlap outside (0, window) would give a zero or negative step
	        # (range() error / no output) or leave gaps, so it is ignored here.
	        step = window - overlap if 0 < overlap < window else window
	        for start in range(0, len(text), step):
	            chunk_text = text[start : start + window].strip()
	            if chunk_text:
	                chunks.append({
	                    "text": chunk_text,
	                    "metadata": {"tags": [], "headings": []},
	                    "char_count": len(chunk_text),
	                })

	    return chunks


	def extract_links(
	    html: str,
	    base_url: Optional[str] = None,
	    include_text: bool = True,
	) -> list[dict[str, str]]:
	    """
	    Extract all links from HTML.

	    Anchors whose href is empty, points only to a fragment ("#...") or uses
	    the javascript: scheme (in any letter case) are skipped. Whitespace
	    around the href value is removed before it is examined.

	    Args:
	        html: Raw HTML string
	        base_url: Base URL for resolving relative links. Hrefs that already
	            start with http://, https:// or // are left as written.
	        include_text: Include link text in results

	    Returns:
	        List of link dictionaries with "href", optionally "text", and
	        "title" when the anchor has a non-empty title attribute
	    """
	    from urllib.parse import urljoin

	    soup = parse_html(html)
	    links: list[dict[str, str]] = []

	    for anchor in soup.find_all("a", href=True):
	        # Surrounding whitespace is not part of the URL and would defeat
	        # the prefix checks below.
	        href = anchor.get("href", "").strip()
	        if not href or href.startswith("#"):
	            continue
	        # URI schemes are case-insensitive, so "JavaScript:" must match too.
	        if href.lower().startswith("javascript:"):
	            continue

	        # Resolve relative URLs
	        if base_url and not href.startswith(("http://", "https://", "//")):
	            href = urljoin(base_url, href)

	        link_data: dict[str, str] = {"href": href}

	        if include_text:
	            link_data["text"] = anchor.get_text(strip=True)

	        # Include title if present
	        title = anchor.get("title")
	        if title:
	            link_data["title"] = title

	        links.append(link_data)

	    return links


	def extract_tables(
	    html: str,
	    include_headers: bool = True,
	) -> list[dict[str, Any]]:
	    """
	    Extract tables from HTML as structured data.

	    Each <table> is reported on its own: rows and cells that belong to a
	    table nested inside a cell are attributed to that nested table only.
	    Rows from every section of the table outside <thead> are read in
	    document order. Rows inside <thead> are never emitted as data rows.

	    Args:
	        html: Raw HTML string
	        include_headers: Try to identify header cells. The first <thead>
	            row is used when present; otherwise the first row containing
	            a <th> cell is used.

	    Returns:
	        List of table dictionaries with "headers" (list of cell texts,
	        empty when none were identified) and "rows" (list of lists of cell
	        texts). Tables with neither headers nor rows are omitted.
	    """
	    soup = parse_html(html)
	    tables: list[dict[str, Any]] = []

	    def own_cells(row: Tag) -> list[Tag]:
	        """Return the td/th cells of this row, ignoring cells of nested rows."""
	        return [
	            cell
	            for cell in row.find_all(["td", "th"])
	            if cell.find_parent("tr") is row
	        ]

	    for table in soup.find_all("table"):
	        # find_all is recursive; keep only rows whose nearest table is this one.
	        own_rows = [
	            row for row in table.find_all("tr") if row.find_parent("table") is table
	        ]
	        head_rows = [row for row in own_rows if row.parent.name == "thead"]
	        data_candidates = [row for row in own_rows if row.parent.name != "thead"]

	        headers: list[str] = []
	        rows: list[list[str]] = []

	        # Extract headers from thead when available
	        if include_headers and head_rows:
	            headers = [cell.get_text(strip=True) for cell in own_cells(head_rows[0])]

	        # Extract data rows; thead rows are excluded so the header row is
	        # not repeated as data when the table has no explicit <tbody>.
	        for row in data_candidates:
	            cells = own_cells(row)
	            row_data = [cell.get_text(strip=True) for cell in cells]
	            has_th = any(cell.name == "th" for cell in cells)

	            # If no headers yet and this looks like a header row
	            if include_headers and not headers and has_th:
	                headers = row_data
	            elif row_data:  # Skip empty rows
	                rows.append(row_data)

	        if rows or headers:
	            tables.append({"headers": headers, "rows": rows})

	    return tables