Spaces:

rnyx
/

ecom-qa-bert-v2

Running

App Files Files Community

ecom-qa-bert-v2 / src /sites /_base.py

rnyx

v3: Multi-site support (Amazon, Flipkart, Myntra, Meesho, TataCliq, Nykaa)

ba70fd1 about 2 months ago

raw

history blame contribute delete

4.03 kB

	"""
	Shared helpers + Parser registry used by every site module.

	Each site module (sites/amazon.py, sites/myntra.py, etc.) exports a
	`parse(soup) -> ProductData` function. The registry maps URL patterns
	to those parsers.
	"""
	import re
	from dataclasses import dataclass, field, asdict
	from typing import Callable, Optional

	from bs4 import BeautifulSoup


	# ── Shared data structure every parser must return ──────────────────

	@dataclass
	class ProductData:
	    """Common product record returned by every site parser.

	    Every text field defaults to an empty string, ``reviews`` defaults to
	    a fresh empty list per instance, and ``source`` defaults to
	    ``"generic"``.
	    """

	    # Product title.
	    title: str = ""
	    # Free-form product description.
	    description: str = ""
	    # Bullet points / key features.
	    features: str = ""
	    # key:value spec pairs flattened into one pipe-separated string.
	    specs: str = ""
	    # Composition / fabric / ingredients.
	    materials: str = ""
	    # Available sizes (clothing).
	    sizes: str = ""
	    # Return / exchange policy text.
	    return_policy: str = ""
	    # Rating summary, e.g. "4.3 out of 5 stars · 1,234 ratings".
	    rating_text: str = ""
	    # Review entries shaped like {title, text, rating}.
	    reviews: list = field(default_factory=list)
	    # Label of the parser that produced this record.
	    source: str = "generic"

	    def to_dict(self) -> dict:
	        """Return the record as a plain dict built by ``dataclasses.asdict``."""
	        return asdict(self)


	# ── Text helpers ────────────────────────────────────────────────────

	# Default cap on the number of characters `clean` keeps per field.
	MAX_FIELD_CHARS = 8000


	def clean(text: str, limit: int = MAX_FIELD_CHARS) -> str:
	    """Collapse whitespace in *text* and cap its length.

	    Args:
	        text: Value to normalise. Falsy values (``None``, ``""``) yield
	            ``""``; anything else is converted with ``str()`` first.
	        limit: Maximum number of characters to keep.

	    Returns:
	        The text with every whitespace run replaced by a single space,
	        with no leading or trailing whitespace, and at most *limit*
	        characters long.
	    """
	    if not text:
	        return ""
	    collapsed = re.sub(r"\s+", " ", str(text)).strip()
	    # Cutting at `limit` can land right after a word and leave the
	    # separating space as the final character, so trim once more.
	    return collapsed[:limit].rstrip()


	def first_text(soup: BeautifulSoup, *selectors: str) -> str:
	    """Return the text of the first selector that yields non-empty text.

	    Selectors are tried in the order given. A selector that raises, matches
	    nothing, or matches an element with no text is skipped. Returns ``""``
	    when no selector produces text.
	    """
	    for css in selectors:
	        try:
	            node = soup.select_one(css)
	        except Exception:
	            # Best effort: a selector that fails must not stop the others.
	            node = None
	        if not node:
	            continue
	        text = node.get_text(" ", strip=True)
	        if text:
	            return text
	    return ""


	def all_text_join(soup: BeautifulSoup, selector: str, sep: str = " · ",
	                  limit_items: int = 30) -> str:
	    """Join the text of all elements matching *selector*.

	    Args:
	        soup: Parsed document (or element) to search.
	        selector: CSS selector for the elements to collect.
	        sep: String placed between the collected texts.
	        limit_items: Maximum number of matched elements to look at. The cap
	            is applied before empty elements are dropped.

	    Returns:
	        The non-empty element texts joined with *sep*, or ``""`` when the
	        selector raises or nothing matches.
	    """
	    try:
	        tags = soup.select(selector)[:limit_items]
	    except Exception:
	        return ""
	    # Extract each element's text once and reuse it for both the emptiness
	    # check and the output, rather than walking every element twice.
	    texts = (tag.get_text(" ", strip=True) for tag in tags)
	    return sep.join(text for text in texts if text)


	def extract_kv_table(soup: BeautifulSoup, selector: str) -> str:
	    """Flatten key/value spec tables into a single string.

	    For every ``tr`` inside each table matched by *selector*, the first
	    cell is taken as the key and the last cell as the value. Rows with
	    fewer than two cells, an empty key or value, or identical key and
	    value are skipped, and repeated entries are kept only once.

	    Args:
	        soup: Parsed document (or element) to search.
	        selector: CSS selector matching the table elements.

	    Returns:
	        ``"key: value"`` entries in first-seen order joined with ``" | "``,
	        or ``""`` when the selector raises or no usable row is found.
	    """
	    try:
	        tables = soup.select(selector)
	    except Exception:
	        return ""
	    # Dict keys serve as an insertion-ordered set, so duplicates are
	    # rejected in O(1) instead of rescanning a list for every row.
	    entries: dict[str, None] = {}
	    for table in tables:
	        for row in table.select("tr"):
	            cells = row.find_all(["th", "td"])
	            if len(cells) < 2:
	                continue
	            key = cells[0].get_text(" ", strip=True)
	            value = cells[-1].get_text(" ", strip=True)
	            if key and value and key != value:
	                entries.setdefault(f"{key}: {value}", None)
	    # Plain pipe separator: "\|" is an invalid escape sequence that would
	    # leak a literal backslash into the output.
	    return " | ".join(entries)


	def parse_rating_number(text: str) -> Optional[float]:
	    """Return the first number found in *text* as a float.

	    Example: ``'4.5 out of 5'`` gives ``4.5``. Returns ``None`` when *text*
	    is empty or contains no digits.
	    """
	    match = re.search(r"([0-9]+\.?[0-9]*)", text) if text else None
	    if match is None:
	        return None
	    try:
	        number = float(match.group(1))
	    except ValueError:
	        number = None
	    return number


	# ── Registry ────────────────────────────────────────────────────────

	# Signature every site parser implements: parsed page in, ProductData out.
	ParserFn = Callable[[BeautifulSoup], ProductData]
	# Lower-cased URL keyword -> parser function; populated by `register`.
	_REGISTRY: dict[str, ParserFn] = {}


	def register(*url_keywords: str):
	    """Build a decorator that registers a parser under URL keywords.

	    Each keyword is lower-cased and mapped to the decorated function in the
	    module registry; a keyword that is already present is overwritten. The
	    decorated function is returned unchanged.
	    """
	    def deco(fn: ParserFn) -> ParserFn:
	        _REGISTRY.update((keyword.lower(), fn) for keyword in url_keywords)
	        return fn

	    return deco


	def find_parser(url: str) -> Optional[ParserFn]:
	    """Look up the parser registered for *url*.

	    The URL is lower-cased (``None`` or empty is treated as ``""``) and the
	    registry is scanned in insertion order. The first parser whose keyword
	    occurs anywhere in the URL is returned, or ``None`` if none matches.
	    """
	    haystack = url.lower() if url else ""
	    matches = (parser for keyword, parser in _REGISTRY.items()
	               if keyword in haystack)
	    return next(matches, None)


	def get_registered_sites() -> list[str]:
	    """Return every registered URL keyword in sorted order."""
	    keywords = list(_REGISTRY)
	    keywords.sort()
	    return keywords