# ecom-qa-bert-v2/src/sites/_base.py
# rnyx's picture
# v3: Multi-site support (Amazon, Flipkart, Myntra, Meesho, TataCliq, Nykaa)
# ba70fd1
"""
Shared helpers + Parser registry used by every site module.
Each site module (sites/amazon.py, sites/myntra.py, etc.) exports a
`parse(soup) -> ProductData` function. The registry maps URL patterns
to those parsers.
"""
import re
from dataclasses import dataclass, field, asdict
from typing import Callable, Optional
from bs4 import BeautifulSoup
# ── Shared data structure every parser must return ──────────────────
@dataclass
class ProductData:
    """Common record that every site parser must return.

    Every text field defaults to the empty string, so a parser only needs
    to set the fields it was able to extract.

    Attributes:
        title: Product title.
        description: Product description text.
        features: Bullet points / key features.
        specs: Key:value pairs joined with " | ".
        materials: Composition / fabric / ingredients.
        sizes: Available sizes (clothing).
        return_policy: Return/exchange policy text.
        rating_text: Rating summary, e.g. "4.3 out of 5 stars · 1,234 ratings".
        reviews: Review entries shaped like {title, text, rating}.
        source: Origin label for the record; defaults to "generic"
            (presumably overridden by each site parser).
    """

    title: str = ""
    description: str = ""
    features: str = ""
    specs: str = ""
    materials: str = ""
    sizes: str = ""
    return_policy: str = ""
    rating_text: str = ""
    # default_factory gives each instance its own list instead of a shared one.
    reviews: list = field(default_factory=list)
    source: str = "generic"

    def to_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name."""
        return asdict(self)
# ── Text helpers ────────────────────────────────────────────────────
MAX_FIELD_CHARS = 8000


def clean(text: str, limit: int = MAX_FIELD_CHARS) -> str:
    """Collapse whitespace in *text* and cap its length.

    Args:
        text: Raw text; any falsy value (``None``, ``""``) yields ``""``.
            Non-string values are converted with ``str()``.
        limit: Maximum number of characters to keep.

    Returns:
        The text with every whitespace run replaced by a single space,
        stripped at both ends, and truncated to at most *limit* characters.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", str(text)).strip()
    # The cut can land immediately after a word, leaving a dangling space;
    # strip again so the result never ends in whitespace.
    return collapsed[:limit].rstrip()
def first_text(soup: BeautifulSoup, *selectors: str) -> str:
    """Return the text of the first selector that yields non-empty text.

    Selectors are tried in the order given. A selector that raises while
    being evaluated, matches nothing, or matches an element with no text
    is skipped. Returns '' when none of them produce text.
    """
    for css in selectors:
        try:
            node = soup.select_one(css)
        except Exception:
            # Best-effort lookup: a selector that cannot be evaluated is
            # treated the same as one that matches nothing.
            continue
        if not node:
            continue
        content = node.get_text(" ", strip=True)
        if content:
            return content
    return ""
def all_text_join(soup: BeautifulSoup, selector: str, sep: str = " \u00b7 ",
                  limit_items: int = 30) -> str:
    """Join the text of all elements matching *selector*.

    Args:
        soup: Parsed document (or element) to search.
        selector: CSS selector for the elements to collect.
        sep: Separator placed between items; defaults to a middle dot
            (U+00B7) surrounded by spaces.
        limit_items: Only the first *limit_items* matches are considered;
            empty matches among them are dropped, not replaced.

    Returns:
        The non-empty element texts joined by *sep*, or '' when the
        selector cannot be evaluated or nothing with text matches.
    """
    try:
        matches = soup.select(selector)[:limit_items]
    except Exception:
        # Best-effort: a selector that cannot be evaluated yields no text.
        return ""
    # Extract each element's text once, then drop the empty ones.
    texts = (tag.get_text(" ", strip=True) for tag in matches)
    return sep.join(text for text in texts if text)
def extract_kv_table(soup: BeautifulSoup, selector: str) -> str:
    """Flatten key:value spec tables into one " | "-separated string.

    For every ``tr`` inside each table matched by *selector*, the first
    ``th``/``td`` cell is the key and the last one is the value. Rows with
    fewer than two cells, an empty key or value, or a key equal to the
    value are skipped. Repeated "key: value" entries are kept once, in
    first-seen order. Returns '' if the selector cannot be evaluated.
    """
    try:
        matched_tables = soup.select(selector)
    except Exception:
        return ""

    # Dict keys preserve insertion order and give de-duplication for free.
    entries: dict[str, None] = {}
    for tbl in matched_tables:
        for tr in tbl.select("tr"):
            cols = tr.find_all(["th", "td"])
            if len(cols) < 2:
                continue
            label = cols[0].get_text(" ", strip=True)
            value = cols[-1].get_text(" ", strip=True)
            if not label or not value or label == value:
                continue
            entries.setdefault(f"{label}: {value}")
    return " | ".join(entries)
def parse_rating_number(text: str) -> Optional[float]:
    """Return the first number found in *text* as a float.

    Example: '4.5 out of 5' -> 4.5. Returns None when *text* is empty or
    contains no ASCII digits.
    """
    if not text:
        return None
    found = re.search(r"([0-9]+\.?[0-9]*)", text)
    # Whatever the pattern matches (digits, optional dot, optional digits)
    # is always a valid float literal, so the conversion cannot fail.
    return float(found.group(1)) if found else None
# ── Registry ────────────────────────────────────────────────────────
# Signature of a site parser: takes the parsed page, returns a ProductData.
ParserFn = Callable[[BeautifulSoup], ProductData]
# Maps a lower-cased URL keyword to its parser. Filled by `register` and
# scanned in insertion order by `find_parser`.
_REGISTRY: dict[str, ParserFn] = {}
def register(*url_keywords: str):
    """Decorator factory that binds a parser to one or more URL keywords.

    Each keyword is lower-cased and stored in the module registry; a
    keyword that is already present is overwritten. The decorated function
    is returned unchanged.
    """
    def _attach(parser: ParserFn) -> ParserFn:
        for normalized in (keyword.lower() for keyword in url_keywords):
            _REGISTRY[normalized] = parser
        return parser

    return _attach
def find_parser(url: str) -> Optional[ParserFn]:
    """Return the parser whose keyword occurs in *url*, or None.

    Matching is a case-insensitive substring test against the whole URL.
    Keywords are checked in registration order, so when several match the
    one registered first wins. A falsy *url* is treated as ''.
    """
    haystack = url.lower() if url else ""
    return next(
        (parser for keyword, parser in _REGISTRY.items() if keyword in haystack),
        None,
    )
def get_registered_sites() -> list[str]:
    """Return every registered URL keyword, sorted alphabetically."""
    keywords = sorted(_REGISTRY)
    return keywords