# plg4-dev-server / backend / data_minning / yummy_medley_scraper.py
# Author: Jesse Johnson
# New commit for backend deployment: 2025-09-25_13-24-03 (c59d808)
import re
from typing import Iterable, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from .base_scrapper import BaseRecipeScraper, RecipeDoc
from backend.utils.sanitization import clean
class YummyMedleyScraper(BaseRecipeScraper):
    """Scraper for yummymedley.com targeting WPRM (WP Recipe Maker) blocks.

    Discovery strategy: read tag-archive URLs from the home-page tag cloud,
    paginate each tag archive collecting post links, then parse each post via
    JSON-LD first and the WPRM markup second.
    """

    # Only collect tags from the home page tag-cloud widget.
    TAG_CLOUD_SELECTORS = [
        "#tag_cloud-4 .tagcloud a[href*='/tag/']",
        "div.widget_tag_cloud .tagcloud a[href*='/tag/']",
    ]

    # How we find post links on a tag page (grid cards & headers).
    POST_LINK_SELECTORS = [
        "#main ul.sp-grid li article .post-header a[href]",  # header link
        "#main ul.sp-grid li article .post-img a[href]",     # image link
        "#main article .post-header a[href]",                # fallback
    ]

    # Matches archive-style /tag/<slug>/ paths.
    TAG_RE = re.compile(r"/tag/[^/]+/?$")

    # Hard cap on archive pagination to avoid runaway crawls.
    MAX_TAG_PAGES = 20

    def __init__(self, base_domain: str = "www.yummymedley.com"):
        super().__init__(base_domain)

    def _discover_tag_urls_from_home(self) -> list[str]:
        """Return sorted absolute tag-archive URLs found on the home page.

        Prefers the strict tag-cloud selectors; if the widget markup changed,
        falls back to any ``/tag/...`` anchor anywhere on the page.
        """
        soup = self.fetch_soup(self.base_url)
        anchors = []
        for sel in self.TAG_CLOUD_SELECTORS:
            anchors = soup.select(sel)
            if anchors:
                break
        if not anchors:
            # Final fallback: any /tag/... link on the home page.
            anchors = soup.find_all("a", href=self.TAG_RE)
        tags = {
            urljoin(self.base_url, a.get("href"))
            for a in anchors
            if a.get("href")
        }
        return sorted(tags)

    def _extract_post_links_from_tag_page(self, soup: BeautifulSoup) -> set[str]:
        """Collect absolute post URLs from a single tag-archive page soup."""
        links: set[str] = set()
        for sel in self.POST_LINK_SELECTORS:
            for a in soup.select(sel):
                href = a.get("href")
                if href:
                    links.add(urljoin(self.base_url, href))
            if links:
                break  # got some via this selector; skip weaker fallbacks
        return links

    def discover_urls(self) -> Iterable[str]:
        """Yield unique article URLs discovered via the home-page tag cloud.

        Each tag archive is paginated up to ``MAX_TAG_PAGES`` pages; pagination
        stops early when a page yields no posts or has no next/older link.
        """
        tags = self._discover_tag_urls_from_home()
        if not tags:
            # Safety: if tag cloud not found, fall back to the /recipes/ index.
            self.logger.warning("No tags discovered from home page tag cloud; falling back to /recipes/")
            tags = [urljoin(self.base_url, "/recipes/")]
        seen: set[str] = set()
        for tag_url in tags:
            page = 1
            while page <= self.MAX_TAG_PAGES:  # hard cap to avoid runaway pagination
                url = tag_url if page == 1 else f"{tag_url.rstrip('/')}/page/{page}/"
                try:
                    soup = self.fetch_soup(url)
                except Exception as e:
                    # Fetch/parse failure on an archive page: give up on this tag.
                    self.logger.warning(f"[tag] fetch failed {url}: {e}")
                    break
                post_links = self._extract_post_links_from_tag_page(soup)
                if not post_links:
                    # No posts found -> stop paginating this tag.
                    break
                for u in sorted(post_links):
                    if u not in seen and self._looks_like_article(u):
                        seen.add(u)
                        yield u
                # Pagination: look for a 'next'/'older' text link or rel="next".
                next_link = (
                    soup.find("a", string=re.compile(r"next|older", re.I))
                    or soup.find("a", rel="next")
                )
                if not next_link:
                    break
                page += 1

    def _looks_like_article(self, u: str) -> bool:
        """Heuristic: same-domain permalink that is not an archive or asset URL."""
        sp = urlparse(u)
        if not self.same_domain(u):
            return False
        # Skip archive-type paths (tag/category/author/pagination/feeds).
        if re.search(r"/(tag|category|author|page|feed)/", sp.path, re.I):
            return False
        # Skip obvious non-HTML assets.
        if sp.path.endswith((".xml", ".jpg", ".png", ".pdf", ".webp", ".zip")):
            return False
        # Permalinks on this site are 1-3 path segments deep.
        segs = [s for s in sp.path.strip("/").split("/") if s]
        return 1 <= len(segs) <= 3

    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Build a RecipeDoc from a post page.

        JSON-LD data is applied first; the WPRM block then fills any gaps.
        ``category`` is currently unused (the category is re-derived from the
        page's breadcrumb/tag links below).
        """
        doc = RecipeDoc.make(url)
        # JSON-LD first (many WPRM pages also embed it).
        j = self.extract_jsonld(soup)
        if j:
            for k, v in j.items():
                if not hasattr(doc, k):
                    continue
                if v in (None, "", [], {}):
                    continue  # skip empty values from JSON-LD
                if k in ("url", "source") and getattr(doc, k):
                    continue  # never overwrite an already-set url/source
                setattr(doc, k, v)

        # WPRM block.
        w = soup.find("div", class_="wprm-recipe-container")
        if w:
            if not doc.title:
                t = w.find(class_="wprm-recipe-name")
                doc.title = clean(t.get_text()) if t else doc.title
            # Image: prefer the in-card <img> (src or lazy data-src); otherwise
            # fall back to the meta/og image. The original called clean(None)
            # when the <img> had neither attribute and never consulted the
            # meta image in that case.
            if not doc.image_url:
                img = w.find("img")
                src = (img.get("src") or img.get("data-src")) if img else None
                doc.image_url = clean(src) if src else clean(self.get_meta_image(soup))
            # Rating: text may be non-numeric, so only ValueError is expected.
            r = w.find(class_="wprm-recipe-rating-average")
            if r:
                try:
                    doc.rating = float(r.get_text().strip())
                except ValueError:
                    pass
            rc = w.find(class_="wprm-recipe-rating-count")
            if rc:
                try:
                    doc.rating_count = int(rc.get_text().strip())
                except ValueError:
                    pass

            # Times.
            def pick(c):
                x = w.find(class_=c)
                return clean(x.get_text()) if x else None

            doc.prep_time = pick("wprm-recipe-prep_time") or doc.prep_time
            doc.cook_time = pick("wprm-recipe-cook_time") or doc.cook_time
            doc.total_time = pick("wprm-recipe-total_time") or doc.total_time
            # Servings: keep as int when purely numeric, else cleaned text.
            s = w.find(class_="wprm-recipe-servings")
            if s:
                txt = s.get_text().strip()
                doc.servings = int(txt) if txt.isdigit() else clean(txt)
            # Calories.
            cal = w.find(class_="wprm-recipe-calories")
            if cal:
                try:
                    doc.calories = float(cal.get_text().strip())
                except ValueError:
                    pass
            # Course / cuisine.
            cse = w.find(class_="wprm-recipe-course")
            doc.course = clean(cse.get_text()) if cse else doc.course
            cui = w.find(class_="wprm-recipe-cuisine")
            doc.cuisine = clean(cui.get_text()) if cui else doc.cuisine
            # Ingredients: join amount + unit + name, with notes parenthesized.
            ings = []
            ic = w.find(class_="wprm-recipe-ingredients-container")
            if ic:
                for ing in ic.find_all(class_="wprm-recipe-ingredient"):
                    parts = []
                    for cls in (
                        "wprm-recipe-ingredient-amount",
                        "wprm-recipe-ingredient-unit",
                        "wprm-recipe-ingredient-name",
                        "wprm-recipe-ingredient-notes",
                    ):
                        el = ing.find(class_=cls)
                        if el:
                            t = el.get_text().strip()
                            if t:
                                parts.append(t if "notes" not in cls else f"({t})")
                    txt = clean(" ".join(parts))
                    if txt:
                        ings.append(txt)
            if ings:
                doc.ingredients = ings
            # Instructions: prefer the dedicated text node, else the whole item.
            steps = []
            ic2 = w.find(class_="wprm-recipe-instructions-container")
            if ic2:
                for ins in ic2.find_all(class_="wprm-recipe-instruction"):
                    t = ins.find(class_="wprm-recipe-instruction-text") or ins
                    txt = clean(t.get_text().strip())
                    if txt:
                        steps.append(txt)
            if steps:
                doc.instructions = steps

        # Store as text (team requirement).
        doc.ingredients = self._to_ingredients_text(doc.ingredients)
        doc.instructions = self._to_instructions_text(doc.instructions)

        # Fallback title & image.
        if not doc.title:
            h1 = soup.find("h1") or soup.find("title")
            doc.title = clean(h1.get_text()) if h1 else None
        if not doc.image_url:
            doc.image_url = clean(self.get_meta_image(soup))
        # Optional category (e.g. "Afro-tropical Recipes") — grab from
        # breadcrumbs or page tags if available.
        cat = soup.find("a", href=re.compile(r"/category/|/tag/"))
        doc.category = clean(cat.get_text()) if cat else doc.category
        return doc