"""Scrape websites for the most likely author / headshot image.

Pipeline:
    1. Validate the URL (scheme, no local/private hosts) and check robots.txt.
    2. Collect candidate <img> URLs from author/about-ish containers.
    3. Score candidates by heuristics, then refine scores with real pixel
       dimensions fetched via Pillow.
    4. Download the best candidate.
"""

import io
import ipaddress
import os
import re
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image

# Browser-like User-Agent so sites that block default python-requests UAs
# still serve normal HTML.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}


# ------------------ Basic utils ------------------ #


def validate_url(url: str) -> str:
    """Validate that *url* is an http(s) URL not pointing at a local/private host.

    Returns the URL unchanged on success; raises ValueError otherwise.
    """
    if not url or not isinstance(url, str):
        raise ValueError("URL must be a non-empty string.")
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("URL must start with http:// or https://")
    blocked_hosts = {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
    if parsed.hostname in blocked_hosts:
        raise ValueError("Local addresses are not allowed.")
    # If the host is a literal IP address, reject private / loopback /
    # link-local / reserved ranges.  (A plain string-prefix check such as
    # "172.16." would miss most of 172.16.0.0/12 and would wrongly block
    # ordinary DNS names like "10.example.com".)
    if parsed.hostname:
        try:
            addr = ipaddress.ip_address(parsed.hostname)
        except ValueError:
            # Not an IP literal -- a regular hostname is acceptable here.
            pass
        else:
            if (
                addr.is_private
                or addr.is_loopback
                or addr.is_link_local
                or addr.is_reserved
            ):
                raise ValueError("Private network addresses are not allowed.")
    return url


def allowed_by_robots(url: str) -> bool:
    """Best-effort robots.txt check for *url*.

    Fails open: if robots.txt cannot be fetched or parsed, returns True.
    Only rules for ``User-agent: *`` or ``scraper-bot`` are honoured.
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        r = requests.get(robots_url, headers=HEADERS, timeout=5)
        if not r.ok:
            return True
        content = r.text.lower()
        lines = content.splitlines()
        applies = False
        disallows: List[str] = []
        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("user-agent:"):
                ua = line.split(":", 1)[1].strip()
                # Track whether the following Disallow rules apply to us.
                applies = ua in ("*", "scraper-bot")
            elif applies and line.startswith("disallow:"):
                rule = line.split(":", 1)[1].strip()
                disallows.append(rule)
        path = parsed.path or "/"
        for rule in disallows:
            # An empty Disallow rule means "allow everything".
            if rule and path.startswith(rule):
                return False
        return True
    except Exception:
        # Network/parse problems must not block scraping -- fail open.
        return True


def fetch_html(url: str) -> str:
    """Fetch *url* and return its body text; raises on HTTP errors."""
    r = requests.get(url, headers=HEADERS, timeout=10)
    r.raise_for_status()
    return r.text


def normalize_url(src: str, base: str) -> Optional[str]:
    """Resolve *src* against *base*; return None if resolution fails."""
    try:
        return urljoin(base, src)
    except Exception:
        return None


def is_image_candidate(url: str) -> bool:
    """Heuristically decide whether *url* could be a raster image.

    Rejects inline/data URLs and SVGs.  Accepts URLs whose text contains a
    known raster extension anywhere (CDN URLs often carry the extension in
    the query string), or any plain http(s) URL as a fallback.
    """
    if not url:
        return False
    if url.startswith(("data:", "blob:", "about:")):
        return False
    lower = url.lower()
    if lower.endswith(".svg"):
        return False
    exts = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif"]
    if any(ext in lower for ext in exts):
        return True
    return lower.startswith("http://") or lower.startswith("https://")


# ------------------ Author/headshot-focused HTML heuristics ------------------ #

# CSS selectors for containers that commonly hold an author bio / headshot.
AUTHOR_SELECTORS: List[str] = [
    ".author",
    ".author-info",
    ".author-bio",
    ".author-box",
    ".about",
    ".about-author",
    ".byline",
    ".post-author",
    ".entry-author",
    ".site-author",
    ".profile",
    ".profile-card",
    ".user-info",
    "[rel='author']",
    "[itemprop='author']",
    "[itemtype*='Person']",
]


def _first_img_src(img) -> Optional[str]:
    """Return the first usable src-like attribute of an <img> tag.

    Checks plain ``src`` first, then common lazy-loading attributes.
    """
    return (
        img.get("src")
        or img.get("data-src")
        or img.get("data-lazy-src")
        or img.get("data-original")
    )


def extract_author_candidates(
    soup: BeautifulSoup, base_url: str
) -> List[Dict[str, Any]]:
    """
    Find images likely to be author / headshot images.
    Returns a list of dicts:
    [{url, score, source, width_attr, height_attr, width_px, height_px}, ...]
    """
    candidates: List[Dict[str, Any]] = []
    seen = set()

    def add_image(
        src: Optional[str], source: str, score_boost: float = 0.0, tag=None
    ) -> None:
        """Normalize, deduplicate, pre-score and record one candidate URL."""
        if not src:
            return
        abs_url = normalize_url(src, base_url)
        if not abs_url or abs_url in seen:
            return
        if not is_image_candidate(abs_url):
            return
        score = 0.0 + score_boost
        width_attr = None
        height_attr = None
        if tag is not None:
            width_attr = tag.get("width")
            height_attr = tag.get("height")
            try:
                w = int(width_attr or 0)
                h = int(height_attr or 0)
                if w > 0 and h > 0:
                    # Headshots tend to be roughly square-to-portrait.
                    aspect = float(h) / float(max(w, 1))
                    if 0.8 <= aspect <= 2.0:
                        score += 2.0
                    # Very wide images are likely banners, not headshots.
                    if w > 800 and w > h * 2:
                        score -= 2.0
            except ValueError:
                # Non-numeric width/height attributes (e.g. "100%") -- ignore.
                pass
        seen.add(abs_url)
        candidates.append(
            {
                "url": abs_url,
                "score": score,
                "source": source,
                "width_attr": width_attr,
                "height_attr": height_attr,
                "width_px": None,
                "height_px": None,
            }
        )

    # 1) Inside author/about-ish containers
    for selector in AUTHOR_SELECTORS:
        for container in soup.select(selector):
            for img in container.find_all("img"):
                add_image(
                    _first_img_src(img),
                    "author:" + selector,
                    score_boost=5.0,
                    tag=img,
                )
            # Some themes put the headshot in an inline background-image.
            style = container.get("style") or ""
            match = re.search(
                r"background-image\s*:\s*url\(([^)]+)\)", style, re.I
            )
            if match:
                raw = match.group(1).strip("'\" ")
                add_image(raw, "author-bg:" + selector, score_boost=4.0, tag=None)

    # 2) Fallback: near "about"/"author" text
    text_blocks = soup.find_all(
        lambda tag: tag.name in ("p", "div", "section")
        and tag.get_text(strip=True)
    )
    for block in text_blocks:
        txt = block.get_text(" ", strip=True).lower()
        if any(
            phrase in txt
            for phrase in [
                "about me",
                "about the author",
                "about the creator",
                "meet",
                "author",
            ]
        ):
            for img in block.find_all("img"):
                add_image(
                    _first_img_src(img), "near-about-text", score_boost=3.0, tag=img
                )

    return candidates


# ------------------ Size-based filtering via Pillow ------------------ #


def measure_image_dimensions(
    url: str, timeout: int = 10
) -> Tuple[Optional[int], Optional[int]]:
    """
    Fetch the image and read dimensions with Pillow.
    Returns (width, height) or (None, None) on failure.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        data = io.BytesIO(resp.content)
        with Image.open(data) as img:
            return img.width, img.height
    except Exception:
        # Unreachable / non-image / corrupt content: no dimensions.
        return None, None


def refine_candidates_with_dimensions(
    candidates: List[Dict[str, Any]], max_to_check: int = 10
) -> List[Dict[str, Any]]:
    """
    For up to `max_to_check` candidates, compute real width/height.
    Adjust scores based on real dimensions.
    """
    candidates_sorted = sorted(candidates, key=lambda c: c["score"], reverse=True)
    # Only fetch the highest-scoring candidates -- each check is a network call.
    for c in candidates_sorted[:max_to_check]:
        w, h = measure_image_dimensions(c["url"])
        c["width_px"] = w
        c["height_px"] = h
        if not w or not h:
            continue
        if w < 80 or h < 80:
            # Too small to be a usable headshot (likely an icon).
            c["score"] -= 4.0
            continue
        aspect = float(h) / float(max(w, 1))
        if 0.7 <= aspect <= 1.8:
            c["score"] += 4.0
        if w > 1000 and w > h * 2.5:
            # Wide banner / hero image.
            c["score"] -= 5.0
        if w > 2500 or h > 2500:
            # Huge images are usually page content, not headshots.
            c["score"] -= 3.0
    return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)


def pick_best_author_image(
    candidates: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Return the highest-scoring candidate, or None if there are none."""
    if not candidates:
        return None
    return max(candidates, key=lambda c: c["score"])


# ------------------ Main entry: scrape_author_image ------------------ #


def scrape_author_image(url: str) -> Dict[str, Any]:
    """
    Given a URL, return the most likely author/headshot image.

    Returns a dict with keys ``title``, ``author_image_url`` (may be None),
    and ``debug_candidates``.  Raises ValueError for bad URLs and
    PermissionError when robots.txt forbids access.
    """
    url = validate_url(url)
    if not allowed_by_robots(url):
        raise PermissionError("Blocked by robots.txt")
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    # Best-effort page title: og/twitter meta, then <h1>, then <title>.
    title_tag = (
        soup.select_one("meta[property='og:title']")
        or soup.select_one("meta[name='twitter:title']")
        or soup.find("h1")
        or soup.title
    )
    if title_tag:
        if hasattr(title_tag, "get"):
            title = title_tag.get("content", "") or title_tag.get_text("", strip=True)
        else:
            title = title_tag.get_text("", strip=True)
    else:
        title = urlparse(url).hostname

    candidates = extract_author_candidates(soup, url)
    candidates_refined = refine_candidates_with_dimensions(candidates, max_to_check=10)
    best = pick_best_author_image(candidates_refined)
    return {
        "title": title,
        "author_image_url": best["url"] if best else None,
        "debug_candidates": candidates_refined,
    }


# ------------------ Download helpers ------------------ #


def download_image(
    image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
) -> str:
    """
    Download a single image URL to `out_dir`, return local file path.
    If `filename` is provided, it is used instead of the remote filename.
    """
    os.makedirs(out_dir, exist_ok=True)
    parsed = urlparse(image_url)
    fallback = os.path.basename(parsed.path) or "image"
    # basename() strips any directory components, so a crafted filename
    # (e.g. "../../etc/x") cannot escape out_dir.
    base_name = os.path.basename(filename) if filename else fallback
    # Ensure file extension
    if not any(
        base_name.lower().endswith(ext)
        for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]
    ):
        base_name += ".jpg"
    file_path = os.path.join(out_dir, base_name)
    resp = requests.get(image_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(resp.content)
    return file_path


def download_author_image(
    page_url: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    High-level helper:
    1. Scrape the page for the best author/headshot image.
    2. Download that image to `out_dir`.
    """
    result = scrape_author_image(page_url)
    author_url = result["author_image_url"]
    if not author_url:
        return {
            "title": result["title"],
            "author_image_url": None,
            "local_path": None,
        }
    local_path = download_image(author_url, out_dir=out_dir)
    return {
        "title": result["title"],
        "author_image_url": author_url,
        "local_path": local_path,
    }


# ---------- Site normalization & about-page discovery ---------- #


def normalize_site_input(site: str) -> str:
    """
    Allow user to input:
    - damndelicious.net
    - https://damndelicious.net
    - http://www.damndelicious.net/
    and normalize to a base URL like:
    - https://damndelicious.net
    """
    site = site.strip()
    if not site:
        raise ValueError("Site must be a non-empty string.")
    if not site.startswith(("http://", "https://")):
        site = "https://" + site
    parsed = urlparse(site)
    netloc = parsed.netloc
    if netloc.startswith("www."):
        netloc = netloc[4:]
    base_url = f"{parsed.scheme}://{netloc}"
    return base_url


# Common relative paths where sites host their about/bio pages.
ABOUT_PATH_GUESSES: List[str] = [
    "/about",
    "/about/",
    "/about-me",
    "/about-me/",
    "/about-us",
    "/about-us/",
    "/about-the-author",
    "/about-the-author/",
    "/about-the-creator",
    "/about-the-creator/",
    "/meet-the-team",
    "/meet-the-team/",
    "/meet-the-author",
    "/meet-the-author/",
]

# Keywords that flag a homepage link as pointing at an about-like page.
ABOUT_TEXT_KEYWORDS: List[str] = [
    "about",
    "about me",
    "about us",
    "about the author",
    "about the creator",
    "our story",
    "my story",
    "meet",
    "meet the author",
    "meet the team",
    "bio",
]


def url_is_html(url: str) -> bool:
    """
    Quick check that a URL returns HTML (not 404, not an image, etc.).
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=8)
        if not r.ok:
            return False
        ctype = r.headers.get("Content-Type", "").lower()
        return "text/html" in ctype or "application/xhtml+xml" in ctype
    except Exception:
        return False


def find_about_like_urls(base_url: str, max_links: int = 20) -> List[str]:
    """
    Strategy:
    1. Try common about paths relative to base_url.
    2. Fetch the homepage and look for internal links whose text or href
       contains about-ish keywords.
    """
    candidates: List[str] = []

    # 1) Common about paths
    for path in ABOUT_PATH_GUESSES:
        candidates.append(urljoin(base_url, path))

    # 2) Extract from homepage
    try:
        html = fetch_html(base_url)
    except Exception:
        html = ""
    if html:
        soup = BeautifulSoup(html, "lxml")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = a.get_text(" ", strip=True).lower()
            href_lower = href.lower()
            if any(kw in text for kw in ABOUT_TEXT_KEYWORDS) or any(
                kw in href_lower for kw in ["about", "our-story", "my-story", "meet"]
            ):
                abs_url = normalize_url(href, base_url)
                if not abs_url:
                    continue
                candidates.append(abs_url)

    # Deduplicate while preserving order.
    unique = list(dict.fromkeys(candidates))
    return unique[:max_links]


def pick_best_about_url(site_input: str) -> Optional[str]:
    """
    Given a site name or URL, try to find the 'best' about page URL.

    Falls back to the site's base URL when no about-like page responds
    with HTML.  Raises PermissionError when robots.txt blocks the site.
    """
    base_url = normalize_site_input(site_input)
    base_url = validate_url(base_url)
    if not allowed_by_robots(base_url):
        raise PermissionError("Blocked by robots.txt for base site.")
    candidates = find_about_like_urls(base_url)
    for cand in candidates:
        if not url_is_html(cand):
            continue
        # First candidate that actually serves HTML wins.
        return cand
    return base_url


def clean_site_name(site: str) -> str:
    """
    Convert a site or URL into a safe filename component.
    Examples:
    https://www.damndelicious.net -> damndelicious_net
    foodblog.com -> foodblog_com
    """
    site = site.strip().lower()
    if site.startswith("http://"):
        site = site[7:]
    elif site.startswith("https://"):
        site = site[8:]
    site = site.split("/")[0]
    return site.replace(".", "_").replace("-", "_")


def download_author_image_for_site(
    site_input: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    1. Convert site input into a normalized base URL.
    2. Locate the site's best 'About' page.
    3. Extract the author image.
    4. Download it using a filename that includes the site name.
    """
    base_url = normalize_site_input(site_input)
    about_url = pick_best_about_url(site_input)
    if not about_url:
        return {
            "site_base_url": base_url,
            "about_url": None,
            "title": None,
            "author_image_url": None,
            "local_path": None,
        }
    info = scrape_author_image(about_url)
    author_url = info["author_image_url"]
    if not author_url:
        return {
            "site_base_url": base_url,
            "about_url": about_url,
            "title": info["title"],
            "author_image_url": None,
            "local_path": None,
        }
    safe_name = clean_site_name(base_url)
    filename = safe_name + "_author"
    local_path = download_image(author_url, out_dir=out_dir, filename=filename)
    return {
        "site_base_url": base_url,
        "about_url": about_url,
        "title": info["title"],
        "author_image_url": author_url,
        "local_path": local_path,
    }


# ------------------ Optional CLI test ------------------ #

if __name__ == "__main__":
    site_or_url = input("Enter site (e.g. 'damndelicious.net' or full URL): ").strip()
    result = download_author_image_for_site(site_or_url, out_dir="author_images")
    print("\nBase site:", result["site_base_url"])
    print("About URL:", result["about_url"])
    print("Page title:", result["title"])
    print("Headshot URL:", result["author_image_url"])
    print("Saved to:", result["local_path"])