Spaces:
Sleeping
Sleeping
import io
import ipaddress
import os
import re
from typing import Any, Dict, List, Optional, Tuple
from urllib import robotparser
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
# Default request headers: a desktop-Chrome style User-Agent that is passed
# to every requests.get() call in this module.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0 Safari/537.36",
        )
    )
}
| # ------------------ Basic utils ------------------ # | |
def validate_url(url: str) -> str:
    """
    Check that `url` is an http(s) URL that does not point at a local or
    private address, and return it unchanged.

    IP-literal hosts are classified with the `ipaddress` module, so the whole
    loopback range, all RFC 1918 ranges (including 172.16.0.0/12), link-local
    and other non-public addresses are rejected, while ordinary host names
    that merely start with digits (e.g. "10.example.com") are accepted.

    NOTE: host names are not resolved here, so a public-looking name whose
    DNS record points at an internal address is not detected.

    Raises:
        ValueError: if the URL is empty, not a string, not http(s), has no
            host, or targets a local/private address.
    """
    if not url or not isinstance(url, str):
        raise ValueError("URL must be a non-empty string.")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("URL must start with http:// or https://")

    # urlparse lowercases the hostname; drop a trailing root dot ("localhost.").
    host = (parsed.hostname or "").rstrip(".")
    if not host:
        raise ValueError("URL must include a host name.")
    if host == "localhost" or host.endswith(".localhost"):
        raise ValueError("Local addresses are not allowed.")

    try:
        address = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal: a regular DNS name, nothing more to check here.
        return url

    if address.is_loopback or address.is_unspecified:
        raise ValueError("Local addresses are not allowed.")
    if (
        address.is_private
        or address.is_link_local
        or address.is_reserved
        or address.is_multicast
    ):
        raise ValueError("Private network addresses are not allowed.")
    return url
def allowed_by_robots(url: str) -> bool:
    """
    Return True when the site's robots.txt permits fetching `url`.

    The robots.txt file is downloaded from the root of the URL's host and
    evaluated with the standard-library `urllib.robotparser` for the
    user-agent token "scraper-bot" (groups for "*" apply as the fallback).
    Rule paths are compared without lowercasing them, and `Allow` lines are
    honoured by the parser.

    The check is deliberately fail-open: if robots.txt cannot be fetched
    (network error or a non-2xx/3xx response) the URL is treated as allowed.
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    try:
        response = requests.get(robots_url, headers=HEADERS, timeout=5)
    except requests.RequestException:
        # Best effort: an unreachable robots.txt does not block scraping.
        return True
    if not response.ok:
        return True

    parser = robotparser.RobotFileParser()
    parser.parse(response.text.splitlines())
    return parser.can_fetch("scraper-bot", url)
def fetch_html(url: str) -> str:
    """
    GET `url` with the module's default headers and a 10 second timeout and
    return the decoded response body.

    `raise_for_status()` is called on the response, so HTTP error statuses
    surface as exceptions instead of being returned as text.
    """
    response = requests.get(url, timeout=10, headers=HEADERS)
    response.raise_for_status()
    body_text = response.text
    return body_text
def normalize_url(src: str, base: str) -> Optional[str]:
    """
    Resolve `src` against `base` with `urljoin`.

    Returns the joined URL, or None if `urljoin` raises for the given input.
    """
    try:
        absolute = urljoin(base, src)
    except Exception:
        absolute = None
    return absolute
def is_image_candidate(url: str) -> bool:
    """
    Decide whether `url` is worth considering as a raster image.

    Rejected: empty values, `data:` / `blob:` / `about:` URLs (matched
    case-insensitively) and SVG files. The SVG test looks only at the path,
    so a query string or fragment ("logo.svg?v=2") does not bypass it.

    Accepted: anything containing a known raster extension, and any other
    http(s) URL (images are often served without a file extension).
    """
    if not url:
        return False

    lower = url.lower()
    if lower.startswith(("data:", "blob:", "about:")):
        return False

    # Drop fragment and query before checking the extension of the path.
    path_part = lower.split("#", 1)[0].split("?", 1)[0]
    if path_part.endswith((".svg", ".svgz")):
        return False

    exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif")
    if any(ext in lower for ext in exts):
        return True
    return lower.startswith(("http://", "https://"))
| # ------------------ Author/headshot-focused HTML heuristics ------------------ # | |
# CSS selectors for containers that are searched for author/headshot images:
# class-based selectors first, then attribute-based ones.
AUTHOR_SELECTORS: List[str] = [
    f".{class_name}"
    for class_name in (
        "author",
        "author-info",
        "author-bio",
        "author-box",
        "about",
        "about-author",
        "byline",
        "post-author",
        "entry-author",
        "site-author",
        "profile",
        "profile-card",
        "user-info",
    )
] + [
    "[rel='author']",
    "[itemprop='author']",
    "[itemtype*='Person']",
]
def extract_author_candidates(
    soup: BeautifulSoup, base_url: str
) -> List[Dict[str, Any]]:
    """
    Collect images that are likely to be an author photo / headshot.

    Two passes are made over the document:
      1. <img> tags (and inline `background-image` styles) inside containers
         matched by AUTHOR_SELECTORS.
      2. <img> tags inside <p>/<div>/<section> blocks whose text mentions
         an about/author-like phrase.

    Each distinct absolute URL is recorded once, as a dict with the keys
    url, score, source, width_attr, height_attr, width_px, height_px
    (the two *_px values are left as None here).
    """
    found: List[Dict[str, Any]] = []
    seen_urls = set()
    src_attrs = ("src", "data-src", "data-lazy-src", "data-original")
    about_phrases = (
        "about me",
        "about the author",
        "about the creator",
        "meet",
        "author",
    )
    background_re = re.compile(r"background-image\s*:\s*url\(([^)]+)\)", re.I)

    def first_src(img) -> Optional[str]:
        # First non-empty source attribute, covering common lazy-load names.
        for attr in src_attrs:
            value = img.get(attr)
            if value:
                return value
        return None

    def score_with_attrs(score: float, width_attr, height_attr) -> float:
        # Adjust the score from the tag's width/height attributes; values
        # that are not plain integers leave the score untouched.
        try:
            width = int(width_attr or 0)
            height = int(height_attr or 0)
        except ValueError:
            return score
        if width <= 0 or height <= 0:
            return score
        ratio = float(height) / float(max(width, 1))
        if 0.8 <= ratio <= 2.0:
            score += 2.0
        if width > 800 and width > height * 2:
            score -= 2.0
        return score

    def register(
        src: Optional[str], source: str, score_boost: float = 0.0, tag=None
    ) -> None:
        if not src:
            return
        absolute = normalize_url(src, base_url)
        if (
            not absolute
            or absolute in seen_urls
            or not is_image_candidate(absolute)
        ):
            return

        score = 0.0 + score_boost
        width_attr = height_attr = None
        if tag is not None:
            width_attr = tag.get("width")
            height_attr = tag.get("height")
            score = score_with_attrs(score, width_attr, height_attr)

        seen_urls.add(absolute)
        found.append(
            {
                "url": absolute,
                "score": score,
                "source": source,
                "width_attr": width_attr,
                "height_attr": height_attr,
                "width_px": None,
                "height_px": None,
            }
        )

    # Pass 1: author/about-like containers.
    for selector in AUTHOR_SELECTORS:
        for container in soup.select(selector):
            for img in container.find_all("img"):
                register(
                    first_src(img), "author:" + selector, score_boost=5.0, tag=img
                )
            inline_style = container.get("style") or ""
            background = background_re.search(inline_style)
            if background:
                register(
                    background.group(1).strip("'\" "),
                    "author-bg:" + selector,
                    score_boost=4.0,
                    tag=None,
                )

    # Pass 2: images inside text blocks that mention an about/author phrase.
    def is_text_block(tag) -> bool:
        return tag.name in ("p", "div", "section") and bool(
            tag.get_text(strip=True)
        )

    for block in soup.find_all(is_text_block):
        block_text = block.get_text(" ", strip=True).lower()
        if not any(phrase in block_text for phrase in about_phrases):
            continue
        for img in block.find_all("img"):
            register(first_src(img), "near-about-text", score_boost=3.0, tag=img)

    return found
| # ------------------ Size-based filtering via Pillow ------------------ # | |
def measure_image_dimensions(
    url: str, timeout: int = 10
) -> Tuple[Optional[int], Optional[int]]:
    """
    Download the image at `url` and read its pixel size with Pillow.

    Args:
        url: image URL to fetch.
        timeout: request timeout in seconds, passed to `requests.get`.

    Returns:
        (width, height), or (None, None) if the download fails or the
        payload cannot be opened as an image.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        data = io.BytesIO(resp.content)
        with Image.open(data) as img:
            return img.width, img.height
    except Exception:
        # Best effort: any network or decoding problem means "size unknown".
        return None, None
def refine_candidates_with_dimensions(
    candidates: List[Dict[str, Any]], max_to_check: int = 10
) -> List[Dict[str, Any]]:
    """
    Re-score the highest ranked candidates using their real pixel size.

    The candidates are ordered by score; the first `max_to_check` of them are
    measured with `measure_image_dimensions`, their `width_px`/`height_px`
    fields are filled in, and their score is adjusted. The candidate dicts
    are modified in place; a newly sorted list (best first) is returned.
    """

    def by_score(candidate: Dict[str, Any]) -> float:
        return candidate["score"]

    ranked = sorted(candidates, key=by_score, reverse=True)

    for position, candidate in enumerate(ranked):
        if position >= max_to_check:
            break

        width, height = measure_image_dimensions(candidate["url"])
        candidate["width_px"], candidate["height_px"] = width, height
        if not (width and height):
            continue

        # Tiny images (icons, tracking pixels) are penalised and skipped.
        if min(width, height) < 80:
            candidate["score"] -= 4.0
            continue

        ratio = float(height) / float(max(width, 1))
        portrait_like = 0.7 <= ratio <= 1.8
        wide_banner = width > 1000 and width > height * 2.5
        oversized = max(width, height) > 2500

        if portrait_like:
            candidate["score"] += 4.0
        if wide_banner:
            candidate["score"] -= 5.0
        if oversized:
            candidate["score"] -= 3.0

    ranked.sort(key=by_score, reverse=True)
    return ranked
def pick_best_author_image(
    candidates: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """
    Return the candidate with the highest "score", or None for an empty list.

    When several candidates share the top score, the one that appears first
    in `candidates` wins.
    """
    if candidates:
        return max(candidates, key=lambda candidate: candidate["score"])
    return None
| # ------------------ Main entry: scrape_author_image ------------------ # | |
def scrape_author_image(url: str) -> Dict[str, Any]:
    """
    Given a page URL, return the most likely author/headshot image.

    The URL is validated and checked against robots.txt before the page is
    fetched and parsed.

    Returns:
        A dict with:
          - "title": the first non-empty value among og:title, twitter:title,
            the first <h1> and <title>; the URL's host name if all are empty.
          - "author_image_url": URL of the best candidate, or None.
          - "debug_candidates": all scored candidates, best first.

    Raises:
        ValueError: if `url` fails validation.
        PermissionError: if robots.txt disallows the URL.
    """
    url = validate_url(url)
    if not allowed_by_robots(url):
        raise PermissionError("Blocked by robots.txt")

    soup = BeautifulSoup(fetch_html(url), "lxml")

    title_sources = (
        soup.select_one("meta[property='og:title']"),
        soup.select_one("meta[name='twitter:title']"),
        soup.find("h1"),
        soup.title,
    )
    title = ""
    for tag in title_sources:
        if tag is None:
            continue
        # <meta> tags carry the title in `content`; <h1>/<title> in their text.
        title = (tag.get("content") or tag.get_text("", strip=True) or "").strip()
        if title:
            break
    if not title:
        title = urlparse(url).hostname

    candidates = extract_author_candidates(soup, url)
    candidates_refined = refine_candidates_with_dimensions(candidates, max_to_check=10)
    best = pick_best_author_image(candidates_refined)

    return {
        "title": title,
        "author_image_url": best["url"] if best else None,
        "debug_candidates": candidates_refined,
    }
| # ------------------ Download helpers ------------------ # | |
def download_image(
    image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
) -> str:
    """
    Download a single image URL to `out_dir` and return the local file path.

    If `filename` is provided, it is used instead of the remote filename.
    When the chosen name has no recognised image extension, one is appended:
    the remote file's extension if it is a known image type, otherwise one
    derived from the response Content-Type, otherwise ".jpg".

    Raises an exception from `requests` if the download fails or the server
    answers with an HTTP error status.
    """
    known_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif")
    content_type_exts = {
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "image/gif": ".gif",
        "image/webp": ".webp",
        "image/avif": ".avif",
    }

    os.makedirs(out_dir, exist_ok=True)

    remote_name = os.path.basename(urlparse(image_url).path) or "image"
    base_name = filename if filename else remote_name

    resp = requests.get(image_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()

    # Ensure a file extension that matches the actual image where possible.
    if not base_name.lower().endswith(known_exts):
        remote_ext = os.path.splitext(remote_name)[1].lower()
        if remote_ext in known_exts:
            extension = remote_ext
        else:
            content_type = resp.headers.get("Content-Type", "")
            mime = content_type.split(";", 1)[0].strip().lower()
            extension = content_type_exts.get(mime, ".jpg")
        base_name += extension

    file_path = os.path.join(out_dir, base_name)
    with open(file_path, "wb") as f:
        f.write(resp.content)
    return file_path
def download_author_image(
    page_url: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    Scrape `page_url` for its best author/headshot image and save it.

    Returns a dict with "title", "author_image_url" and "local_path"; the
    last two are None when no image was found (nothing is downloaded then).
    """
    scraped = scrape_author_image(page_url)
    image_url = scraped["author_image_url"]

    saved_path = None
    if image_url:
        saved_path = download_image(image_url, out_dir=out_dir)

    return {
        "title": scraped["title"],
        "author_image_url": image_url or None,
        "local_path": saved_path,
    }
| # ---------- Site normalization & about-page discovery ---------- # | |
def normalize_site_input(site: str) -> str:
    """
    Turn a user-supplied site name or URL into a base URL.

    Accepted inputs include:
      - damndelicious.net
      - https://damndelicious.net
      - http://www.damndelicious.net/

    A missing scheme defaults to https; an explicit http/https scheme is kept
    (and lowercased). A leading "www." is removed and any path, query or
    fragment is dropped. Scheme and "www." detection are case-insensitive.

    Raises:
        ValueError: if the input is blank or contains no host name.
    """
    site = site.strip()
    if not site:
        raise ValueError("Site must be a non-empty string.")

    if not site.lower().startswith(("http://", "https://")):
        site = "https://" + site

    parsed = urlparse(site)  # urlparse lowercases the scheme
    netloc = parsed.netloc
    if netloc[:4].lower() == "www.":
        netloc = netloc[4:]
    if not netloc:
        raise ValueError("Site must include a host name.")

    return f"{parsed.scheme}://{netloc}"
# Common "about" page paths, each listed without and then with a trailing
# slash, in the order they should be tried.
ABOUT_PATH_GUESSES: List[str] = [
    f"/{slug}{trailing}"
    for slug in (
        "about",
        "about-me",
        "about-us",
        "about-the-author",
        "about-the-creator",
        "meet-the-team",
        "meet-the-author",
    )
    for trailing in ("", "/")
]
# Lowercase phrases looked for in the visible text of homepage links when
# searching for an "about" page (used by find_about_like_urls).
# NOTE(review): matching there is by substring, so "about" and "meet" already
# cover the longer phrases that contain them.
ABOUT_TEXT_KEYWORDS: List[str] = [
    "about",
    "about me",
    "about us",
    "about the author",
    "about the creator",
    "our story",
    "my story",
    "meet",
    "meet the author",
    "meet the team",
    "bio",
]
def url_is_html(url: str) -> bool:
    """
    Quick check that a URL returns HTML (not 404, not an image, etc.).

    The request is streamed and closed as soon as the headers are available,
    so the response body is not downloaded just to inspect Content-Type.

    Returns False on any request failure or non-OK status.
    """
    try:
        with requests.get(
            url, headers=HEADERS, timeout=8, stream=True
        ) as response:
            if not response.ok:
                return False
            ctype = response.headers.get("Content-Type", "").lower()
    except requests.RequestException:
        return False
    return "text/html" in ctype or "application/xhtml+xml" in ctype
def find_about_like_urls(base_url: str, max_links: int = 20) -> List[str]:
    """
    Build an ordered, de-duplicated list of URLs that may be the site's
    "about" page.

    Strategy:
      1. Try common about paths relative to base_url.
      2. Fetch the homepage and look for internal <a> links whose
         text or href contains about-ish keywords.

    Only http(s) links on the same site (same host, ignoring a leading
    "www.", or one of its subdomains) are kept from step 2; external links
    and `mailto:` / `javascript:` style hrefs are skipped.

    Returns at most `max_links` URLs. A homepage that cannot be fetched
    simply contributes no links.
    """

    def site_host(link: str) -> str:
        # Lowercased host without a leading "www."; "" if it cannot be parsed.
        try:
            host = (urlparse(link).hostname or "").lower()
        except ValueError:
            return ""
        return host[4:] if host.startswith("www.") else host

    base_host = site_host(base_url)
    href_keywords = ("about", "our-story", "my-story", "meet")

    # 1) Common about paths
    candidates: List[str] = [urljoin(base_url, path) for path in ABOUT_PATH_GUESSES]

    # 2) Extract from homepage
    try:
        html = fetch_html(base_url)
    except requests.RequestException:
        html = ""

    if html:
        soup = BeautifulSoup(html, "lxml")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = a.get_text(" ", strip=True).lower()
            href_lower = href.lower()
            if not (
                any(kw in text for kw in ABOUT_TEXT_KEYWORDS)
                or any(kw in href_lower for kw in href_keywords)
            ):
                continue

            abs_url = normalize_url(href, base_url)
            if not abs_url:
                continue
            if not abs_url.lower().startswith(("http://", "https://")):
                continue

            link_host = site_host(abs_url)
            is_internal = bool(link_host) and (
                link_host == base_host or link_host.endswith("." + base_host)
            )
            if is_internal:
                candidates.append(abs_url)

    # Deduplicate while preserving order
    unique = list(dict.fromkeys(candidates))
    return unique[:max_links]
def pick_best_about_url(site_input: str) -> Optional[str]:
    """
    Resolve a site name or URL to the URL of its most likely "about" page.

    The first discovered candidate that answers with HTML is returned; when
    none does, the normalized base URL itself is returned.

    Raises PermissionError if robots.txt blocks the base site.
    """
    base_url = validate_url(normalize_site_input(site_input))
    if not allowed_by_robots(base_url):
        raise PermissionError("Blocked by robots.txt for base site.")

    # Lazy generator: candidates are probed one at a time, stopping at the
    # first hit.
    html_pages = (
        link for link in find_about_like_urls(base_url) if url_is_html(link)
    )
    return next(html_pages, base_url)
def clean_site_name(site: str) -> str:
    """
    Convert a site or URL into a safe filename component.

    The scheme, any path and a leading "www." are removed, the result is
    lowercased, and every character that is not a letter, digit or
    underscore (dots, dashes, a port's colon, ...) becomes "_".

    Examples:
        https://www.damndelicious.net -> damndelicious_net
        foodblog.com -> foodblog_com
        example.com:8080 -> example_com_8080
    """
    host = site.strip().lower()
    for scheme_prefix in ("http://", "https://"):
        if host.startswith(scheme_prefix):
            host = host[len(scheme_prefix):]
            break
    host = host.split("/", 1)[0]
    if host.startswith("www."):
        host = host[4:]
    return re.sub(r"\W", "_", host)
def download_author_image_for_site(
    site_input: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    Find, scrape and download the author image for a whole site.

    Steps:
      1. Normalize the site input into a base URL.
      2. Locate the site's best 'About' page.
      3. Extract the author image from that page.
      4. Download it under a filename built from the site name.

    Returns a dict with site_base_url, about_url, title, author_image_url
    and local_path; fields for steps that produced nothing stay None.
    """
    base_url = normalize_site_input(site_input)
    about_url = pick_best_about_url(site_input)

    outcome: Dict[str, Any] = {
        "site_base_url": base_url,
        "about_url": None,
        "title": None,
        "author_image_url": None,
        "local_path": None,
    }
    if not about_url:
        return outcome
    outcome["about_url"] = about_url

    info = scrape_author_image(about_url)
    image_url = info["author_image_url"]
    outcome["title"] = info["title"]
    if not image_url:
        return outcome

    target_name = clean_site_name(base_url) + "_author"
    outcome["author_image_url"] = image_url
    outcome["local_path"] = download_image(
        image_url, out_dir=out_dir, filename=target_name
    )
    return outcome
| # ------------------ Optional CLI test ------------------ # | |
if __name__ == "__main__":
    # Manual smoke test: ask for a site, run the full pipeline, print a summary.
    site_or_url = input("Enter site (e.g. 'damndelicious.net' or full URL): ").strip()
    result = download_author_image_for_site(site_or_url, out_dir="author_images")
    summary_rows = (
        ("\nBase site:", "site_base_url"),
        ("About URL:", "about_url"),
        ("Page title:", "title"),
        ("Headshot URL:", "author_image_url"),
        ("Saved to:", "local_path"),
    )
    for label, key in summary_rows:
        print(label, result[key])