import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH

# Month abbreviations and the relative-time marker "ago", used to recognise
# date-like text nodes so they are not mistaken for author names.
_DATE_MARKERS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago")

# Page titles indicating a Cloudflare challenge or a generic Medium page
# rather than an actual article title.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")

# Medium article slugs end in a 12-character hash appended with a hyphen.
_MEDIUM_HASH_LEN = 12


def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extract article metadata from Medium search-result cards.

    Args:
        soup: Parsed search-results page.
        base_url: Base URL used to resolve relative article links.

    Returns:
        A list of metadata dicts (url, title, author, publishingDate,
        readingTime, imageUrl) for every card that yielded a URL.
    """
    # Medium's DOM changes frequently; try several known card selectors.
    cards = (soup.select("article")
             or soup.select('div[role="article"]')
             or soup.select(".postArticle")
             or soup.select(".js-block"))
    results = []
    for card in cards:
        data = _extract_from_card(card, base_url)
        if data.get("url"):  # cards without a resolvable link are useless
            results.append(data)
    return results


def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Extract metadata from a single search-result card element."""
    # The first <h2> inside a card is usually the article title.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    url = _card_url(card, title_tag, base_url)
    author = _card_author(card, title_tag, title)
    pub_date, reading_time = _card_date_and_reading_time(card)
    # Upgrade whatever image we found to a high-resolution variant.
    image_url = upgrade_medium_image_url(_card_image(card), target_width=1400)

    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }


def _card_url(card, title_tag, base_url: str) -> Optional[str]:
    """Resolve the card's article URL, preferring the link wrapping the title."""
    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")
    if not link_tag:
        return None
    href = link_tag["href"]
    # Strip tracking query parameters.
    if "?" in href:
        href = href.split("?")[0]
    return urljoin(base_url, href)


def _card_author(card, title_tag, title: Optional[str]) -> Optional[str]:
    """Best-effort author-name extraction from a card."""
    # Try selectors known to target the author/profile link first.
    author_tag = (card.select_one('a[data-action="show-user-card"]')
                  or card.select_one('.ds-link')
                  or card.select_one('a[href*="/@"]'))
    if author_tag:
        # Make sure we did not just re-match the title link.
        if not (title_tag and author_tag == title_tag.find_parent("a")):
            author = author_tag.get_text(strip=True)
            if author:
                return author

    # Fallback: scan short capitalized text nodes that are neither dates,
    # read times, nor the title itself.
    for node in card.find_all(["p", "span"]):
        txt = node.get_text(strip=True)
        if not txt or "min read" in txt or any(m in txt for m in _DATE_MARKERS):
            continue
        if title and txt in title:
            continue
        # Looks like a personal name: at most 3 words, capitalized.
        if len(txt.split()) <= 3 and txt[0].isupper():
            return txt
    return None


def _card_date_and_reading_time(card) -> tuple:
    """Extract (publishing-date string, reading time in minutes) from spans."""
    pub_date = None
    reading_time = None
    for span in card.find_all("span"):
        txt = span.get_text(strip=True)
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass  # malformed read-time text; leave unset
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Rough heuristic: short digit-bearing text ("Nov 7", "2 days ago").
            pub_date = txt
    return pub_date, reading_time


def _card_image(card) -> Optional[str]:
    """Pick the card's cover image, skipping avatars and tiny thumbnails."""
    for img in card.find_all("img"):
        src = img.get("src", "")
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src:  # Medium's default avatar
            continue
        if "resize:fill:20:20" in src:  # tiny thumbnail
            continue
        if src:
            return src
    # Fallback: any <img> at all.
    img_tag = card.find("img")
    if img_tag and img_tag.get("src"):
        return img_tag["src"]
    return None


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract full content, claps, and responses from an article page.

    Falls back to URL parsing when the page is blocked (Cloudflare/paywall).

    Args:
        soup: Parsed article page.
        url: Original article URL, used for slug/author/publication fallbacks.

    Returns:
        Dict with markdownContent, claps, responses, title, author and
        publication keys (any value may be None when unavailable).
    """
    content_data: Dict[str, Any] = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None,  # tracked separately from author
    }

    content_data["title"] = _extract_title(soup, url)
    content_data["author"] = _extract_author(soup)
    _apply_url_metadata(content_data, url)

    # Pre-extract og:description as a last-resort content fallback.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    content_data["claps"] = _extract_count(
        soup, 'button[data-testid="clapCount"]', '.clapCount')
    content_data["responses"] = _extract_count(
        soup, 'button[data-testid="responsesCount"]', '.responsesCount')

    content_data["markdownContent"] = _extract_markdown(soup)

    # Fallback 1: grab intro paragraphs (text loaded before a paywall) when
    # the article body was missing or suspiciously short.
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 100:
        intro = _intro_paragraphs(soup)
        if intro:
            if content_data["markdownContent"]:
                # Append intro to existing content that was too short.
                content_data["markdownContent"] += "\n\n" + intro
            else:
                content_data["markdownContent"] = intro

    # Fallback 2: meta description if still (almost) empty.
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content_data["markdownContent"]:
                content_data["markdownContent"] = desc_text + "\n\n" + content_data["markdownContent"]
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: the plain name="description" meta tag.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data


def _extract_title(soup, url: Optional[str]) -> Optional[str]:
    """Extract the title: h1 -> og:title -> URL-slug -> <title> fallbacks."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)
    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")
    # A generic title means the page was blocked (Cloudflare) or not an
    # article; reconstruct a human-readable title from the URL slug instead.
    if (not title or title in _GENERIC_TITLES) and url:
        slug_title = _title_from_url(url)
        if slug_title:
            title = slug_title
    if not title:
        # Last resort: the <title> element, unless it is a generic page title.
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _title_from_url(url: str) -> Optional[str]:
    """
    Reconstruct a title from a Medium URL slug.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    or https://medium.com/publication/article-title-slug-hash.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) < 2:
            return None
        slug = path_parts[-1]
        # Strip the trailing 12-character article hash, if present.
        head, sep, tail = slug.rpartition("-")
        if sep and len(tail) == _MEDIUM_HASH_LEN:
            slug = head
        # Convert slug to a title: replace-hyphens-with-spaces, title-case.
        return slug.replace("-", " ").title()
    except Exception:
        return None  # best-effort; malformed URLs simply yield no title


def _extract_author(soup) -> Optional[Dict[str, str]]:
    """Extract the author from the meta tag, falling back to known selectors."""
    # The <meta name="author"> tag is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        return {"name": meta_author.get("content")}
    author_tag = (soup.select_one('a[data-action="show-user-card"]')
                  or soup.select_one('.ds-link'))
    if author_tag:
        author_text = author_tag.get_text(strip=True)
        if author_text:  # only use if we got actual text
            return {"name": author_text}
    return None


def _apply_url_metadata(content_data: Dict[str, Any], url: Optional[str]) -> None:
    """Fill author/publication in-place from the URL path when missing."""
    if not url:
        return
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        first_part = path_parts[0] if path_parts else ""
        if not first_part:
            return  # empty path: nothing to derive
        if first_part.startswith("@"):
            # Personal blog URL: /@username/... — use it only if the page
            # itself yielded no author.
            if not content_data["author"]:
                username = first_part[1:]
                content_data["author"] = {"name": username.replace("-", " ").title()}
        else:
            # Publication URL: /publication-name/... (e.g. "ai-in-plain-english").
            content_data["publication"] = first_part.replace("-", " ").title()
    except Exception:
        pass  # best-effort; blocked pages may carry odd URLs


def _extract_count(soup, *selectors: str) -> Optional[int]:
    """
    Parse a numeric counter (claps/responses) from the first matching selector.

    Handles "1.2K"-style abbreviations and comma thousands separators
    (plain int() would raise on "1,234" and silently lose the count).
    """
    element = None
    for selector in selectors:
        element = soup.select_one(selector)
        if element:
            break
    if not element:
        return None
    txt = element.get_text(strip=True).replace(",", "")
    try:
        if "K" in txt:
            return int(float(txt.replace("K", "")) * 1000)
        return int(txt)
    except ValueError:
        return None  # non-numeric counter text


def _extract_markdown(soup) -> Optional[str]:
    """Convert the main article body to Markdown, stripping UI clutter."""
    article = soup.find("article") or soup.find("section")
    if not article:
        return None
    # Remove buttons, metabar, footer, and speechify widgets before conversion.
    for tag in article.select("button, .speechify-btn, .metabar, footer"):
        tag.decompose()
    return md(str(article), heading_style="ATX")


def _intro_paragraphs(soup) -> Optional[str]:
    """Collect up to 3 substantial paragraphs (intro text before a paywall)."""
    intro_text: List[str] = []
    # Only scan the first 10 paragraphs; later ones are unlikely to be intro.
    for p in soup.find_all("p")[:10]:
        text = p.get_text(strip=True)
        # Skip short/meta paragraphs ("3 min read", "2 days ago", ...).
        if len(text) > 50 and "min read" not in text.lower() and "ago" not in text.lower():
            intro_text.append(text)
            if len(intro_text) >= 3:  # enough intro paragraphs
                break
    return "\n\n".join(intro_text) if intro_text else None