Spaces:
Sleeping
Sleeping
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extract article metadata from search result cards.

    Several CSS selectors are tried in priority order because Medium's DOM
    changes frequently; the first selector that matches any element wins.
    Cards for which no URL can be resolved are dropped.
    """
    # Candidate card selectors, most current layout first.
    card_selectors = (
        "article",
        'div[role="article"]',
        ".postArticle",
        ".js-block",
    )
    cards = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted = []
    for card in cards:
        record = _extract_from_card(card, base_url)
        # Keep only cards we could resolve a URL for.
        if record.get("url"):
            extracted.append(record)
    return extracted
# Month abbreviations / relative-time marker used to reject date-like strings
# when hunting for an author name in card text.
_DATE_MARKERS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago",
)


def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """
    Extract metadata from a single search-result card element.

    All extraction is heuristic because Medium's markup is unstable; any
    field that cannot be determined is returned as None.

    Args:
        card: BeautifulSoup Tag for one result card.
        base_url: Base URL used to resolve relative article links.

    Returns:
        Dict with keys: url, title, author ({"name": ...} or None),
        publishingDate, readingTime, imageUrl.
    """
    title, url = _card_title_and_url(card, base_url)
    author = _card_author(card, title)
    pub_date, reading_time = _card_date_and_reading_time(card)
    image_url = _card_image_url(card)

    # Upgrade to a high-resolution variant. Guarded so the helper is never
    # handed None when the card had no usable image.
    if image_url:
        image_url = upgrade_medium_image_url(image_url, target_width=1400)

    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }


def _card_title_and_url(card, base_url: str):
    """Return (title, absolute_url) for a card; either may be None."""
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Prefer the link wrapping the title; fall back to the card's first link.
    link_tag = card.find("a", href=True)
    title_link = title_tag.find_parent("a") if title_tag else None
    if title_link:
        link_tag = title_link

    url = None
    if link_tag:
        href = link_tag["href"]
        # Strip tracking/query parameters before resolving.
        href = href.split("?", 1)[0]
        url = urljoin(base_url, href)
    return title, url


def _card_author(card, title: Optional[str]) -> Optional[str]:
    """Best-effort author-name extraction from a card; None when not found."""
    title_tag = card.find("h2")

    # Try author-specific selectors first.
    author_tag = (
        card.select_one('a[data-action="show-user-card"]')
        or card.select_one('.ds-link')
        or card.select_one('a[href*="/@"]')
    )
    if author_tag:
        # Skip it if it is actually the link wrapping the title.
        # (bs4 '==' compares tag content, matching the original behavior.)
        is_title_link = bool(title_tag) and author_tag == title_tag.find_parent("a")
        if not is_title_link:
            name = author_tag.get_text(strip=True)
            if name:
                return name

    # Fallback: scan text nodes for something that looks like a person's
    # name, skipping read times, date-like strings and the title itself.
    for node in card.find_all(["p", "span"]):
        text = node.get_text(strip=True)
        if not text or "min read" in text:
            continue
        if any(marker in text for marker in _DATE_MARKERS):
            continue
        if title and text in title:
            continue
        # Heuristic: 1-3 words starting with a capital reads like a name.
        if 0 < len(text.split()) <= 3 and text[0].isupper():
            return text
    return None


def _card_date_and_reading_time(card):
    """Return (pub_date, reading_time) parsed from the card's spans."""
    pub_date = None
    reading_time = None
    for span in card.find_all("span"):
        text = span.get_text(strip=True)
        if "min read" in text:
            # Reading time spans look like "5 min read".
            try:
                reading_time = float(text.replace("min read", "").strip())
            except ValueError:
                pass
        elif not pub_date and len(text) < 15 and any(c.isdigit() for c in text):
            # Very rough date heuristic: short span containing a digit,
            # e.g. "Nov 7" or "2 days ago".
            pub_date = text
    return pub_date, reading_time


def _card_image_url(card) -> Optional[str]:
    """Return the card's cover-image URL, or None if nothing usable."""
    for img in card.find_all("img"):
        src = img.get("src", "")
        # Skip the common default avatar and tiny 20x20 thumbnails.
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src or "resize:fill:20:20" in src:
            continue
        if src:
            return src
    # Fallback: first <img> with any src (even if it is an avatar).
    fallback = card.find("img")
    if fallback and fallback.get("src"):
        return fallback["src"]
    return None
# Titles that indicate a Cloudflare challenge page or the Medium homepage
# rather than a real article.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract full content, claps, responses and metadata from an article page.

    If extraction fails (Cloudflare challenge / paywall), falls back to
    reconstructing the title and author/publication from the URL, and the
    content from intro paragraphs or meta descriptions.

    Args:
        soup: Parsed article page.
        url: Original article URL, used for slug-based fallbacks.

    Returns:
        Dict with keys: markdownContent, claps, responses, title, author,
        publication. Fields that cannot be determined are None.
    """
    content_data: Dict[str, Any] = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None,  # publication tracked separately from author
    }

    content_data["title"] = _extract_title(soup, url)
    _fill_author_and_publication(content_data, soup, url)

    # Capture og:description up front for the content fallback path.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    content_data["claps"] = _extract_claps(soup)
    content_data["responses"] = _extract_responses(soup)
    _fill_markdown_content(content_data, soup, fallback_description)
    return content_data


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Extract the article title: h1 -> og:title -> URL slug -> <title>."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)
    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")

    # A missing or generic title means a blocked page was served; try to
    # reconstruct the title from the URL slug instead.
    if (not title or title in _GENERIC_TITLES) and url:
        slug_title = _title_from_url(url)
        if slug_title:
            title = slug_title

    # Last resort: the <title> element (rejected if it is itself generic).
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _title_from_url(url: str) -> Optional[str]:
    """
    Reconstruct a readable title from a Medium article URL slug.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    or https://medium.com/publication/article-title-slug-hash, where the
    trailing hash is 12 characters.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) >= 2:
            slug = path_parts[-1]
            head, sep, tail = slug.rpartition("-")
            if sep and len(tail) == 12:  # drop the 12-char Medium hash
                slug = head
            if slug:
                return slug.replace("-", " ").title()
    except Exception:
        pass
    return None


def _fill_author_and_publication(content_data: Dict[str, Any], soup: BeautifulSoup,
                                 url: Optional[str]) -> None:
    """Populate 'author' and 'publication' in content_data (in place)."""
    # <meta name="author"> is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        content_data["author"] = {"name": meta_author.get("content")}
    else:
        author_tag = (soup.select_one('a[data-action="show-user-card"]')
                      or soup.select_one('.ds-link'))
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # only set if we got actual text
                content_data["author"] = {"name": author_text}

    # Derive author/publication from the URL path. "@username" paths are
    # personal blogs; anything else is a publication slug.
    if url:
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            first_part = path_parts[0] if path_parts else ""
            if not first_part:
                return  # homepage-style URL: nothing to derive
            if first_part.startswith("@"):
                if not content_data["author"]:
                    formatted = first_part[1:].replace("-", " ").title()
                    content_data["author"] = {"name": formatted}
            else:
                # e.g. "ai-in-plain-english" -> "Ai In Plain English".
                # Deliberately NOT used as author - better than nothing for
                # blocked pages, but it is a publication, not a person.
                content_data["publication"] = first_part.replace("-", " ").title()
        except Exception:
            pass


def _extract_claps(soup: BeautifulSoup) -> Optional[int]:
    """Parse the clap count; handles '1.2K' shorthand and comma grouping."""
    try:
        clap_el = (soup.select_one('button[data-testid="clapCount"]')
                   or soup.select_one('.clapCount'))
        if clap_el:
            txt = clap_el.get_text(strip=True).replace(",", "")
            if "K" in txt:
                return int(float(txt.replace("K", "")) * 1000)
            return int(txt)
    except Exception:
        pass
    return None


def _extract_responses(soup: BeautifulSoup) -> Optional[int]:
    """Parse the response count, tolerating surrounding text ('5 responses')."""
    try:
        resp_el = (soup.select_one('button[data-testid="responsesCount"]')
                   or soup.select_one('.responsesCount'))
        if resp_el:
            txt = resp_el.get_text(strip=True).replace(",", "")
            match = re.search(r"\d+", txt)
            if match:
                return int(match.group())
    except Exception:
        pass
    return None


def _fill_markdown_content(content_data: Dict[str, Any], soup: BeautifulSoup,
                           fallback_description: Optional[str]) -> None:
    """Populate 'markdownContent', with paragraph and description fallbacks."""
    article = soup.find("article") or soup.find("section")
    if article:
        # Strip interactive chrome before markdown conversion.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        content_data["markdownContent"] = md(str(article), heading_style="ATX")

    # Fallback 1: salvage intro paragraphs that loaded before the paywall.
    content = content_data["markdownContent"]
    if not content or len(content) < 100:
        intro_text = []
        for p in soup.find_all("p")[:10]:  # check first 10 paragraphs
            text = p.get_text(strip=True)
            # Skip short paragraphs (likely meta info) and read-time/date text.
            if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
                intro_text.append(text)
            if len(intro_text) >= 3:  # enough intro paragraphs
                break
        if intro_text:
            combined_intro = "\n\n".join(intro_text)
            if content:
                # Append intro to existing content that was too short.
                content_data["markdownContent"] = content + "\n\n" + combined_intro
            else:
                content_data["markdownContent"] = combined_intro

    # Fallback 2: meta description as a bare summary.
    content = content_data["markdownContent"]
    if not content or len(content) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content:
                content_data["markdownContent"] = desc_text + "\n\n" + content
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: try <meta name="description">.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"