# Medium HTML extraction helpers: parse search-result cards and article pages
# into plain metadata dictionaries.
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH, get_medium_image_url, upgrade_medium_image_url
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Medium's DOM changes frequently, so several card selectors are tried in
    order and the first one that matches anything wins. Cards that yield no
    article URL are dropped.
    """
    # Candidate selectors for article cards, most-current markup first.
    card_selectors = ("article", 'div[role="article"]', ".postArticle", ".js-block")
    cards: List[Any] = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted: List[Dict[str, Any]] = []
    for card in cards:
        entry = _extract_from_card(card, base_url)
        # A card without a resolvable URL is useless downstream; skip it.
        if entry.get("url"):
            extracted.append(entry)
    return extracted
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Helper to extract data from a single card element.

    Args:
        card: A bs4 element for one search-result card (presumably a Tag —
            anything supporting .find/.select_one/.find_all).
        base_url: Base URL used to resolve relative article hrefs.

    Returns:
        Dict with keys: url, title, author ({"name": ...} or None),
        publishingDate, readingTime (float minutes or None), imageUrl.
        Any field may be None when its heuristic finds nothing.
    """
    # 1. URL & Title
    # Look for <a> tags that link to the article
    # Usually the first <h2> inside an <a> is the title
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None
    # Find the link associated with the title or the card
    link_tag = card.find("a", href=True)
    # Prefer the anchor wrapping the title over the first anchor in the card.
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")
    url = None
    if link_tag:
        href = link_tag["href"]
        # Clean up URL (remove query params usually)
        if "?" in href:
            href = href.split("?")[0]
        url = urljoin(base_url, href)
    # 2. Author
    # Heuristic: Look for links that go to a user profile (/@username or /u/username)
    # but aren't the main article link.
    author = None
    # Try specific selectors first
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
        card.select_one('.ds-link') or \
        card.select_one('a[href*="/@"]')
    if author_tag:
        # Verify it's not the title link
        if title_tag and author_tag == title_tag.find_parent("a"):
            pass  # It's the title
        else:
            author = author_tag.get_text(strip=True)
    # Fallback: Look for a <p> or <span> that contains the author name
    # Usually it's the first piece of text in the card meta area
    if not author:
        # Find the meta div (often has date/read time)
        # We look for text that is NOT the date or read time
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip title
            if title and txt in title:
                continue
            # If it looks like a name (2-3 words, capitalized), take it
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break
    # 3. Date / Reading Time
    # Often spans
    spans = card.find_all("span")
    pub_date = None
    reading_time = None
    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time usually ends with "min read"
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: "Nov 7" or "2 days ago"
        # Hard to parse perfectly without regex, but we can grab it if it looks like a date
        # For now, we might skip complex date parsing or just take the first span that isn't reading time
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Very rough heuristic
            pub_date = txt
    # 4. Image URL
    # Priority:
    # 1. <img src="..." class="..."/> inside the card (often has specific classes for covers)
    # 2. First <img> tag in the card
    # Note: Search results don't always have og:image tags (those are in the head), so we must rely on the card's HTML.
    image_url = None
    # Try to find the main article image (often has specific classes or sizes)
    # Medium uses responsive images, often in <picture> or <img> with srcset.
    # We'll look for the largest image or the first one that isn't an avatar.
    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip small avatars (often 20x20 or similar in URL)
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src:  # Common default avatar
            continue
        if "resize:fill:20:20" in src:  # Tiny thumbnail
            continue
        # If it's a valid image, take it.
        # Medium images often have 'cdn-images-1.medium.com'
        if src:
            image_url = src
            break
    if not image_url:
        # Fallback to any img
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]
    # Upgrade image URL to high resolution.
    # NOTE(review): image_url may still be None here — presumably the util
    # handles None; verify. Also, MEDIUM_IMAGE_DEFAULT_WIDTH is imported at
    # module level but unused — confirm whether 1400 should be that constant.
    image_url = upgrade_medium_image_url(image_url, target_width=1400)
    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
# Page titles that indicate a Cloudflare challenge or a generic Medium page
# rather than a real article title.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")
# Medium appends a 12-character article ID hash to URL slugs.
_MEDIUM_HASH_LEN = 12


def _title_from_url(url: str) -> Optional[str]:
    """Derive a readable title from a Medium article URL slug, or None.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    (or /publication/article-title-slug-hash); the last path segment is the
    slug, optionally suffixed with a 12-char article hash.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) < 2:
            return None
        slug = path_parts[-1]
        head, sep, tail = slug.rpartition("-")
        if sep and len(tail) == _MEDIUM_HASH_LEN:
            slug = head
        # Convert slug to title: replace-hyphens-with-spaces
        return slug.replace("-", " ").title()
    except Exception:
        return None


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Best-effort title: <h1>, then og:title, then URL slug, then <title>."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)
    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")
    # Cloudflare/homepage pages yield generic titles; try the URL slug then.
    if (not title or title in _GENERIC_TITLES) and url:
        derived = _title_from_url(url)
        if derived is not None:
            title = derived
    # Last resort: the <title> element, unless it is a generic page title.
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _extract_author(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Author from the reliable <meta name="author">, else byline selectors."""
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        return {"name": meta_author.get("content")}
    author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
    if author_tag:
        author_text = author_tag.get_text(strip=True)
        if author_text:  # Only report an author if we got actual text
            return {"name": author_text}
    return None


def _apply_url_metadata(content_data: Dict[str, Any], url: str) -> None:
    """Fill author (from /@username) or publication (from /pub-slug) off the URL.

    Mutates content_data in place; never raises.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) >= 1:
            first_part = path_parts[0]
            if first_part.startswith("@"):
                # Personal blog URL: only use the username if we have no author yet.
                if not content_data["author"]:
                    username = first_part[1:]
                    content_data["author"] = {"name": username.replace("-", " ").title()}
            else:
                # Publication slug (e.g. "ai-in-plain-english"); kept separate
                # from author on purpose.
                content_data["publication"] = first_part.replace("-", " ").title()
    except Exception:
        pass


def _parse_claps(soup: BeautifulSoup) -> Optional[int]:
    """Clap count as an int ("1.2K" -> 1200), or None if absent/unparseable."""
    clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
    if not clap_el:
        return None
    # Strip thousands separators ("1,234") before numeric parsing.
    txt = clap_el.get_text(strip=True).replace(",", "")
    try:
        if "K" in txt:
            return int(float(txt.replace("K", "")) * 1000)
        return int(txt)
    except ValueError:
        return None


def _parse_responses(soup: BeautifulSoup) -> Optional[int]:
    """Response count as an int, or None if absent/unparseable."""
    resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
    if not resp_el:
        return None
    try:
        return int(resp_el.get_text(strip=True).replace(",", ""))
    except ValueError:
        return None


def _extract_markdown(soup: BeautifulSoup) -> Optional[str]:
    """Convert the main <article>/<section> element to Markdown, if present.

    NOTE: decomposes clutter elements, mutating the parsed tree in place.
    """
    article = soup.find("article") or soup.find("section")
    if not article:
        return None
    # Remove interactive chrome that pollutes the Markdown output.
    for tag in article.select("button, .speechify-btn, .metabar, footer"):
        tag.decompose()
    return md(str(article), heading_style="ATX")


def _intro_paragraphs(soup: BeautifulSoup) -> Optional[str]:
    """Join up to 3 substantial <p> texts (pre-paywall intro), or None."""
    intro_text: List[str] = []
    for p in soup.find_all("p")[:10]:  # Check first 10 paragraphs only
        text = p.get_text(strip=True)
        # Skip short/meta paragraphs ("7 min read", "3 days ago", ...).
        if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
            intro_text.append(text)
            if len(intro_text) >= 3:  # Got enough intro paragraphs
                break
    return "\n\n".join(intro_text) if intro_text else None


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.
    If extraction fails (Cloudflare/paywall), falls back to URL parsing
    and meta-tag descriptions.

    Args:
        soup: Parsed article page.
        url: Canonical article URL; used for slug/username fallbacks.

    Returns:
        Dict with keys markdownContent, claps, responses, title, author,
        publication — any of which may be None.
    """
    content_data: Dict[str, Any] = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None,  # Tracked separately from author
    }

    content_data["title"] = _extract_title(soup, url)
    content_data["author"] = _extract_author(soup)
    if url:
        _apply_url_metadata(content_data, url)

    # Pre-extract og:description for fallback (before the tree is mutated
    # by the main content extraction).
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    content_data["claps"] = _parse_claps(soup)
    content_data["responses"] = _parse_responses(soup)
    content_data["markdownContent"] = _extract_markdown(soup)

    # Fallback 1: intro paragraphs that loaded before a paywall, when the
    # main extraction failed or produced something suspiciously short.
    existing = content_data["markdownContent"]
    if not existing or len(existing) < 100:
        intro = _intro_paragraphs(soup)
        if intro:
            content_data["markdownContent"] = f"{existing}\n\n{intro}" if existing else intro

    # Fallback 2: meta descriptions, if content is still missing/tiny.
    current = content_data["markdownContent"]
    if not current or len(current) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            content_data["markdownContent"] = f"{desc_text}\n\n{current}" if current else desc_text
        else:
            # Last resort: plain name="description" meta tag.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data