File size: 13,239 Bytes
60742a2
a80eeb8
 
 
 
 
60742a2
 
 
 
a80eeb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae588db
a80eeb8
 
 
 
 
 
 
60742a2
 
 
a80eeb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH


def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Probes a list of card selectors (Medium's DOM changes frequently) and
    returns one metadata dict per card that yields a resolvable article URL.
    """
    # Try selectors from most to least specific; keep the first non-empty hit.
    cards = []
    for selector in ("article", 'div[role="article"]', ".postArticle", ".js-block"):
        cards = soup.select(selector)
        if cards:
            break

    # Extract each card and drop any result that has no article URL.
    extracted = (_extract_from_card(card, base_url) for card in cards)
    return [data for data in extracted if data.get("url")]

def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Extract article metadata from a single search-result card element.

    Args:
        card: A bs4 element (e.g. an ``<article>`` or card ``<div>``)
            containing one search result.
        base_url: Base URL used to resolve relative article hrefs.

    Returns:
        Dict with keys ``url``, ``title``, ``author`` (``{"name": ...}`` or
        None), ``publishingDate`` (raw text, not parsed), ``readingTime``
        (float minutes) and ``imageUrl``. Fields that cannot be extracted
        are None.

    NOTE(review): this is heuristic-driven because Medium's markup changes
    often; each step tries a specific selector first, then falls back to
    looser guesses, so statement order matters.
    """
    # 1. URL & Title
    # The first <h2> inside the card is normally the article title.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None
    
    # Prefer the anchor wrapping the title; otherwise fall back to the
    # card's first link.
    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")
    
    url = None
    if link_tag:
        href = link_tag["href"]
        # Drop tracking query params (e.g. ?source=...) before resolving.
        if "?" in href:
            href = href.split("?")[0]
        url = urljoin(base_url, href)

    # 2. Author
    # Heuristic: profile links look like /@username or /u/username and are
    # distinct from the main article link.
    author = None
    
    # Try the known author-link selectors first (old and new Medium DOMs).
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
                 card.select_one('.ds-link') or \
                 card.select_one('a[href*="/@"]')
                 
    if author_tag:
        # Guard: the matched anchor may actually be the title link itself.
        if title_tag and author_tag == title_tag.find_parent("a"):
             pass  # matched the title link, not an author link
        else:
            author = author_tag.get_text(strip=True)

    # Fallback: scan <p>/<span> text for something name-shaped, skipping
    # anything that looks like a date, a read-time badge, or the title.
    if not author:
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings (month
            # abbreviations / "ago" mark relative or absolute dates).
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip fragments of the title itself.
            if title and txt in title:
                continue
            
            # Accept short capitalized text (1-3 words) as a probable name.
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break

    # 3. Date / Reading Time
    # Both usually live in small <span>s within the card's meta row.
    spans = card.find_all("span")
    pub_date = None
    reading_time = None
    
    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time badges end with "min read", e.g. "7 min read".
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: short text containing a digit ("Nov 7",
        # "2 days ago"). Very rough — kept as raw text, not parsed.
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
             # first short digit-bearing span wins
             pub_date = txt

    # 4. Image URL
    # Priority:
    # 1. First <img> in the card that is not a known avatar/thumbnail.
    # 2. Any <img> at all (may be an avatar — accepted as last resort).
    # Search results don't carry og:image (that lives in <head>), so the
    # card's own HTML is all we have.
    image_url = None
    
    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip Medium's default avatar asset.
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src: # Common default avatar
            continue
        # Skip 20x20 resize thumbnails (author avatars in the meta row).
        if "resize:fill:20:20" in src: # Tiny thumbnail
            continue
            
        # First surviving src is taken as the cover image.
        if src:
            image_url = src
            break
            
    if not image_url:
        # Last resort: any image, even one rejected above.
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]

    # Upgrade to a high-resolution variant.
    # NOTE(review): image_url may still be None here — assumes the utils
    # helper tolerates None; confirm in src.utils.
    image_url = upgrade_medium_image_url(image_url, target_width=1400)
    
    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }

# Titles that indicate a Cloudflare challenge page or the Medium landing
# page rather than a real article title.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")


def _title_from_slug(url: str) -> Optional[str]:
    """Derive a human-readable title from a Medium article URL.

    Medium URLs look like ``.../article-title-slug-abc123def456`` where the
    trailing 12-character segment is a hash. Returns None when the path is
    too shallow to contain an article slug.
    """
    path_parts = urlparse(url).path.strip("/").split("/")
    if len(path_parts) < 2:
        return None
    article_slug = path_parts[-1]
    # Strip the trailing Medium hash (always 12 chars) if present.
    slug_parts = article_slug.rsplit("-", 1)
    if len(slug_parts) > 1 and len(slug_parts[-1]) == 12:
        article_slug = slug_parts[0]
    # Convert slug to title: replace-hyphens-with-spaces, then Title Case.
    return article_slug.replace("-", " ").title()


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Best-effort title: <h1>, then og:title, then the URL slug (when the
    page title looks generic/blocked), then the <title> element."""
    title: Optional[str] = None

    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)

    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")

    # A generic title usually means Cloudflare or the Medium homepage was
    # served; recover the real title from the URL slug instead.
    if (title in (None, "") or title in _GENERIC_TITLES) and url:
        try:
            slug_title = _title_from_slug(url)
            if slug_title is not None:
                title = slug_title
        except Exception:
            pass

    # Last resort: the page <title>, unless it is itself generic.
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title

    return title


def _extract_author(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Author from the reliable <meta name="author"> tag, falling back to
    the profile-link selectors Medium has used over time."""
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        return {"name": meta_author.get("content")}

    author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
    if author_tag:
        author_text = author_tag.get_text(strip=True)
        if author_text:  # only accept a non-empty name
            return {"name": author_text}
    return None


def _extract_engagement(soup: BeautifulSoup) -> Dict[str, Optional[int]]:
    """Clap and response counts; both best-effort (None on any failure)."""
    claps: Optional[int] = None
    responses: Optional[int] = None

    try:
        clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
        if clap_el:
            txt = clap_el.get_text(strip=True)
            # Medium abbreviates large counts, e.g. "1.2K" -> 1200.
            if "K" in txt:
                claps = int(float(txt.replace("K", "")) * 1000)
            else:
                claps = int(txt)
    except Exception:
        pass

    try:
        resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
        if resp_el:
            responses = int(resp_el.get_text(strip=True))
    except Exception:
        pass

    return {"claps": claps, "responses": responses}


def _extract_markdown(soup: BeautifulSoup, fallback_description: Optional[str]) -> Optional[str]:
    """Convert the article body to markdown, with progressively weaker
    fallbacks for paywalled or Cloudflare-blocked pages."""
    markdown: Optional[str] = None

    article = soup.find("article") or soup.find("section")
    if article:
        # Strip interactive chrome before conversion.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        markdown = md(str(article), heading_style="ATX")

    # Fallback 1: salvage intro paragraphs that rendered before a paywall,
    # even when the <article> conversion failed or came back near-empty.
    if not markdown or len(markdown) < 100:
        paragraphs = soup.find_all("p")
        if paragraphs:
            intro_text = []
            for p in paragraphs[:10]:  # only scan the first 10 paragraphs
                text = p.get_text(strip=True)
                # Skip short/meta paragraphs ("5 min read", "2 days ago").
                if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
                    intro_text.append(text)
                if len(intro_text) >= 3:  # enough intro material
                    break

            if intro_text:
                combined_intro = "\n\n".join(intro_text)
                if not markdown:
                    markdown = combined_intro
                else:
                    # Content existed but was too short — append the intro.
                    markdown += "\n\n" + combined_intro

    # Fallback 2: meta description, prepended as a summary line.
    if not markdown or len(markdown) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            markdown = desc_text + "\n\n" + markdown if markdown else desc_text
        else:
            # Last resort: the plain name="description" meta tag.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                markdown = f"Summary: {meta_desc.get('content', '')}"

    return markdown


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.
    If extraction fails (Cloudflare/paywall), falls back to URL parsing.

    Args:
        soup: Parsed article page.
        url: Original article URL; used to recover title/author/publication
            metadata when the page itself is blocked or generic.

    Returns:
        Dict with keys ``markdownContent``, ``claps``, ``responses``,
        ``title``, ``author`` and ``publication``; each is None when it
        could not be determined.
    """
    # og:description is captured up front so the content fallback can use it.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    engagement = _extract_engagement(soup)

    content_data: Dict[str, Any] = {
        "markdownContent": _extract_markdown(soup, fallback_description),
        "claps": engagement["claps"],
        "responses": engagement["responses"],
        "title": _extract_title(soup, url),
        "author": _extract_author(soup),
        "publication": None,  # tracked separately from author
    }

    # URL-derived metadata — works even for fully blocked pages.
    if url:
        try:
            first_part = urlparse(url).path.strip("/").split("/")[0]
            # "@username" means a personal blog; anything else is a
            # publication slug (like "ai-in-plain-english").
            if first_part.startswith("@"):
                # Use the handle as author only when the page gave us none.
                if not content_data["author"]:
                    content_data["author"] = {"name": first_part[1:].replace("-", " ").title()}
            else:
                content_data["publication"] = first_part.replace("-", " ").title()
                # Deliberately NOT used as author — a publication is not a
                # person, even for blocked pages.
        except Exception:
            pass

    return content_data