# Medium-MCP / src/author_profile.py
# Author: Nikhil Pravin Pise
# feat: comprehensive migration - merge Scraper + MCP Server (commit ae588db)
import json
import logging
import re
from typing import Any, Dict, Optional

import httpx

from src.config import Config
from src.extractor import extract_from_apollo_state
logger = logging.getLogger("AuthorProfile")
# Compiled once at import time; the same marker is used on both pages.
_APOLLO_PATTERN = re.compile(r'window\.__APOLLO_STATE__\s*=\s*')


def _parse_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """Extract the raw ``window.__APOLLO_STATE__`` object from a Medium page.

    Returns the decoded top-level dict, or None when the marker is absent or
    the trailing JSON cannot be decoded.
    """
    match = _APOLLO_PATTERN.search(html)
    if not match:
        return None
    try:
        # raw_decode tolerates trailing JS (e.g. a ";" and further script) after the object.
        data, _ = json.JSONDecoder().raw_decode(html[match.end():])
    except ValueError:
        return None
    return data if isinstance(data, dict) else None


def _find_user(state: Dict[str, Any], username: str) -> Optional[Dict[str, Any]]:
    """Return the "User:*" record in *state* whose username matches, or None."""
    for key, value in state.items():
        if key.startswith("User:") and isinstance(value, dict) and value.get("username") == username:
            return value
    return None


def _collect_headlines(state: Dict[str, Any], limit: int = 10) -> list:
    """Collect up to *limit* non-empty post titles from "Post:*" records in *state*."""
    titles = []
    for key, value in state.items():
        if key.startswith("Post:") and isinstance(value, dict) and value.get("title"):
            titles.append(value["title"])
            if len(titles) >= limit:
                break
    return titles


async def scrape_author_profile(username: str) -> Optional[Dict[str, Any]]:
    """
    Actively scrapes the author's profile and 'About' page to build a DNA profile.

    Args:
        username: Medium username, with or without a leading "@".

    Returns:
        A dict with keys "username", "about_page_data" and "latest_headlines",
        plus "bio", "followerCount", "createdAt" and "isBookAuthor" when the
        About page yields a matching User record. Returns None for a falsy
        username. Scraping is best-effort: network or parse failures are
        logged as warnings and the partially-filled dict is still returned.
    """
    if not username:
        return None

    # Normalize: callers may pass "@user" or "user".
    username = username.replace("@", "")

    profile_data: Dict[str, Any] = {
        "username": username,
        "about_page_data": None,
        "latest_headlines": [],
    }

    timeout = Config.TIMEOUT_MS / 1000
    headers = Config.get_headers()

    # One client serves both requests (connection reuse) instead of two.
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        # 1. About page -> User metadata (bio, follower count, ...).
        try:
            resp = await client.get(f"https://medium.com/@{username}/about", headers=headers)
            if resp.status_code == 200:
                state = _parse_apollo_state(resp.text)
                if state:
                    user_obj = _find_user(state, username)
                    if user_obj:
                        profile_data["bio"] = user_obj.get("bio")
                        profile_data["followerCount"] = user_obj.get("socialStats", {}).get("followerCount")
                        profile_data["createdAt"] = user_obj.get("createdAt")
                        profile_data["isBookAuthor"] = user_obj.get("isBookAuthor")
        except Exception as e:
            logger.warning(f"Failed to scrape About page for {username}: {e}")

        # 2. Profile page -> latest post headlines.
        try:
            resp = await client.get(f"https://medium.com/@{username}", headers=headers)
            if resp.status_code == 200:
                state = _parse_apollo_state(resp.text)
                if state:
                    profile_data["latest_headlines"] = _collect_headlines(state, limit=10)
        except Exception as e:
            logger.warning(f"Failed to scrape Profile page for {username}: {e}")

    return profile_data