Spaces:

T0X1N
/

Medium-MCP

Sleeping

Medium-MCP / src /author_profile.py

Nikhil Pravin Pise

feat: comprehensive migration - merge Scraper + MCP Server

ae588db 4 months ago

4.16 kB

	import httpx
	from typing import Dict, Any, Optional
	import logging
	from src.extractor import extract_from_apollo_state
	from src.config import Config

	logger = logging.getLogger("AuthorProfile")

	async def scrape_author_profile(username: str) -> Optional[Dict[str, Any]]:
	"""
	Actively scrapes the author's profile and 'About' page to build a DNA profile.
	"""
	if not username:
	return None

	# Remove @ if present
	username = username.replace("@", "")

	profile_data = {
	"username": username,
	"about_page_data": None,
	"latest_headlines": []
	}

	# 1. Scrape About Page
	about_url = f"https://medium.com/@{username}/about"
	try:
	async with httpx.AsyncClient(timeout=Config.TIMEOUT_MS/1000, follow_redirects=True) as client:
	resp = await client.get(about_url, headers=Config.get_headers())
	if resp.status_code == 200:
	# Try Apollo first
	apollo_data = extract_from_apollo_state(resp.text)
	if apollo_data:
	# In Apollo state of About page, we look for the User object
	# The extract_from_apollo_state function is tuned for Articles (looking for Post).
	# We might need to extract User data manually here if the helper fails.
	# But let's see if we can reuse the raw json_state.
	raw_state = apollo_data.get("json_state") # This is actually the Post object in the helper.
	# We need the FULL state.
	pass

	# If helper didn't give us what we want (because it looks for Post),
	# we should probably parse the raw Apollo state ourselves here for User data.
	# Let's do a custom extraction for User.
	import re
	import json
	pattern = r'window\.__APOLLO_STATE__\s=\s'
	match = re.search(pattern, resp.text)
	if match:
	json_str = resp.text[match.end():]
	data, _ = json.JSONDecoder().raw_decode(json_str)

	# Find User object
	user_obj = None
	for key, value in data.items():
	if key.startswith("User:") and value.get("username") == username:
	user_obj = value
	break

	if user_obj:
	profile_data["bio"] = user_obj.get("bio")
	profile_data["followerCount"] = user_obj.get("socialStats", {}).get("followerCount")
	profile_data["createdAt"] = user_obj.get("createdAt")
	profile_data["isBookAuthor"] = user_obj.get("isBookAuthor")

	except Exception as e:
	logger.warning(f"Failed to scrape About page for {username}: {e}")

	# 2. Scrape Profile Page for Headlines
	profile_url = f"https://medium.com/@{username}"
	try:
	async with httpx.AsyncClient(timeout=Config.TIMEOUT_MS/1000, follow_redirects=True) as client:
	resp = await client.get(profile_url, headers=Config.get_headers())
	if resp.status_code == 200:
	import re
	import json
	pattern = r'window\.__APOLLO_STATE__\s=\s'
	match = re.search(pattern, resp.text)
	if match:
	json_str = resp.text[match.end():]
	data, _ = json.JSONDecoder().raw_decode(json_str)

	# Find Posts
	headlines = []
	for key, value in data.items():
	if key.startswith("Post:") and value.get("title"):
	headlines.append(value.get("title"))

	profile_data["latest_headlines"] = headlines[:10]

	except Exception as e:
	logger.warning(f"Failed to scrape Profile page for {username}: {e}")

	return profile_data