"""
Medium URL Resolver
Comprehensive URL resolution for Medium articles including:
- Short link resolution (link.medium.com)
- Tracking URL parsing (Facebook, Google)
- Medium email redirect handling
- Domain validation (35+ known domains)
Ported from Freedium's medium-parser/utils.py
"""
import logging
import re
import string
from functools import lru_cache
from typing import Optional
from urllib.parse import parse_qs, urlparse, urlunparse
try:
import aiohttp
HAS_AIOHTTP = True
except ImportError:
HAS_AIOHTTP = False
import httpx
try:
import tld
HAS_TLD = True
except ImportError:
HAS_TLD = False
logger = logging.getLogger("URLResolver")
# Valid characters for Medium post IDs
VALID_ID_CHARS = set(string.ascii_letters + string.digits)
# Known Medium custom domains (subdomains)
# Source: medium-parser/utils.py
KNOWN_MEDIUM_CUSTOM_DOMAINS = (
"javascript.plainenglish.io",
"blog.llamaindex.ai",
"code.likeagirl.io",
"medium.datadriveninvestor.com",
"blog.det.life",
"python.plainenglish.io",
"blog.stackademic.com",
"ai.gopubby.com",
"blog.devops.dev",
"levelup.gitconnected.com",
"betterhumans.coach.me",
"ai.plainenglish.io",
)
# Known Medium main domains
# Source: medium-parser/utils.py
KNOWN_MEDIUM_DOMAINS = (
"medium.com",
"uxplanet.org",
"osintteam.blog",
"ahmedelfakharany.com",
"drlee.io",
"artificialcorner.com",
"generativeai.pub",
"productcoalition.com",
"towardsdev.com",
"infosecwriteups.com",
"towardsdatascience.com",
"thetaoist.online",
"devopsquare.com",
"laceydearie.com",
"bettermarketing.pub",
"itnext.io",
"eand.co",
"betterprogramming.pub",
"curiouse.co",
"betterhumans.pub",
"uxdesign.cc",
"thebolditalic.com",
"arcdigital.media",
"codeburst.io",
"psiloveyou.xyz",
"writingcooperative.com",
"entrepreneurshandbook.co",
"prototypr.io",
"theascent.pub",
"storiusmag.com",
)
# Domains that are NOT Medium (to avoid false positives)
# Source: medium-parser/utils.py
NOT_MEDIUM_DOMAINS = (
"github.com",
"yandex.ru",
"yandex.kz",
"youtube.com",
"nytimes.com",
"wsj.com",
"reddit.com",
"elpais.com",
"forbes.com",
"bloomberg.com",
"lesechos.fr",
"otz.de",
"businessinsider.com",
"buff.ly",
"delish.com",
"economist.com",
"wired.com",
"rollingstone.com",
)
# Domains that are proxies/redirects (need special handling)
REDIRECT_DOMAINS = ("12ft.io", "google.com", "facebook.com", "googleusercontent.com")
@lru_cache(maxsize=500)
def un_wwwify(url: str) -> str:
    """Remove a leading 'www.' prefix from a URL/domain string."""
    return url.removeprefix("www.")
def unquerify_url(url: str) -> str:
    """Remove all query parameters from URL."""
    parsed_url = urlparse(url)
    if parsed_url.query:
        parsed_url = parsed_url._replace(query="")
    sanitized_url = urlunparse(parsed_url)
    return sanitized_url.removesuffix("/")
def unpaginate_url(url: str) -> str:
"""Remove page pagination from URL."""
sanitized_url = url.removesuffix("/page/2")
return sanitized_url.removesuffix("/")
def correct_url(url: str) -> str:
"""
Correct common URL issues.
- Removes query parameters
- Removes pagination suffixes
"""
unquerified_url = unquerify_url(url)
unpaginated_url = unpaginate_url(unquerified_url)
return unpaginated_url
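# Illustrative behaviour of the URL-cleaning helpers above; the addresses and
# IDs are made up for the example, not real articles:
#   unquerify_url("https://medium.com/@user/my-post-1a2b3c4d?source=rss")
#       -> "https://medium.com/@user/my-post-1a2b3c4d"
#   unpaginate_url("https://medium.com/tag/python/page/2")
#       -> "https://medium.com/tag/python"
#   correct_url("https://medium.com/@user/my-post-1a2b3c4d?sk=abc123")
#       -> "https://medium.com/@user/my-post-1a2b3c4d"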
@lru_cache(maxsize=100)
def basic_hex_check(hex_string: str) -> bool:
    """
    Check if a string looks like a valid Medium post ID.
    Post IDs are 8-12 character alphanumeric strings (despite the function
    name, the check is not limited to hexadecimal digits).
    """
    # Check length (8-12 characters)
    if not (8 <= len(hex_string) <= 12):
        return False
    # Check that every character is alphanumeric
    return all(char in VALID_ID_CHARS for char in hex_string)
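# Quick sanity examples for the ID check (IDs below are fabricated):
#   basic_hex_check("1a2b3c4d5e6f")  -> True   (12 alphanumeric chars)
#   basic_hex_check("1a2b3c4")       -> False  (too short: 7 chars)
#   basic_hex_check("1a2b3c4d5e6f7") -> False  (too long: 13 chars)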
@lru_cache(maxsize=100)
def extract_hex_string(input_string: str) -> Optional[str]:
"""
Extract Medium post ID from URL path.
Uses two-stage regex matching:
1. Find hex string preceded by '-' (most reliable)
2. Find any hex string (fallback)
Returns:
The extracted post ID, or None if not found
"""
# Stage 1: Find hex string preceded by '-'
match = re.findall(r"-(\b[a-fA-F0-9]{8,12}\b)", input_string)
if not match:
# Stage 2: Find hex string without '-'
match = re.findall(r"(\b[a-fA-F0-9]{8,12}\b)", input_string)
return match[-1] if match else None
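# The two regex stages in practice (paths are illustrative only):
#   extract_hex_string("/my-article-1a2b3c4d5e6f") -> "1a2b3c4d5e6f"  (stage 1: preceded by '-')
#   extract_hex_string("/p/deadbeef01")            -> "deadbeef01"    (stage 2: bare hex fallback)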
@lru_cache(maxsize=100)
def is_valid_post_id(hex_string: str) -> bool:
"""Check if string is a valid Medium post ID."""
return extract_hex_string(hex_string) is not None
def get_fld(url: str) -> Optional[str]:
"""Get first-level domain from URL."""
if HAS_TLD:
try:
return tld.get_fld(url)
except Exception:
return None
else:
# Fallback: simple domain extraction
try:
parsed = urlparse(url)
parts = parsed.netloc.split(".")
if len(parts) >= 2:
return ".".join(parts[-2:])
return parsed.netloc
except Exception:
return None
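# What the first-level-domain helper returns (example hosts only):
#   get_fld("https://medium.com/@user/post")   -> "medium.com"
#   get_fld("https://blog.det.life/some-post") -> "det.life" with `tld` installed;
#       the naive fallback also yields "det.life" here, but it can be wrong for
#       multi-part suffixes such as ".co.uk".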
def is_valid_url(url: str) -> bool:
"""Check if URL has valid scheme and netloc."""
fld = get_fld(url)
if not fld:
return False
parsed_url = urlparse(url)
return bool(parsed_url.scheme and parsed_url.netloc)
async def resolve_medium_short_link(short_url_id: str, timeout: int = 5) -> str:
"""
Resolve Medium short link (link.medium.com) to full URL.
Uses rsci.app.link service for resolution.
"""
resolve_url = f"https://rsci.app.link/{short_url_id}"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
if HAS_AIOHTTP:
async with aiohttp.ClientSession() as session:
async with session.get(
resolve_url,
headers=headers,
timeout=aiohttp.ClientTimeout(total=timeout),
allow_redirects=False,
) as response:
return response.headers.get("Location", "")
else:
async with httpx.AsyncClient(timeout=timeout, follow_redirects=False) as client:
response = await client.get(resolve_url, headers=headers)
return response.headers.get("Location", "")
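# Minimal usage sketch, assuming network access; the short-link ID below is a
# placeholder, not an actual link.medium.com slug:
#
#   import asyncio
#   full_url = asyncio.run(resolve_medium_short_link("AbCdEfGhIj"))
#   # -> the redirect's "Location" header, or "" if no redirect was returned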
async def resolve_medium_url(url: str, timeout: int = 5) -> Optional[str]:
"""
Resolve various URL formats to Medium post ID.
Handles:
- Mobile links (/p/post_id)
- Short links (link.medium.com)
- Facebook tracking links
- Google tracking/cache links
- 12ft.io proxy links
- Medium email redirect links
Args:
url: The URL to resolve
timeout: Request timeout for short link resolution
Returns:
The extracted post ID, or None if resolution failed
"""
logger.debug(f"Resolving URL: {url}")
parsed_url = urlparse(url)
parsed_netloc = un_wwwify(parsed_url.netloc)
# Mobile link: /p/post_id
if parsed_url.path.startswith("/p/"):
logger.debug("URL is Medium mobile link")
post_id = parsed_url.path.rsplit("/p/")[1]
# Clean any trailing path segments
post_id = post_id.split("/")[0]
if basic_hex_check(post_id):
return post_id
# Facebook tracking link: l.facebook.com/l.php?u=...
elif parsed_netloc == "l.facebook.com" and parsed_url.path.startswith("/l.php"):
logger.debug("URL is Facebook tracking link")
parsed_query = parse_qs(parsed_url.query)
if parsed_query.get("u") and len(parsed_query["u"]) == 1:
post_url = parsed_query["u"][0]
return await resolve_medium_url(post_url, timeout)
logger.warning("Facebook link missing 'u' parameter")
return None
# Google Web Cache: webcache.googleusercontent.com/search?q=cache:...
elif (
parsed_netloc == "webcache.googleusercontent.com"
and parsed_url.path.startswith("/search")
):
logger.debug("URL is Google Web Cache link")
parsed_query = parse_qs(parsed_url.query)
if parsed_query.get("q") and len(parsed_query["q"]) == 1:
post_url = parsed_query["q"][0].removeprefix("cache:")
return await resolve_medium_url(post_url, timeout)
logger.warning("Google cache link missing 'q' parameter")
return None
# Google tracking link: google.com/url?url=... or ?q=...
elif parsed_netloc == "google.com" and parsed_url.path.startswith("/url"):
logger.debug("URL is Google tracking link")
parsed_query = parse_qs(parsed_url.query)
if parsed_query.get("url") and len(parsed_query["url"]) == 1:
post_url = parsed_query["url"][0]
return await resolve_medium_url(post_url, timeout)
elif parsed_query.get("q") and len(parsed_query["q"]) == 1:
post_url = parsed_query["q"][0]
return await resolve_medium_url(post_url, timeout)
logger.warning("Google link missing 'url' or 'q' parameter")
return None
# 12ft.io proxy: 12ft.io?q=...
elif parsed_netloc == "12ft.io":
logger.debug("URL is 12ft.io proxy link")
parsed_query = parse_qs(parsed_url.query)
if parsed_query.get("q") and len(parsed_query["q"]) == 1:
post_url = parsed_query["q"][0]
return await resolve_medium_url(post_url, timeout)
logger.warning("12ft.io link missing 'q' parameter")
return None
# Medium email redirect: /m/global-identity-2?redirectUrl=...
elif parsed_url.path.startswith("/m/global-identity-2"):
logger.debug("URL is Medium email redirect link")
parsed_query = parse_qs(parsed_url.query)
if parsed_query.get("redirectUrl") and len(parsed_query["redirectUrl"]) == 1:
post_url = parsed_query["redirectUrl"][0]
return await resolve_medium_url(post_url, timeout)
logger.warning("Medium redirect missing 'redirectUrl' parameter")
return None
# Medium short link: link.medium.com/xyz
elif parsed_netloc == "link.medium.com":
logger.debug("URL is Medium short link")
short_url_id = parsed_url.path.removeprefix("/")
if short_url_id:
try:
post_url = await resolve_medium_short_link(short_url_id, timeout)
if post_url:
return await resolve_medium_url(post_url, timeout)
except Exception as e:
logger.warning(f"Failed to resolve short link: {e}")
return None
# Standard URL: extract post_id from end of path
else:
logger.debug("Extracting post ID from standard URL path")
        last_segment = parsed_url.path.split("/")[-1]
        post_id = last_segment.split("-")[-1]
if basic_hex_check(post_id):
return post_id
# Try multi-stage extraction
extracted = extract_hex_string(parsed_url.path)
if extracted:
return extracted
logger.warning(f"Could not extract valid post ID from URL: {url}")
return None
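# Illustrative resolutions (URLs and IDs are fabricated; only the short-link
# branch performs a network request):
#
#   await resolve_medium_url("https://medium.com/p/1a2b3c4d5e6f")
#       -> "1a2b3c4d5e6f"   # mobile link
#   await resolve_medium_url("https://medium.com/@user/my-post-1a2b3c4d5e6f")
#       -> "1a2b3c4d5e6f"   # standard slug-postid path
#   await resolve_medium_url("https://l.facebook.com/l.php?u=https%3A%2F%2Fmedium.com%2Fp%2F1a2b3c4d5e6f")
#       -> "1a2b3c4d5e6f"   # unwrapped from the 'u' parameter, then resolved recursively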
async def is_valid_medium_url(url: str) -> bool:
"""
Check if URL is a valid Medium article URL.
Checks domain against known Medium domains and custom domains.
Returns False for known non-Medium domains.
Args:
url: The URL to validate
Returns:
True if URL is a valid Medium URL, False otherwise
"""
domain = get_fld(url)
if not domain:
return False
parsed_url = urlparse(url)
domain_netloc = un_wwwify(parsed_url.netloc)
# Accept redirect/proxy domains (need special handling)
if domain in REDIRECT_DOMAINS:
return True
# Reject known non-Medium domains
if domain in NOT_MEDIUM_DOMAINS or domain_netloc in NOT_MEDIUM_DOMAINS:
logger.debug(f"URL domain {domain} is in NOT_MEDIUM_DOMAINS")
return False
# Accept known Medium domains
if domain in KNOWN_MEDIUM_DOMAINS or domain_netloc in KNOWN_MEDIUM_CUSTOM_DOMAINS:
return True
# For unknown domains, try to resolve and check for valid post ID
logger.debug(f"URL domain {domain} not in known lists, attempting resolution")
try:
post_id = await resolve_medium_url(url)
return bool(post_id)
except Exception as e:
logger.warning(f"Failed to validate unknown domain: {e}")
return False
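# Expected outcomes for the validator (domains come from the lists above; the
# article paths and the unknown domain are made up):
#
#   await is_valid_medium_url("https://medium.com/@user/post-1a2b3c4d5e6f")     -> True   (known Medium domain)
#   await is_valid_medium_url("https://github.com/someorg/somerepo")            -> False  (known non-Medium domain)
#   await is_valid_medium_url("https://example-unknown.blog/post-1a2b3c4d5e6f") -> True   (unknown domain, but a
#       valid-looking post ID is extracted during the fallback resolution)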
def extract_post_id_from_url(url: str) -> Optional[str]:
"""
Synchronous post ID extraction from URL.
For simple URL patterns that don't require network resolution.
Use resolve_medium_url for full URL resolution.
"""
parsed_url = urlparse(url)
# Mobile link
if parsed_url.path.startswith("/p/"):
post_id = parsed_url.path.rsplit("/p/")[1].split("/")[0]
if basic_hex_check(post_id):
return post_id
# Standard URL
path_parts = parsed_url.path.strip("/").split("/")
if path_parts:
last_part = path_parts[-1]
# Try to extract from slug-postid format
post_id = last_part.split("-")[-1]
if basic_hex_check(post_id):
return post_id
# Try multi-stage extraction
extracted = extract_hex_string(last_part)
if extracted:
return extracted
return None
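# Minimal self-check when the module is run directly. The article URL is a
# made-up example; the standard-path branch needs no network access, so this
# only exercises the pure-Python resolution logic.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        sample = "https://medium.com/@someone/an-example-slug-1a2b3c4d5e6f"
        print("sync extract :", extract_post_id_from_url(sample))
        print("async resolve:", await resolve_medium_url(sample))
        print("valid medium :", await is_valid_medium_url(sample))

    asyncio.run(_demo())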