""" Medium URL Resolver Comprehensive URL resolution for Medium articles including: - Short link resolution (link.medium.com) - Tracking URL parsing (Facebook, Google) - Medium email redirect handling - Domain validation (35+ known domains) Ported from Freedium's medium-parser/utils.py """ import logging import re import string from functools import lru_cache from typing import Optional from urllib.parse import parse_qs, urlparse try: import aiohttp HAS_AIOHTTP = True except ImportError: HAS_AIOHTTP = False import httpx try: import tld HAS_TLD = True except ImportError: HAS_TLD = False logger = logging.getLogger("URLResolver") # Valid characters for Medium post IDs VALID_ID_CHARS = set(string.ascii_letters + string.digits) # Known Medium custom domains (subdomains) # Source: medium-parser/utils.py KNOWN_MEDIUM_CUSTOM_DOMAINS = ( "javascript.plainenglish.io", "blog.llamaindex.ai", "code.likeagirl.io", "medium.datadriveninvestor.com", "blog.det.life", "python.plainenglish.io", "blog.stackademic.com", "ai.gopubby.com", "blog.devops.dev", "levelup.gitconnected.com", "betterhumans.coach.me", "ai.plainenglish.io", ) # Known Medium main domains # Source: medium-parser/utils.py KNOWN_MEDIUM_DOMAINS = ( "medium.com", "uxplanet.org", "osintteam.blog", "ahmedelfakharany.com", "drlee.io", "artificialcorner.com", "generativeai.pub", "productcoalition.com", "towardsdev.com", "infosecwriteups.com", "towardsdatascience.com", "thetaoist.online", "devopsquare.com", "laceydearie.com", "bettermarketing.pub", "itnext.io", "eand.co", "betterprogramming.pub", "curiouse.co", "betterhumans.pub", "uxdesign.cc", "thebolditalic.com", "arcdigital.media", "codeburst.io", "psiloveyou.xyz", "writingcooperative.com", "entrepreneurshandbook.co", "prototypr.io", "theascent.pub", "storiusmag.com", ) # Domains that are NOT Medium (to avoid false positives) # Source: medium-parser/utils.py NOT_MEDIUM_DOMAINS = ( "github.com", "yandex.ru", "yandex.kz", "youtube.com", "nytimes.com", "wsj.com", "reddit.com", "elpais.com", "forbes.com", "bloomberg.com", "lesechos.fr", "otz.de", "businessinsider.com", "buff.ly", "delish.com", "economist.com", "wired.com", "rollingstone.com", ) # Domains that are proxies/redirects (need special handling) REDIRECT_DOMAINS = ("12ft.io", "google.com", "facebook.com", "googleusercontent.com") @lru_cache(maxsize=500) def un_wwwify(url: str) -> str: """Remove 'www.' prefix from URL/domain.""" if url.startswith("www."): return url.removeprefix("www.") return url def unquerify_url(url: str) -> str: """Remove all query parameters from URL.""" import urllib.parse parsed_url = urllib.parse.urlparse(url) if parsed_url.query: parsed_url = parsed_url._replace(query="") sanitized_url = urllib.parse.urlunparse(parsed_url) return sanitized_url.removesuffix("/") def unpaginate_url(url: str) -> str: """Remove page pagination from URL.""" sanitized_url = url.removesuffix("/page/2") return sanitized_url.removesuffix("/") def correct_url(url: str) -> str: """ Correct common URL issues. - Removes query parameters - Removes pagination suffixes """ unquerified_url = unquerify_url(url) unpaginated_url = unpaginate_url(unquerified_url) return unpaginated_url @lru_cache(maxsize=100) def basic_hex_check(hex_string: str) -> bool: """ Check if string is a valid Medium post ID. Post IDs are 8-12 character alphanumeric strings. """ # Check if all characters are valid for char in hex_string: if char not in VALID_ID_CHARS: return False # Check length (8-12 characters) if len(hex_string) not in range(8, 12 + 1): return False return True @lru_cache(maxsize=100) def extract_hex_string(input_string: str) -> Optional[str]: """ Extract Medium post ID from URL path. Uses two-stage regex matching: 1. Find hex string preceded by '-' (most reliable) 2. Find any hex string (fallback) Returns: The extracted post ID, or None if not found """ # Stage 1: Find hex string preceded by '-' match = re.findall(r"-(\b[a-fA-F0-9]{8,12}\b)", input_string) if not match: # Stage 2: Find hex string without '-' match = re.findall(r"(\b[a-fA-F0-9]{8,12}\b)", input_string) return match[-1] if match else None @lru_cache(maxsize=100) def is_valid_post_id(hex_string: str) -> bool: """Check if string is a valid Medium post ID.""" return extract_hex_string(hex_string) is not None def get_fld(url: str) -> Optional[str]: """Get first-level domain from URL.""" if HAS_TLD: try: return tld.get_fld(url) except Exception: return None else: # Fallback: simple domain extraction try: parsed = urlparse(url) parts = parsed.netloc.split(".") if len(parts) >= 2: return ".".join(parts[-2:]) return parsed.netloc except Exception: return None def is_valid_url(url: str) -> bool: """Check if URL has valid scheme and netloc.""" fld = get_fld(url) if not fld: return False parsed_url = urlparse(url) return bool(parsed_url.scheme and parsed_url.netloc) async def resolve_medium_short_link(short_url_id: str, timeout: int = 5) -> str: """ Resolve Medium short link (link.medium.com) to full URL. Uses rsci.app.link service for resolution. """ resolve_url = f"https://rsci.app.link/{short_url_id}" headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36" } if HAS_AIOHTTP: async with aiohttp.ClientSession() as session: async with session.get( resolve_url, headers=headers, timeout=aiohttp.ClientTimeout(total=timeout), allow_redirects=False, ) as response: return response.headers.get("Location", "") else: async with httpx.AsyncClient(timeout=timeout, follow_redirects=False) as client: response = await client.get(resolve_url, headers=headers) return response.headers.get("Location", "") async def resolve_medium_url(url: str, timeout: int = 5) -> Optional[str]: """ Resolve various URL formats to Medium post ID. Handles: - Mobile links (/p/post_id) - Short links (link.medium.com) - Facebook tracking links - Google tracking/cache links - 12ft.io proxy links - Medium email redirect links Args: url: The URL to resolve timeout: Request timeout for short link resolution Returns: The extracted post ID, or None if resolution failed """ logger.debug(f"Resolving URL: {url}") parsed_url = urlparse(url) parsed_netloc = un_wwwify(parsed_url.netloc) # Mobile link: /p/post_id if parsed_url.path.startswith("/p/"): logger.debug("URL is Medium mobile link") post_id = parsed_url.path.rsplit("/p/")[1] # Clean any trailing path segments post_id = post_id.split("/")[0] if basic_hex_check(post_id): return post_id # Facebook tracking link: l.facebook.com/l.php?u=... elif parsed_netloc == "l.facebook.com" and parsed_url.path.startswith("/l.php"): logger.debug("URL is Facebook tracking link") parsed_query = parse_qs(parsed_url.query) if parsed_query.get("u") and len(parsed_query["u"]) == 1: post_url = parsed_query["u"][0] return await resolve_medium_url(post_url, timeout) logger.warning("Facebook link missing 'u' parameter") return None # Google Web Cache: webcache.googleusercontent.com/search?q=cache:... elif ( parsed_netloc == "webcache.googleusercontent.com" and parsed_url.path.startswith("/search") ): logger.debug("URL is Google Web Cache link") parsed_query = parse_qs(parsed_url.query) if parsed_query.get("q") and len(parsed_query["q"]) == 1: post_url = parsed_query["q"][0].removeprefix("cache:") return await resolve_medium_url(post_url, timeout) logger.warning("Google cache link missing 'q' parameter") return None # Google tracking link: google.com/url?url=... or ?q=... elif parsed_netloc == "google.com" and parsed_url.path.startswith("/url"): logger.debug("URL is Google tracking link") parsed_query = parse_qs(parsed_url.query) if parsed_query.get("url") and len(parsed_query["url"]) == 1: post_url = parsed_query["url"][0] return await resolve_medium_url(post_url, timeout) elif parsed_query.get("q") and len(parsed_query["q"]) == 1: post_url = parsed_query["q"][0] return await resolve_medium_url(post_url, timeout) logger.warning("Google link missing 'url' or 'q' parameter") return None # 12ft.io proxy: 12ft.io?q=... elif parsed_netloc == "12ft.io": logger.debug("URL is 12ft.io proxy link") parsed_query = parse_qs(parsed_url.query) if parsed_query.get("q") and len(parsed_query["q"]) == 1: post_url = parsed_query["q"][0] return await resolve_medium_url(post_url, timeout) logger.warning("12ft.io link missing 'q' parameter") return None # Medium email redirect: /m/global-identity-2?redirectUrl=... elif parsed_url.path.startswith("/m/global-identity-2"): logger.debug("URL is Medium email redirect link") parsed_query = parse_qs(parsed_url.query) if parsed_query.get("redirectUrl") and len(parsed_query["redirectUrl"]) == 1: post_url = parsed_query["redirectUrl"][0] return await resolve_medium_url(post_url, timeout) logger.warning("Medium redirect missing 'redirectUrl' parameter") return None # Medium short link: link.medium.com/xyz elif parsed_netloc == "link.medium.com": logger.debug("URL is Medium short link") short_url_id = parsed_url.path.removeprefix("/") if short_url_id: try: post_url = await resolve_medium_short_link(short_url_id, timeout) if post_url: return await resolve_medium_url(post_url, timeout) except Exception as e: logger.warning(f"Failed to resolve short link: {e}") return None # Standard URL: extract post_id from end of path else: logger.debug("Extracting post ID from standard URL path") post_url = parsed_url.path.split("/")[-1] post_id = post_url.split("-")[-1] if basic_hex_check(post_id): return post_id # Try multi-stage extraction extracted = extract_hex_string(parsed_url.path) if extracted: return extracted logger.warning(f"Could not extract valid post ID from URL: {url}") return None async def is_valid_medium_url(url: str) -> bool: """ Check if URL is a valid Medium article URL. Checks domain against known Medium domains and custom domains. Returns False for known non-Medium domains. Args: url: The URL to validate Returns: True if URL is a valid Medium URL, False otherwise """ domain = get_fld(url) if not domain: return False parsed_url = urlparse(url) domain_netloc = un_wwwify(parsed_url.netloc) # Accept redirect/proxy domains (need special handling) if domain in REDIRECT_DOMAINS: return True # Reject known non-Medium domains if domain in NOT_MEDIUM_DOMAINS or domain_netloc in NOT_MEDIUM_DOMAINS: logger.debug(f"URL domain {domain} is in NOT_MEDIUM_DOMAINS") return False # Accept known Medium domains if domain in KNOWN_MEDIUM_DOMAINS or domain_netloc in KNOWN_MEDIUM_CUSTOM_DOMAINS: return True # For unknown domains, try to resolve and check for valid post ID logger.debug(f"URL domain {domain} not in known lists, attempting resolution") try: post_id = await resolve_medium_url(url) return bool(post_id) except Exception as e: logger.warning(f"Failed to validate unknown domain: {e}") return False def extract_post_id_from_url(url: str) -> Optional[str]: """ Synchronous post ID extraction from URL. For simple URL patterns that don't require network resolution. Use resolve_medium_url for full URL resolution. """ parsed_url = urlparse(url) # Mobile link if parsed_url.path.startswith("/p/"): post_id = parsed_url.path.rsplit("/p/")[1].split("/")[0] if basic_hex_check(post_id): return post_id # Standard URL path_parts = parsed_url.path.strip("/").split("/") if path_parts: last_part = path_parts[-1] # Try to extract from slug-postid format post_id = last_part.split("-")[-1] if basic_hex_check(post_id): return post_id # Try multi-stage extraction extracted = extract_hex_string(last_part) if extracted: return extracted return None