Spaces:
Sleeping
Sleeping
| """ | |
| Medium URL Resolver | |
| Comprehensive URL resolution for Medium articles including: | |
| - Short link resolution (link.medium.com) | |
| - Tracking URL parsing (Facebook, Google) | |
| - Medium email redirect handling | |
| - Domain validation (35+ known domains) | |
| Ported from Freedium's medium-parser/utils.py | |
| """ | |
import logging
import re
import string
from functools import lru_cache
from typing import Optional
from urllib.parse import parse_qs, urlparse, urlunparse

try:
    import aiohttp
    HAS_AIOHTTP = True
except ImportError:
    HAS_AIOHTTP = False

import httpx

try:
    import tld
    HAS_TLD = True
except ImportError:
    HAS_TLD = False
# Module-level logger for all URL-resolution diagnostics.
logger = logging.getLogger("URLResolver")

# Valid characters for Medium post IDs (ASCII letters and digits only).
VALID_ID_CHARS = set(string.ascii_letters + string.digits)

# Known Medium custom domains (subdomains of publications hosted on Medium).
# Source: medium-parser/utils.py
KNOWN_MEDIUM_CUSTOM_DOMAINS = (
    "javascript.plainenglish.io",
    "blog.llamaindex.ai",
    "code.likeagirl.io",
    "medium.datadriveninvestor.com",
    "blog.det.life",
    "python.plainenglish.io",
    "blog.stackademic.com",
    "ai.gopubby.com",
    "blog.devops.dev",
    "levelup.gitconnected.com",
    "betterhumans.coach.me",
    "ai.plainenglish.io",
)

# Known Medium main domains (first-level domains of Medium publications).
# Source: medium-parser/utils.py
KNOWN_MEDIUM_DOMAINS = (
    "medium.com",
    "uxplanet.org",
    "osintteam.blog",
    "ahmedelfakharany.com",
    "drlee.io",
    "artificialcorner.com",
    "generativeai.pub",
    "productcoalition.com",
    "towardsdev.com",
    "infosecwriteups.com",
    "towardsdatascience.com",
    "thetaoist.online",
    "devopsquare.com",
    "laceydearie.com",
    "bettermarketing.pub",
    "itnext.io",
    "eand.co",
    "betterprogramming.pub",
    "curiouse.co",
    "betterhumans.pub",
    "uxdesign.cc",
    "thebolditalic.com",
    "arcdigital.media",
    "codeburst.io",
    "psiloveyou.xyz",
    "writingcooperative.com",
    "entrepreneurshandbook.co",
    "prototypr.io",
    "theascent.pub",
    "storiusmag.com",
)

# Domains that are definitely NOT Medium (early-reject list to avoid
# false positives during domain validation).
# Source: medium-parser/utils.py
NOT_MEDIUM_DOMAINS = (
    "github.com",
    "yandex.ru",
    "yandex.kz",
    "youtube.com",
    "nytimes.com",
    "wsj.com",
    "reddit.com",
    "elpais.com",
    "forbes.com",
    "bloomberg.com",
    "lesechos.fr",
    "otz.de",
    "businessinsider.com",
    "buff.ly",
    "delish.com",
    "economist.com",
    "wired.com",
    "rollingstone.com",
)

# Domains that wrap/redirect to Medium articles and need unwrapping
# in resolve_medium_url before a post ID can be extracted.
REDIRECT_DOMAINS = ("12ft.io", "google.com", "facebook.com", "googleusercontent.com")
def un_wwwify(url: str) -> str:
    """Remove a leading 'www.' prefix from a URL/domain, if present.

    Args:
        url: A URL or bare domain string.

    Returns:
        The input without a leading 'www.'; unchanged when no prefix exists.
    """
    # str.removeprefix is already a no-op when the prefix is absent,
    # so the startswith() guard in the original was redundant.
    return url.removeprefix("www.")
def unquerify_url(url: str) -> str:
    """Remove all query parameters and any trailing slash from a URL.

    Args:
        url: The URL to sanitize.

    Returns:
        The URL with its query string dropped and a trailing '/' removed.
    """
    # The file already imports from urllib.parse at module level; the
    # original's function-local `import urllib.parse` was redundant.
    parsed_url = urlparse(url)
    if parsed_url.query:
        # _replace works because ParseResult is a named tuple.
        parsed_url = parsed_url._replace(query="")
    return urlunparse(parsed_url).removesuffix("/")
def unpaginate_url(url: str) -> str:
    """Strip a '/page/2' pagination suffix (and any trailing slash) from a URL."""
    return url.removesuffix("/page/2").removesuffix("/")
def correct_url(url: str) -> str:
    """
    Correct common URL issues.

    Applies, in order:
    - query-parameter removal (unquerify_url)
    - pagination-suffix removal (unpaginate_url)
    """
    return unpaginate_url(unquerify_url(url))
def basic_hex_check(hex_string: str) -> bool:
    """
    Check if string is a valid Medium post ID.

    Post IDs are 8-12 character strings of ASCII letters and digits
    (see VALID_ID_CHARS).

    Args:
        hex_string: Candidate post ID.

    Returns:
        True if the string has valid length and characters, else False.
    """
    # Cheap length check first, then a set-subset test instead of the
    # original per-character loop with a range() membership test.
    if not 8 <= len(hex_string) <= 12:
        return False
    return set(hex_string) <= VALID_ID_CHARS
def extract_hex_string(input_string: str) -> Optional[str]:
    """
    Extract a Medium post ID (8-12 hex chars) from a URL path.

    Two-stage regex search:
    1. Prefer a hex run immediately preceded by '-' (slug-id format).
    2. Otherwise accept any standalone hex run.

    Returns:
        The last matching post ID, or None when nothing matches.
    """
    hits = re.findall(r"-(\b[a-fA-F0-9]{8,12}\b)", input_string)
    if hits:
        return hits[-1]
    hits = re.findall(r"(\b[a-fA-F0-9]{8,12}\b)", input_string)
    if hits:
        return hits[-1]
    return None
def is_valid_post_id(hex_string: str) -> bool:
    """Check if string contains a valid Medium post ID."""
    candidate = extract_hex_string(hex_string)
    return candidate is not None
def get_fld(url: str) -> Optional[str]:
    """
    Get the first-level domain of a URL.

    Uses the `tld` package when available; otherwise falls back to a
    naive "last two dot-separated labels of the netloc" heuristic.
    Returns None when extraction fails.
    """
    if HAS_TLD:
        try:
            return tld.get_fld(url)
        except Exception:
            return None
    # Fallback path: no public-suffix awareness, just label counting.
    try:
        netloc = urlparse(url).netloc
        labels = netloc.split(".")
        return ".".join(labels[-2:]) if len(labels) >= 2 else netloc
    except Exception:
        return None
def is_valid_url(url: str) -> bool:
    """Check that a URL has an extractable domain plus a scheme and netloc."""
    if not get_fld(url):
        return False
    parsed = urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)
async def resolve_medium_short_link(short_url_id: str, timeout: int = 5) -> str:
    """
    Resolve a Medium short link (link.medium.com) to its full URL.

    Issues a non-redirecting GET against the rsci.app.link service and
    returns the 'Location' response header ('' when absent).

    Args:
        short_url_id: Path component of the short link.
        timeout: Total request timeout in seconds.
    """
    target = f"https://rsci.app.link/{short_url_id}"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    if not HAS_AIOHTTP:
        # httpx fallback when aiohttp is not installed.
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=False) as client:
            resp = await client.get(target, headers=request_headers)
            return resp.headers.get("Location", "")
    async with aiohttp.ClientSession() as session:
        async with session.get(
            target,
            headers=request_headers,
            timeout=aiohttp.ClientTimeout(total=timeout),
            allow_redirects=False,
        ) as resp:
            return resp.headers.get("Location", "")
async def resolve_medium_url(url: str, timeout: int = 5) -> Optional[str]:
    """
    Resolve various URL formats to Medium post ID.

    Handles:
    - Mobile links (/p/post_id)
    - Short links (link.medium.com)
    - Facebook tracking links
    - Google tracking/cache links
    - 12ft.io proxy links
    - Medium email redirect links

    Tracking/proxy/redirect URLs are unwrapped and fed back into this
    function recursively until a post ID can be extracted.

    Args:
        url: The URL to resolve
        timeout: Request timeout for short link resolution

    Returns:
        The extracted post ID, or None if resolution failed
    """
    logger.debug(f"Resolving URL: {url}")
    parsed_url = urlparse(url)
    # Normalize host so the netloc comparisons below work with or without 'www.'
    parsed_netloc = un_wwwify(parsed_url.netloc)
    # Mobile link: /p/post_id
    if parsed_url.path.startswith("/p/"):
        logger.debug("URL is Medium mobile link")
        post_id = parsed_url.path.rsplit("/p/")[1]
        # Clean any trailing path segments
        post_id = post_id.split("/")[0]
        if basic_hex_check(post_id):
            return post_id
        # On a failed check we fall through to the implicit None return.
    # Facebook tracking link: l.facebook.com/l.php?u=...
    elif parsed_netloc == "l.facebook.com" and parsed_url.path.startswith("/l.php"):
        logger.debug("URL is Facebook tracking link")
        parsed_query = parse_qs(parsed_url.query)
        # Expect exactly one 'u' value carrying the real destination URL.
        if parsed_query.get("u") and len(parsed_query["u"]) == 1:
            post_url = parsed_query["u"][0]
            # Recurse: the unwrapped URL may itself be a redirect/short link.
            return await resolve_medium_url(post_url, timeout)
        logger.warning("Facebook link missing 'u' parameter")
        return None
    # Google Web Cache: webcache.googleusercontent.com/search?q=cache:...
    elif (
        parsed_netloc == "webcache.googleusercontent.com"
        and parsed_url.path.startswith("/search")
    ):
        logger.debug("URL is Google Web Cache link")
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("q") and len(parsed_query["q"]) == 1:
            # The cached target URL is prefixed with 'cache:'.
            post_url = parsed_query["q"][0].removeprefix("cache:")
            return await resolve_medium_url(post_url, timeout)
        logger.warning("Google cache link missing 'q' parameter")
        return None
    # Google tracking link: google.com/url?url=... or ?q=...
    elif parsed_netloc == "google.com" and parsed_url.path.startswith("/url"):
        logger.debug("URL is Google tracking link")
        parsed_query = parse_qs(parsed_url.query)
        # Google uses either 'url' or 'q' to carry the destination.
        if parsed_query.get("url") and len(parsed_query["url"]) == 1:
            post_url = parsed_query["url"][0]
            return await resolve_medium_url(post_url, timeout)
        elif parsed_query.get("q") and len(parsed_query["q"]) == 1:
            post_url = parsed_query["q"][0]
            return await resolve_medium_url(post_url, timeout)
        logger.warning("Google link missing 'url' or 'q' parameter")
        return None
    # 12ft.io proxy: 12ft.io?q=...
    elif parsed_netloc == "12ft.io":
        logger.debug("URL is 12ft.io proxy link")
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("q") and len(parsed_query["q"]) == 1:
            post_url = parsed_query["q"][0]
            return await resolve_medium_url(post_url, timeout)
        logger.warning("12ft.io link missing 'q' parameter")
        return None
    # Medium email redirect: /m/global-identity-2?redirectUrl=...
    elif parsed_url.path.startswith("/m/global-identity-2"):
        logger.debug("URL is Medium email redirect link")
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("redirectUrl") and len(parsed_query["redirectUrl"]) == 1:
            post_url = parsed_query["redirectUrl"][0]
            return await resolve_medium_url(post_url, timeout)
        logger.warning("Medium redirect missing 'redirectUrl' parameter")
        return None
    # Medium short link: link.medium.com/xyz
    elif parsed_netloc == "link.medium.com":
        logger.debug("URL is Medium short link")
        short_url_id = parsed_url.path.removeprefix("/")
        if short_url_id:
            try:
                # Network round-trip to expand the short link, then recurse.
                post_url = await resolve_medium_short_link(short_url_id, timeout)
                if post_url:
                    return await resolve_medium_url(post_url, timeout)
            except Exception as e:
                logger.warning(f"Failed to resolve short link: {e}")
        return None
    # Standard URL: extract post_id from end of path
    else:
        logger.debug("Extracting post ID from standard URL path")
        post_url = parsed_url.path.split("/")[-1]
        # Slug format: some-title-<post_id>
        post_id = post_url.split("-")[-1]
        if basic_hex_check(post_id):
            return post_id
        # Try multi-stage extraction
        extracted = extract_hex_string(parsed_url.path)
        if extracted:
            return extracted
        logger.warning(f"Could not extract valid post ID from URL: {url}")
        return None
async def is_valid_medium_url(url: str) -> bool:
    """
    Decide whether a URL points at a Medium article.

    Known redirect/proxy domains are accepted outright, known non-Medium
    domains rejected, known Medium domains accepted; anything else is
    validated by attempting a full resolution to a post ID.

    Args:
        url: The URL to validate

    Returns:
        True if URL is a valid Medium URL, False otherwise
    """
    domain = get_fld(url)
    if not domain:
        return False
    domain_netloc = un_wwwify(urlparse(url).netloc)

    # Proxy/redirect hosts are accepted; the resolver unwraps them later.
    if domain in REDIRECT_DOMAINS:
        return True
    # Explicit deny-list wins over everything below.
    if domain in NOT_MEDIUM_DOMAINS or domain_netloc in NOT_MEDIUM_DOMAINS:
        logger.debug(f"URL domain {domain} is in NOT_MEDIUM_DOMAINS")
        return False
    # Allow-lists: main domains match the FLD, custom domains match the netloc.
    if domain in KNOWN_MEDIUM_DOMAINS or domain_netloc in KNOWN_MEDIUM_CUSTOM_DOMAINS:
        return True

    # Unknown domain: fall back to resolving and checking for a post ID.
    logger.debug(f"URL domain {domain} not in known lists, attempting resolution")
    try:
        return bool(await resolve_medium_url(url))
    except Exception as e:
        logger.warning(f"Failed to validate unknown domain: {e}")
        return False
def extract_post_id_from_url(url: str) -> Optional[str]:
    """
    Synchronous post ID extraction from a URL.

    Handles only patterns that need no network access (mobile links and
    slug-postid paths). Use resolve_medium_url for full resolution.
    """
    path = urlparse(url).path

    # Mobile link: /p/<post_id>
    if path.startswith("/p/"):
        candidate = path.rsplit("/p/")[1].split("/")[0]
        if basic_hex_check(candidate):
            return candidate

    # Standard URL: inspect the final path segment.
    segments = path.strip("/").split("/")
    if segments:
        tail = segments[-1]
        # slug-postid format: the ID is the last '-'-separated token.
        candidate = tail.split("-")[-1]
        if basic_hex_check(candidate):
            return candidate
        # Regex-based fallback extraction.
        fallback = extract_hex_string(tail)
        if fallback:
            return fallback
    return None