# Medium-MCP / src/constants.py
# Nikhil Pravin Pise
# feat: implement comprehensive improvement plan (Phases 1-5)
# e98cc10
"""
Constants for Medium-MCP
Centralized configuration values, magic numbers, and string literals.
Eliminates scattered hardcoded values throughout the codebase.
"""
from __future__ import annotations
from typing import Final
# =============================================================================
# MEDIUM DOMAINS
# =============================================================================
# Hostnames of Medium itself (bare and "www." forms).
MEDIUM_DOMAINS: Final[frozenset[str]] = frozenset(
    ("medium.com", "www.medium.com")
)

# Hostnames of known Medium publications.
PUBLICATION_DOMAINS: Final[frozenset[str]] = frozenset((
    # Hosts outside medium.com
    "towardsdatascience.com",
    "betterprogramming.pub",
    "levelup.gitconnected.com",
    "javascript.plainenglish.io",
    "python.plainenglish.io",
    "aws.plainenglish.io",
    "blog.devgenius.io",
    "blog.stackademic.com",
    "medium.datadriveninvestor.com",
    "entrepreneurshandbook.co",
    "betterhumans.pub",
    "writingcooperative.com",
    "psiloveyou.xyz",
    "uxdesign.cc",
    "eand.co",
    "codeburst.io",
    "itnext.io",
    "blog.prototypr.io",
    "uxplanet.org",
    "hackernoon.com",
    "thebolditalic.com",
    # Subdomains of medium.com
    "forge.medium.com",
    "marker.medium.com",
    "onezero.medium.com",
    "gen.medium.com",
    "elemental.medium.com",
    "debugger.medium.com",
    "zora.medium.com",
    "heated.medium.com",
    "humanparts.medium.com",
))

# Union of MEDIUM_DOMAINS and PUBLICATION_DOMAINS.
ALL_MEDIUM_DOMAINS: Final[frozenset[str]] = MEDIUM_DOMAINS.union(
    PUBLICATION_DOMAINS
)
# =============================================================================
# URL PATTERNS
# =============================================================================
# Post ID shape: the lowercase hexadecimal alphabet and the length bounds.
POST_ID_HEX_CHARS: Final[str] = "0123456789" "abcdef"  # digits, then a-f
POST_ID_MAX_LENGTH: Final[int] = 16
POST_ID_MIN_LENGTH: Final[int] = 10
# Hostnames of tracking/redirect links that should be unwrapped.
TRACKING_DOMAINS: Final[frozenset[str]] = frozenset([
    # facebook.com hosts
    "l.facebook.com",
    "lm.facebook.com",
    # Remaining hosts
    "t.co",
    "bit.ly",
    "tinyurl.com",
    "goo.gl",
    "ow.ly",
    "buff.ly",
    "lnkd.in",
    "is.gd",
    "rb.gy",
])
# Hostnames of cache and archive services.
CACHE_DOMAINS: Final[frozenset[str]] = frozenset((
    "webcache.googleusercontent.com",
    "web.archive.org",
    # archive.* hosts
    "archive.is",
    "archive.today",
    "archive.ph",
    "archive.vn",
    "cached.googleusercontent.com",
))
# =============================================================================
# HTTP CONFIGURATION
# =============================================================================
# --- Connection pool ---------------------------------------------------------
DEFAULT_KEEPALIVE_CONNECTIONS: Final[int] = 20
DEFAULT_MAX_CONNECTIONS: Final[int] = 100
DEFAULT_KEEPALIVE_EXPIRY: Final[float] = 5.0  # presumably seconds - TODO confirm

# --- Timeouts ----------------------------------------------------------------
DEFAULT_TOTAL_TIMEOUT: Final[float] = 60.0  # seconds
DEFAULT_READ_TIMEOUT: Final[float] = 30.0  # seconds
DEFAULT_CONNECT_TIMEOUT: Final[float] = 5.0  # seconds

# --- Retries -----------------------------------------------------------------
DEFAULT_RETRY_BACKOFF_FACTOR: Final[float] = 0.5
DEFAULT_MAX_RETRIES: Final[int] = 3

# --- Rate limiting -----------------------------------------------------------
DEFAULT_RATE_LIMIT_PERIOD: Final[int] = 60  # seconds
DEFAULT_RATE_LIMIT_REQUESTS: Final[int] = 10
# =============================================================================
# SCRAPER CONFIGURATION
# =============================================================================
# --- Worker pool -------------------------------------------------------------
DEFAULT_BATCH_SIZE: Final[int] = 20
DEFAULT_MAX_WORKERS: Final[int] = 5

# --- Content limits ----------------------------------------------------------
MAX_RECURSIVE_DEPTH: Final[int] = 3
MAX_IMAGES_PER_ARTICLE: Final[int] = 50
MAX_ARTICLE_SIZE_BYTES: Final[int] = 10 * 1024**2  # 10 MB (10,485,760 bytes)

# --- Caching -----------------------------------------------------------------
STALE_CACHE_THRESHOLD_DAYS: Final[int] = 7
DEFAULT_CACHE_TTL_HOURS: Final[int] = 24
# =============================================================================
# RESILIENCE
# =============================================================================
# Circuit-breaker defaults
DEFAULT_HALF_OPEN_REQUESTS: Final[int] = 1
DEFAULT_RECOVERY_TIMEOUT: Final[int] = 5 * 60  # 5 minutes, i.e. 300
DEFAULT_FAILURE_THRESHOLD: Final[int] = 5
# =============================================================================
# LLM CONFIGURATION
# =============================================================================
# Each provider gets a default model name, a token limit and an approximate
# character limit.

# Groq
GROQ_DEFAULT_MODEL: Final[str] = "llama-3.3-70b-versatile"
GROQ_MAX_TOKENS: Final[int] = 8_000
GROQ_MAX_CHARS: Final[int] = 32_000

# OpenAI
OPENAI_DEFAULT_MODEL: Final[str] = "gpt-4o-mini"
OPENAI_MAX_TOKENS: Final[int] = 128_000
OPENAI_MAX_CHARS: Final[int] = 500_000

# Gemini
GEMINI_DEFAULT_MODEL: Final[str] = "gemini-2.0-flash-exp"
GEMINI_MAX_TOKENS: Final[int] = 1_000_000
GEMINI_MAX_CHARS: Final[int] = 4_000_000
# =============================================================================
# AUDIO (ELEVENLABS)
# =============================================================================
# Selectable voice names, and the one used by default.
ELEVENLABS_VOICES: Final[tuple[str, ...]] = (
    "Rachel", "Drew", "Clyde",
    "Paul", "Domi", "Dave",
    "Fin", "Sarah", "Antoni",
    "Thomas", "Charlie", "George",
    "Emily", "Elli", "Callum",
)
ELEVENLABS_DEFAULT_VOICE: Final[str] = "Rachel"

# Default values for the audio stability, similarity and style settings.
DEFAULT_AUDIO_STYLE: Final[float] = 0.0
DEFAULT_AUDIO_SIMILARITY: Final[float] = 0.75
DEFAULT_AUDIO_STABILITY: Final[float] = 0.5
# =============================================================================
# GRADIO UI
# =============================================================================
# --- Server ------------------------------------------------------------------
# NOTE(review): "0.0.0.0" is the IPv4 wildcard address; if this is used as the
# listen address the UI is reachable on every interface - confirm intended.
GRADIO_DEFAULT_HOST: Final[str] = "0.0.0.0"
GRADIO_DEFAULT_PORT: Final[int] = 7860

# --- Input limits ------------------------------------------------------------
MAX_BATCH_URLS: Final[int] = 20
MAX_QUERY_INPUT_LENGTH: Final[int] = 200
MAX_URL_INPUT_LENGTH: Final[int] = 500
# =============================================================================
# MCP SERVER
# =============================================================================
# Name and version strings for the MCP server.
MCP_SERVER_VERSION: Final[str] = "3.1.0"
MCP_SERVER_NAME: Final[str] = "Medium Scraper v3"
# =============================================================================
# USER AGENTS
# =============================================================================
# Default User-Agent string, assembled from its three space-separated parts.
DEFAULT_USER_AGENT: Final[str] = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/120.0.0.0 Safari/537.36",
))

# User-Agent strings available for rotation; the default is always first.
USER_AGENT_POOL: Final[tuple[str, ...]] = (
    DEFAULT_USER_AGENT,
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 Safari/605.1.15"
    ),
)