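# NOTE(review): the text "Spaces: Sleeping" preceded this module when it was
# captured; it looks like hosting-page status text rather than module content.
# Confirm and remove.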
"""
Constants for Medium-MCP

Centralized configuration values, magic numbers, and string literals.
Eliminates scattered hardcoded values throughout the codebase.
"""
from __future__ import annotations

from typing import Final
# =============================================================================
# MEDIUM DOMAINS
# =============================================================================

# Official Medium domains (bare host and its "www." form).
MEDIUM_DOMAINS: Final[frozenset[str]] = frozenset({
    "medium.com",
    "www.medium.com",
})

# Known Medium publication domains: custom publication hosts followed by
# "<name>.medium.com" publication subdomains. Entries are exact, lowercase
# host names, so membership tests are exact string matches.
PUBLICATION_DOMAINS: Final[frozenset[str]] = frozenset({
    # Custom-domain publications
    "towardsdatascience.com",
    "betterprogramming.pub",
    "levelup.gitconnected.com",
    "javascript.plainenglish.io",
    "python.plainenglish.io",
    "aws.plainenglish.io",
    "blog.devgenius.io",
    "blog.stackademic.com",
    "medium.datadriveninvestor.com",
    "entrepreneurshandbook.co",
    "betterhumans.pub",
    "writingcooperative.com",
    "psiloveyou.xyz",
    "uxdesign.cc",
    "eand.co",
    "codeburst.io",
    "itnext.io",
    "blog.prototypr.io",
    "uxplanet.org",
    "hackernoon.com",
    "thebolditalic.com",
    # medium.com subdomain publications
    "forge.medium.com",
    "marker.medium.com",
    "onezero.medium.com",
    "gen.medium.com",
    "elemental.medium.com",
    "debugger.medium.com",
    "zora.medium.com",
    "heated.medium.com",
    "humanparts.medium.com",
})

# All valid Medium-related domains: the union of the official and publication
# sets defined above.
ALL_MEDIUM_DOMAINS: Final[frozenset[str]] = MEDIUM_DOMAINS | PUBLICATION_DOMAINS
# =============================================================================
# URL PATTERNS
# =============================================================================

# Post ID characteristics: inclusive length bounds and the allowed alphabet
# (lowercase hexadecimal digits only).
POST_ID_MIN_LENGTH: Final[int] = 10
POST_ID_MAX_LENGTH: Final[int] = 16
POST_ID_HEX_CHARS: Final[str] = "0123456789abcdef"

# URL tracking domains to unwrap (link shorteners and social redirectors).
TRACKING_DOMAINS: Final[frozenset[str]] = frozenset({
    "l.facebook.com",
    "lm.facebook.com",
    "t.co",
    "bit.ly",
    "tinyurl.com",
    "goo.gl",
    "ow.ly",
    "buff.ly",
    "lnkd.in",
    "is.gd",
    "rb.gy",
})

# Cache/archive domains.
CACHE_DOMAINS: Final[frozenset[str]] = frozenset({
    "webcache.googleusercontent.com",
    "web.archive.org",
    "archive.is",
    "archive.today",
    "archive.ph",
    "archive.vn",
    "cached.googleusercontent.com",
})
# =============================================================================
# HTTP CONFIGURATION
# =============================================================================

# Connection pooling
DEFAULT_MAX_CONNECTIONS: Final[int] = 100
DEFAULT_KEEPALIVE_CONNECTIONS: Final[int] = 20
DEFAULT_KEEPALIVE_EXPIRY: Final[float] = 5.0  # presumably seconds -- TODO confirm

# Timeouts (in seconds)
DEFAULT_CONNECT_TIMEOUT: Final[float] = 5.0
DEFAULT_READ_TIMEOUT: Final[float] = 30.0
DEFAULT_TOTAL_TIMEOUT: Final[float] = 60.0

# Retry configuration
DEFAULT_MAX_RETRIES: Final[int] = 3
DEFAULT_RETRY_BACKOFF_FACTOR: Final[float] = 0.5

# Rate limiting: request budget and the period it applies to.
DEFAULT_RATE_LIMIT_REQUESTS: Final[int] = 10
DEFAULT_RATE_LIMIT_PERIOD: Final[int] = 60  # seconds
# =============================================================================
# SCRAPER CONFIGURATION
# =============================================================================

# Worker pool
DEFAULT_MAX_WORKERS: Final[int] = 5
DEFAULT_BATCH_SIZE: Final[int] = 20

# Content limits
MAX_ARTICLE_SIZE_BYTES: Final[int] = 10 * 1024 * 1024  # 10 MB
MAX_IMAGES_PER_ARTICLE: Final[int] = 50
MAX_RECURSIVE_DEPTH: Final[int] = 3

# Caching (units are carried in the constant names: hours and days).
DEFAULT_CACHE_TTL_HOURS: Final[int] = 24
STALE_CACHE_THRESHOLD_DAYS: Final[int] = 7
# =============================================================================
# RESILIENCE
# =============================================================================

# Circuit breaker
DEFAULT_FAILURE_THRESHOLD: Final[int] = 5
DEFAULT_RECOVERY_TIMEOUT: Final[int] = 300  # 5 minutes, expressed in seconds
DEFAULT_HALF_OPEN_REQUESTS: Final[int] = 1
# =============================================================================
# LLM CONFIGURATION
# =============================================================================

# Model names
GROQ_DEFAULT_MODEL: Final[str] = "llama-3.3-70b-versatile"
OPENAI_DEFAULT_MODEL: Final[str] = "gpt-4o-mini"
GEMINI_DEFAULT_MODEL: Final[str] = "gemini-2.0-flash-exp"

# Token limits
GROQ_MAX_TOKENS: Final[int] = 8_000
OPENAI_MAX_TOKENS: Final[int] = 128_000
GEMINI_MAX_TOKENS: Final[int] = 1_000_000

# Character limits (approximate): each is roughly four characters per token
# of the matching *_MAX_TOKENS value above.
GROQ_MAX_CHARS: Final[int] = 32_000
OPENAI_MAX_CHARS: Final[int] = 500_000
GEMINI_MAX_CHARS: Final[int] = 4_000_000
# =============================================================================
# AUDIO (ELEVENLABS)
# =============================================================================

# Default voices; the default is the first entry of ELEVENLABS_VOICES.
ELEVENLABS_DEFAULT_VOICE: Final[str] = "Rachel"
ELEVENLABS_VOICES: Final[tuple[str, ...]] = (
    "Rachel",
    "Drew",
    "Clyde",
    "Paul",
    "Domi",
    "Dave",
    "Fin",
    "Sarah",
    "Antoni",
    "Thomas",
    "Charlie",
    "George",
    "Emily",
    "Elli",
    "Callum",
)

# Audio settings
DEFAULT_AUDIO_STABILITY: Final[float] = 0.5
DEFAULT_AUDIO_SIMILARITY: Final[float] = 0.75
DEFAULT_AUDIO_STYLE: Final[float] = 0.0
# =============================================================================
# GRADIO UI
# =============================================================================

# Server configuration
GRADIO_DEFAULT_PORT: Final[int] = 7860
# "0.0.0.0" is the IPv4 wildcard address; presumably used as the bind host so
# the UI is reachable from outside the container/host -- verify against caller.
GRADIO_DEFAULT_HOST: Final[str] = "0.0.0.0"

# UI limits
MAX_URL_INPUT_LENGTH: Final[int] = 500
MAX_QUERY_INPUT_LENGTH: Final[int] = 200
MAX_BATCH_URLS: Final[int] = 20
# =============================================================================
# MCP SERVER
# =============================================================================

# Name and version strings for the MCP server.
MCP_SERVER_NAME: Final[str] = "Medium Scraper v3"
MCP_SERVER_VERSION: Final[str] = "3.1.0"
# =============================================================================
# USER AGENTS
# =============================================================================

# Default User-Agent string; adjacent literals are concatenated into a single
# header value.
DEFAULT_USER_AGENT: Final[str] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# User agent pool for rotation. The default string is always the first entry.
# NOTE(review): the two macOS entries omit the "(KHTML, like Gecko)" token that
# DEFAULT_USER_AGENT carries after its AppleWebKit token. Confirm whether the
# shortened form is intentional before relying on these to resemble real
# browser headers. The values are reproduced unchanged here.
USER_AGENT_POOL: Final[tuple[str, ...]] = (
    DEFAULT_USER_AGENT,
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 Safari/605.1.15"
    ),
)