Spaces:
Sleeping
Sleeping
Nikhil Pravin Pise committed on
Commit ·
60742a2
1
Parent(s): f41a2fa
fix: Upgrade Medium images to high resolution (1400px) across entire app
Browse files
- Added centralized upgrade_medium_image_url() and get_medium_image_url() to utils.py
- Set MEDIUM_IMAGE_DEFAULT_WIDTH = 1400 as single source of truth
- Updated parser.py to automatically upgrade search/tag result images
- Updated paragraph_parser.py get_image_url() to use 1400px default
- Updated html_renderer.py image sizes (700->1400px, 320->800px)
- Updated app.py to use centralized image upgrade from utils.py
Fixes low-resolution image thumbnails in search results and article previews
- app.py +4 -1
- src/html_renderer.py +6 -3
- src/paragraph_parser.py +6 -3
- src/parser.py +8 -0
- src/utils.py +81 -0
app.py
CHANGED
|
@@ -54,6 +54,7 @@ load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
|
|
| 54 |
from src.service import ScraperService
|
| 55 |
# Import renderer for explicit usage
|
| 56 |
from src.html_renderer import render_full_page, BASE_TEMPLATE as RENDERER_TEMPLATE
|
|
|
|
| 57 |
from src.config import MCPConfig
|
| 58 |
from elevenlabs_voices import ELEVENLABS_VOICES, VOICE_CATEGORIES, get_voice_id
|
| 59 |
# Import Gemini for Analyst (backup)
|
|
@@ -709,7 +710,9 @@ def render_cards(results, query: str = ""):
|
|
| 709 |
url = art.get('url', '#')
|
| 710 |
author = art.get('author', 'Unknown')
|
| 711 |
if isinstance(author, dict): author = author.get('name', 'Unknown')
|
| 712 |
-
|
|
|
|
|
|
|
| 713 |
|
| 714 |
html += f"""
|
| 715 |
<a href='{url}' target='_blank' class='aether-card'>
|
|
|
|
| 54 |
from src.service import ScraperService
|
| 55 |
# Import renderer for explicit usage
|
| 56 |
from src.html_renderer import render_full_page, BASE_TEMPLATE as RENDERER_TEMPLATE
|
| 57 |
+
from src.utils import upgrade_medium_image_url
|
| 58 |
from src.config import MCPConfig
|
| 59 |
from elevenlabs_voices import ELEVENLABS_VOICES, VOICE_CATEGORIES, get_voice_id
|
| 60 |
# Import Gemini for Analyst (backup)
|
|
|
|
| 710 |
url = art.get('url', '#')
|
| 711 |
author = art.get('author', 'Unknown')
|
| 712 |
if isinstance(author, dict): author = author.get('name', 'Unknown')
|
| 713 |
+
# Ensure high-resolution image (upgrade any low-res URLs)
|
| 714 |
+
raw_img = art.get('imageUrl', '') or 'https://miro.medium.com/v2/resize:fit:1400/1*jfdwtvU6V6g99q3G7gq7dQ.png'
|
| 715 |
+
img = upgrade_medium_image_url(raw_img, target_width=1400)
|
| 716 |
|
| 717 |
html += f"""
|
| 718 |
<a href='{url}' target='_blank' class='aether-card'>
|
src/html_renderer.py
CHANGED
|
@@ -9,6 +9,9 @@ import html
|
|
| 9 |
from typing import Dict, List, Any, Optional
|
| 10 |
import logging
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
logger = logging.getLogger("HTMLRenderer")
|
| 13 |
|
| 14 |
# Base HTML template for standalone page
|
|
@@ -295,7 +298,7 @@ def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
|
|
| 295 |
<div class="mt-7">
|
| 296 |
<img loading="eager" alt="{alt}" class="pt-5 m-auto"
|
| 297 |
referrerpolicy="no-referrer"
|
| 298 |
-
src="https://miro.medium.com/v2/resize:fit:
|
| 299 |
</div>
|
| 300 |
'''
|
| 301 |
if caption:
|
|
@@ -363,7 +366,7 @@ def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
|
|
| 363 |
</div>
|
| 364 |
<div class="relative flex h-40 flew-row w-60">
|
| 365 |
<div class="absolute inset-0 bg-center bg-cover"
|
| 366 |
-
style="background-image: url('https://miro.medium.com/v2/resize:fit:
|
| 367 |
</div>
|
| 368 |
</div>
|
| 369 |
</div>
|
|
@@ -504,7 +507,7 @@ def render_article_html(article_data: Dict[str, Any]) -> str:
|
|
| 504 |
preview_image_html = f'''
|
| 505 |
<img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"
|
| 506 |
loading="eager" referrerpolicy="no-referrer"
|
| 507 |
-
src="https://miro.medium.com/v2/resize:fit:
|
| 508 |
'''
|
| 509 |
|
| 510 |
# Subtitle
|
|
|
|
| 9 |
from typing import Dict, List, Any, Optional
|
| 10 |
import logging
|
| 11 |
|
| 12 |
+
# Import centralized image URL utilities
|
| 13 |
+
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH
|
| 14 |
+
|
| 15 |
logger = logging.getLogger("HTMLRenderer")
|
| 16 |
|
| 17 |
# Base HTML template for standalone page
|
|
|
|
| 298 |
<div class="mt-7">
|
| 299 |
<img loading="eager" alt="{alt}" class="pt-5 m-auto"
|
| 300 |
referrerpolicy="no-referrer"
|
| 301 |
+
src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">
|
| 302 |
</div>
|
| 303 |
'''
|
| 304 |
if caption:
|
|
|
|
| 366 |
</div>
|
| 367 |
<div class="relative flex h-40 flew-row w-60">
|
| 368 |
<div class="absolute inset-0 bg-center bg-cover"
|
| 369 |
+
style="background-image: url('https://miro.medium.com/v2/resize:fit:800/{thumbnail}');">
|
| 370 |
</div>
|
| 371 |
</div>
|
| 372 |
</div>
|
|
|
|
| 507 |
preview_image_html = f'''
|
| 508 |
<img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"
|
| 509 |
loading="eager" referrerpolicy="no-referrer"
|
| 510 |
+
src="https://miro.medium.com/v2/resize:fit:1400/{preview_image_id}">
|
| 511 |
'''
|
| 512 |
|
| 513 |
# Subtitle
|
src/paragraph_parser.py
CHANGED
|
@@ -12,6 +12,9 @@ import logging
|
|
| 12 |
import re
|
| 13 |
from typing import Dict, List, Optional, Tuple
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
try:
|
| 16 |
import tld
|
| 17 |
HAS_TLD = True
|
|
@@ -123,9 +126,9 @@ class MarkupProcessor:
|
|
| 123 |
return text
|
| 124 |
|
| 125 |
|
| 126 |
-
def get_image_url(image_id: str, width: int =
|
| 127 |
-
"""Build Medium image URL from image ID."""
|
| 128 |
-
return
|
| 129 |
|
| 130 |
|
| 131 |
def parse_paragraphs_to_markdown(
|
|
|
|
| 12 |
import re
|
| 13 |
from typing import Dict, List, Optional, Tuple
|
| 14 |
|
| 15 |
+
# Import centralized image URL utilities
|
| 16 |
+
from src.utils import get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
|
| 17 |
+
|
| 18 |
try:
|
| 19 |
import tld
|
| 20 |
HAS_TLD = True
|
|
|
|
| 126 |
return text
|
| 127 |
|
| 128 |
|
| 129 |
+
def get_image_url(image_id: str, width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """Return the Medium CDN URL for ``image_id`` at the requested width.

    Thin wrapper that delegates to ``get_medium_image_url``; ``width``
    defaults to ``MEDIUM_IMAGE_DEFAULT_WIDTH``.
    """
    image_url = get_medium_image_url(image_id, width)
    return image_url
|
| 132 |
|
| 133 |
|
| 134 |
def parse_paragraphs_to_markdown(
|
src/parser.py
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
|
|
| 1 |
from bs4 import BeautifulSoup
|
| 2 |
from typing import Dict, List, Optional, Any
|
| 3 |
from markdownify import markdownify as md
|
| 4 |
from urllib.parse import urljoin
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
|
| 7 |
"""
|
| 8 |
Extracts article metadata from search result cards.
|
|
@@ -133,6 +138,9 @@ def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
|
|
| 133 |
if img_tag and img_tag.get("src"):
|
| 134 |
image_url = img_tag["src"]
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
return {
|
| 137 |
"url": url,
|
| 138 |
"title": title,
|
|
|
|
| 1 |
+
import re
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
from typing import Dict, List, Optional, Any
|
| 4 |
from markdownify import markdownify as md
|
| 5 |
from urllib.parse import urljoin
|
| 6 |
|
| 7 |
+
# Import centralized image URL utilities from utils
|
| 8 |
+
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
|
| 9 |
+
|
| 10 |
+
|
| 11 |
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
|
| 12 |
"""
|
| 13 |
Extracts article metadata from search result cards.
|
|
|
|
| 138 |
if img_tag and img_tag.get("src"):
|
| 139 |
image_url = img_tag["src"]
|
| 140 |
|
| 141 |
+
# Upgrade image URL to high resolution
|
| 142 |
+
image_url = upgrade_medium_image_url(image_url, target_width=1400)
|
| 143 |
+
|
| 144 |
return {
|
| 145 |
"url": url,
|
| 146 |
"title": title,
|
src/utils.py
CHANGED
|
@@ -194,6 +194,87 @@ def make_absolute_url(url: str, base_url: str) -> str:
|
|
| 194 |
return urljoin(base_url, url)
|
| 195 |
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
# =============================================================================
|
| 198 |
# HASH UTILITIES
|
| 199 |
# =============================================================================
|
|
|
|
| 194 |
return urljoin(base_url, url)
|
| 195 |
|
| 196 |
|
| 197 |
+
# Default high resolution width for Medium images
MEDIUM_IMAGE_DEFAULT_WIDTH = 1400


def upgrade_medium_image_url(url: str, target_width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """
    Upgrade a Medium image URL to a higher resolution.

    Medium uses CDN URLs like:
    - https://miro.medium.com/v2/resize:fit:320/{image_id}
    - https://miro.medium.com/v2/resize:fill:88:88/{image_id}
    - https://miro.medium.com/max/320/{image_id}  (older format)
    - https://miro.medium.com/freeze/fit/320/240/{image_id}

    The sizing segment is replaced with ``v2/resize:fit:{target_width}``.
    A URL that already requests a width of at least ``target_width`` is
    returned unchanged, so this function never lowers the resolution.

    Args:
        url: The original image URL (may be empty or None).
        target_width: Target width in pixels (default 1400 for high-res).

    Returns:
        The URL rewritten to request ``target_width``, or the original value
        when it is empty, is not a Medium CDN URL, already requests at least
        ``target_width``, or uses a format that is not recognised.
    """
    if not url or "miro.medium.com" not in url:
        return url

    replacement = f"miro.medium.com/v2/resize:fit:{target_width}"

    # Known sizing formats, tried in order. Each one captures the width that
    # the URL currently requests in the named group "width".
    sized_patterns = (
        # v2/resize:fit:WIDTH or v2/resize:fill:WIDTH:HEIGHT
        r"miro\.medium\.com/v2/resize:(?:fit|fill):(?P<width>\d+)(?::\d+)?",
        # older format: max/WIDTH
        r"miro\.medium\.com/max/(?P<width>\d+)",
        # freeze format with dimensions: freeze/fit/WIDTH/HEIGHT
        r"miro\.medium\.com/freeze/(?:fit|fill)/(?P<width>\d+)/\d+",
    )
    for pattern in sized_patterns:
        match = re.search(pattern, url)
        if match is None:
            continue
        # Never downgrade: keep URLs that are already at or above the target.
        if int(match.group("width")) >= target_width:
            return url
        return re.sub(pattern, replacement, url)

    # Unknown sizing format: try to pull out the image ID (e.g. "1*abc.png")
    # and rebuild a canonical URL around it.
    id_match = re.search(
        r"miro\.medium\.com/.*?/([01]\*[a-zA-Z0-9_-]+\.[a-zA-Z]+)", url
    )
    if id_match:
        return f"https://{replacement}/{id_match.group(1)}"

    # Return original if we can't upgrade
    return url
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def get_medium_image_url(image_id: str, width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """
    Build a Medium CDN image URL from an image ID.

    Args:
        image_id: The Medium image ID (e.g., "1*abc123.png")
        width: Requested width in pixels (defaults to MEDIUM_IMAGE_DEFAULT_WIDTH)

    Returns:
        The ``v2/resize:fit`` CDN URL for the image, or an empty string when
        ``image_id`` is empty or None.
    """
    # A falsy ID cannot be turned into a meaningful URL.
    if image_id:
        return "https://miro.medium.com/v2/resize:fit:{}/{}".format(width, image_id)
    return ""
|
| 276 |
+
|
| 277 |
+
|
| 278 |
# =============================================================================
|
| 279 |
# HASH UTILITIES
|
| 280 |
# =============================================================================
|