Nikhil Pravin Pise committed on
Commit
60742a2
·
1 Parent(s): f41a2fa

fix: Upgrade Medium images to high resolution (1400px) across entire app

Browse files

- Added centralized upgrade_medium_image_url() and get_medium_image_url() to utils.py
- Set MEDIUM_IMAGE_DEFAULT_WIDTH = 1400 as single source of truth
- Updated parser.py to automatically upgrade search/tag result images
- Updated paragraph_parser.py get_image_url() to use 1400px default
- Updated html_renderer.py image sizes (700->1400px, 320->800px)
- Updated app.py to use centralized image upgrade from utils.py

Fixes low-resolution image thumbnails in search results and article previews

Files changed (5) hide show
  1. app.py +4 -1
  2. src/html_renderer.py +6 -3
  3. src/paragraph_parser.py +6 -3
  4. src/parser.py +8 -0
  5. src/utils.py +81 -0
app.py CHANGED
@@ -54,6 +54,7 @@ load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
54
  from src.service import ScraperService
55
  # Import renderer for explicit usage
56
  from src.html_renderer import render_full_page, BASE_TEMPLATE as RENDERER_TEMPLATE
 
57
  from src.config import MCPConfig
58
  from elevenlabs_voices import ELEVENLABS_VOICES, VOICE_CATEGORIES, get_voice_id
59
  # Import Gemini for Analyst (backup)
@@ -709,7 +710,9 @@ def render_cards(results, query: str = ""):
709
  url = art.get('url', '#')
710
  author = art.get('author', 'Unknown')
711
  if isinstance(author, dict): author = author.get('name', 'Unknown')
712
- img = art.get('imageUrl', '') or 'https://miro.medium.com/max/1400/1*jfdwtvU6V6g99q3G7gq7dQ.png'
 
 
713
 
714
  html += f"""
715
  <a href='{url}' target='_blank' class='aether-card'>
 
54
  from src.service import ScraperService
55
  # Import renderer for explicit usage
56
  from src.html_renderer import render_full_page, BASE_TEMPLATE as RENDERER_TEMPLATE
57
+ from src.utils import upgrade_medium_image_url
58
  from src.config import MCPConfig
59
  from elevenlabs_voices import ELEVENLABS_VOICES, VOICE_CATEGORIES, get_voice_id
60
  # Import Gemini for Analyst (backup)
 
710
  url = art.get('url', '#')
711
  author = art.get('author', 'Unknown')
712
  if isinstance(author, dict): author = author.get('name', 'Unknown')
713
+ # Ensure high-resolution image (upgrade any low-res URLs)
714
+ raw_img = art.get('imageUrl', '') or 'https://miro.medium.com/v2/resize:fit:1400/1*jfdwtvU6V6g99q3G7gq7dQ.png'
715
+ img = upgrade_medium_image_url(raw_img, target_width=1400)
716
 
717
  html += f"""
718
  <a href='{url}' target='_blank' class='aether-card'>
src/html_renderer.py CHANGED
@@ -9,6 +9,9 @@ import html
9
  from typing import Dict, List, Any, Optional
10
  import logging
11
 
 
 
 
12
  logger = logging.getLogger("HTMLRenderer")
13
 
14
  # Base HTML template for standalone page
@@ -295,7 +298,7 @@ def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
295
  <div class="mt-7">
296
  <img loading="eager" alt="{alt}" class="pt-5 m-auto"
297
  referrerpolicy="no-referrer"
298
- src="https://miro.medium.com/v2/resize:fit:700/{image_id}">
299
  </div>
300
  '''
301
  if caption:
@@ -363,7 +366,7 @@ def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
363
  </div>
364
  <div class="relative flex h-40 flew-row w-60">
365
  <div class="absolute inset-0 bg-center bg-cover"
366
- style="background-image: url('https://miro.medium.com/v2/resize:fit:320/{thumbnail}');">
367
  </div>
368
  </div>
369
  </div>
@@ -504,7 +507,7 @@ def render_article_html(article_data: Dict[str, Any]) -> str:
504
  preview_image_html = f'''
505
  <img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"
506
  loading="eager" referrerpolicy="no-referrer"
507
- src="https://miro.medium.com/v2/resize:fit:700/{preview_image_id}">
508
  '''
509
 
510
  # Subtitle
 
9
  from typing import Dict, List, Any, Optional
10
  import logging
11
 
12
+ # Import centralized image URL utilities
13
+ from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH
14
+
15
  logger = logging.getLogger("HTMLRenderer")
16
 
17
  # Base HTML template for standalone page
 
298
  <div class="mt-7">
299
  <img loading="eager" alt="{alt}" class="pt-5 m-auto"
300
  referrerpolicy="no-referrer"
301
+ src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">
302
  </div>
303
  '''
304
  if caption:
 
366
  </div>
367
  <div class="relative flex h-40 flew-row w-60">
368
  <div class="absolute inset-0 bg-center bg-cover"
369
+ style="background-image: url('https://miro.medium.com/v2/resize:fit:800/{thumbnail}');">
370
  </div>
371
  </div>
372
  </div>
 
507
  preview_image_html = f'''
508
  <img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"
509
  loading="eager" referrerpolicy="no-referrer"
510
+ src="https://miro.medium.com/v2/resize:fit:1400/{preview_image_id}">
511
  '''
512
 
513
  # Subtitle
src/paragraph_parser.py CHANGED
@@ -12,6 +12,9 @@ import logging
12
  import re
13
  from typing import Dict, List, Optional, Tuple
14
 
 
 
 
15
  try:
16
  import tld
17
  HAS_TLD = True
@@ -123,9 +126,9 @@ class MarkupProcessor:
123
  return text
124
 
125
 
126
- def get_image_url(image_id: str, width: int = 700) -> str:
127
- """Build Medium image URL from image ID."""
128
- return f"https://miro.medium.com/v2/resize:fit:{width}/{image_id}"
129
 
130
 
131
  def parse_paragraphs_to_markdown(
 
12
  import re
13
  from typing import Dict, List, Optional, Tuple
14
 
15
+ # Import centralized image URL utilities
16
+ from src.utils import get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
17
+
18
  try:
19
  import tld
20
  HAS_TLD = True
 
126
  return text
127
 
128
 
129
def get_image_url(image_id: str, width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """Return the Medium image URL for ``image_id`` at the requested width.

    Thin wrapper that forwards both arguments to ``get_medium_image_url``;
    ``width`` defaults to ``MEDIUM_IMAGE_DEFAULT_WIDTH``.
    """
    image_url = get_medium_image_url(image_id, width)
    return image_url
132
 
133
 
134
  def parse_paragraphs_to_markdown(
src/parser.py CHANGED
@@ -1,8 +1,13 @@
 
1
  from bs4 import BeautifulSoup
2
  from typing import Dict, List, Optional, Any
3
  from markdownify import markdownify as md
4
  from urllib.parse import urljoin
5
 
 
 
 
 
6
  def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
7
  """
8
  Extracts article metadata from search result cards.
@@ -133,6 +138,9 @@ def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
133
  if img_tag and img_tag.get("src"):
134
  image_url = img_tag["src"]
135
 
 
 
 
136
  return {
137
  "url": url,
138
  "title": title,
 
1
+ import re
2
  from bs4 import BeautifulSoup
3
  from typing import Dict, List, Optional, Any
4
  from markdownify import markdownify as md
5
  from urllib.parse import urljoin
6
 
7
+ # Import centralized image URL utilities from utils
8
+ from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
9
+
10
+
11
  def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
12
  """
13
  Extracts article metadata from search result cards.
 
138
  if img_tag and img_tag.get("src"):
139
  image_url = img_tag["src"]
140
 
141
+ # Upgrade image URL to high resolution
142
+ image_url = upgrade_medium_image_url(image_url, target_width=1400)
143
+
144
  return {
145
  "url": url,
146
  "title": title,
src/utils.py CHANGED
@@ -194,6 +194,87 @@ def make_absolute_url(url: str, base_url: str) -> str:
194
  return urljoin(base_url, url)
195
 
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # =============================================================================
198
  # HASH UTILITIES
199
  # =============================================================================
 
194
  return urljoin(base_url, url)
195
 
196
 
197
# Default high-resolution width (in pixels) for Medium images; used as the
# default target width of upgrade_medium_image_url.
MEDIUM_IMAGE_DEFAULT_WIDTH = 1400


def upgrade_medium_image_url(url: str, target_width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """
    Upgrade a Medium image URL to a higher resolution.

    Medium uses CDN URLs like:
    - https://miro.medium.com/v2/resize:fit:320/{image_id}
    - https://miro.medium.com/v2/resize:fill:88:88/{image_id}
    - https://miro.medium.com/max/320/{image_id} (older format)
    - https://miro.medium.com/freeze/fit/320/240/{image_id}

    The resize parameters are rewritten to ``v2/resize:fit:WIDTH``. WIDTH is
    the larger of ``target_width`` and the width already present in the URL,
    so an image that is already served at a higher resolution is never
    downscaled.

    Args:
        url: The original image URL (may be empty or None).
        target_width: Minimum width in pixels to request (default 1400).

    Returns:
        The rewritten URL. ``url`` is returned unchanged when it is falsy,
        does not contain ``miro.medium.com``, or matches no known format.
    """
    # Local import keeps this helper self-contained regardless of which
    # modules the surrounding file imports at the top.
    import re

    if not url or "miro.medium.com" not in url:
        return url

    def _resized(match):
        # Only ever raise the resolution: keep the existing width when it
        # already exceeds the requested one.
        width = max(int(match.group("width")), target_width)
        return f"miro.medium.com/v2/resize:fit:{width}"

    # Known size-carrying formats, tried in order; the first one that
    # occurs in the URL wins.
    sized_patterns = (
        # v2/resize:fit:WIDTH or v2/resize:fill:WIDTH:HEIGHT
        r"miro\.medium\.com/v2/resize:(?:fit|fill):(?P<width>\d+)(?::\d+)?",
        # older format: max/WIDTH
        r"miro\.medium\.com/max/(?P<width>\d+)",
        # freeze format with dimensions: freeze/fit/WIDTH/HEIGHT
        r"miro\.medium\.com/freeze/(?:fit|fill)/(?P<width>\d+)/\d+",
    )
    for pattern in sized_patterns:
        # subn reports whether anything matched, so one scan both detects
        # and rewrites the resize segment.
        upgraded, replaced = re.subn(pattern, _resized, url)
        if replaced:
            return upgraded

    # Unknown layout: fall back to locating the image ID (e.g. "1*abc.png")
    # after some path segment and rebuilding the URL from scratch.
    id_match = re.search(
        r"miro\.medium\.com/.*?/([01]\*[a-zA-Z0-9_-]+\.[a-zA-Z]+)", url
    )
    if id_match:
        image_id = id_match.group(1)
        return f"https://miro.medium.com/v2/resize:fit:{target_width}/{image_id}"

    # Nothing recognisable to upgrade.
    return url
260
+
261
+
262
def get_medium_image_url(image_id: str, width: int = MEDIUM_IMAGE_DEFAULT_WIDTH) -> str:
    """
    Compose the Medium CDN URL for an image ID at a given width.

    Args:
        image_id: The Medium image ID (e.g., "1*abc123.png")
        width: Width in pixels placed in the ``resize:fit`` segment;
            defaults to ``MEDIUM_IMAGE_DEFAULT_WIDTH``

    Returns:
        The full CDN URL, or an empty string when ``image_id`` is falsy
    """
    if image_id:
        return f"https://miro.medium.com/v2/resize:fit:{width}/{image_id}"
    # No ID means there is nothing to link to.
    return ""
276
+
277
+
278
  # =============================================================================
279
  # HASH UTILITIES
280
  # =============================================================================