Medium-MCP / src /graph.py
Nikhil Pravin Pise
feat: comprehensive migration - merge Scraper + MCP Server
ae588db
import re
from typing import List, Dict, Any
def extract_code_vault(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract fenced code blocks from markdown content.

    A block opens with three backticks, optionally followed on the same line
    by an info string (for example ``python``, ``c++`` or
    ``python title="demo"``), and ends at the next three backticks.

    Args:
        markdown_content: Markdown text to scan.

    Returns:
        A list of dictionaries with 'id', 'language' and 'code'.
        'language' is the first word of the info string, or "text" when the
        fence carries none. 'id' is "snippet_<n>" where <n> is the 1-based
        position of the fence among all matched fences; blocks whose code is
        empty after stripping are skipped, so ids may have gaps.
    """
    # Capture the whole info string (anything up to the end of the opening
    # line) rather than just \w+, so tags such as "c++" or "objective-c",
    # trailing attributes and CRLF line endings no longer make the opening
    # fence fail and the closing fence get mistaken for an opener.
    # The code body is matched non-greedily up to the next fence.
    pattern = r"```([^\n`]*)\n(.*?)```"
    matches = re.findall(pattern, markdown_content, re.DOTALL)

    code_blocks = []
    for index, (info_string, body) in enumerate(matches, start=1):
        code = body.strip()
        if not code:
            continue
        # split() also discards a trailing "\r" left by CRLF line endings.
        info_tokens = info_string.split()
        language = info_tokens[0] if info_tokens else "text"
        code_blocks.append({
            "id": f"snippet_{index}",
            "language": language,
            "code": code,
        })
    return code_blocks
def extract_external_links(markdown_content: str) -> List[str]:
    """
    Extract external links from markdown content.

    Scans for inline markdown links of the form [text](url) whose URL starts
    with http:// or https://, and drops those hosted on medium.com or any of
    its subdomains.

    Args:
        markdown_content: Markdown text to scan.

    Returns:
        A list of unique URLs, in order of first appearance.
    """
    # Local import so the function carries its own dependency.
    from urllib.parse import urlsplit

    # Regex for markdown links: [text](url)
    pattern = r"\[.*?\]\((https?://[^\s\)]+)\)"

    # A dict keeps insertion order, giving de-duplication with a
    # deterministic result (a plain set would return an arbitrary order).
    unique_links: Dict[str, None] = {}
    for url in re.findall(pattern, markdown_content):
        try:
            host = urlsplit(url).hostname or ""
        except ValueError:
            # Malformed authority (e.g. unbalanced "[" in the host): the host
            # cannot be determined, so it cannot be a Medium host; keep it.
            host = ""
        # Compare the hostname only, not the whole URL, so that external
        # URLs merely mentioning "medium.com" in their path or query, or
        # look-alike hosts such as "notmedium.com", are not filtered out.
        if host == "medium.com" or host.endswith(".medium.com"):
            continue
        unique_links.setdefault(url, None)
    return list(unique_links)
def extract_media_assets(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract media assets (images) from markdown content.

    Scans for inline markdown images of the form ![alt](url) or
    ![alt](url "title") whose URL starts with http:// or https://.

    Args:
        markdown_content: Markdown text to scan.

    Returns:
        A list of dictionaries with 'type' (always "image"), 'url', and
        'caption' (the image's alt text, possibly empty), in document order.
        Duplicates are kept.
    """
    # Regex for images: ![alt](url) with an optional quoted title after the
    # URL. Without the optional title group, a titled image fails to match
    # at its own closing parenthesis and the lazy alt group stretches
    # forward to a later image, yielding a garbage caption and a lost asset.
    pattern = (
        r"!\[(.*?)\]\((https?://[^\s\)]+)"
        r"(?:\s+(?:\"[^\"]*\"|'[^']*'))?\s*\)"
    )
    return [
        {"type": "image", "url": url, "caption": alt}
        for alt, url in re.findall(pattern, markdown_content)
    ]