"""Helpers for pulling code blocks, links and images out of Markdown text."""

import re
from typing import List, Dict, Any
from urllib.parse import urlsplit

# Fenced code block: opening fence, optional info string, body, closing fence.
# The info string is captured as "anything up to the newline" instead of
# ``\w+`` so that fences such as ```c++, ``` python or CRLF-terminated fences
# are still recognised as openers.  With ``\w+`` those openers were rejected,
# their *closing* fence was then taken for an opener, and the prose that
# followed could be returned as code.
_CODE_FENCE_RE = re.compile(r"```([^\n`]*)\n(.*?)```", re.DOTALL)

# "(http(s)://url)" or "(http(s)://url "optional title")"; captures the URL.
_HTTP_TARGET = r"""\((https?://[^\s)]+)(?:\s+(?:"[^"]*"|'[^']*'))?\)"""

# Inline link: [text](url).  Only the URL is captured.
_LINK_RE = re.compile(r"\[.*?\]" + _HTTP_TARGET)

# Image: ![alt](url).  The alt text may contain one level of balanced
# brackets but can never run past its own closing "]"; a lazy ``.*?`` here
# would let a match start at an earlier non-http image and swallow everything
# up to a later one into the caption.
_IMAGE_RE = re.compile(
    r"!\[((?:[^\[\]\n]|\[[^\[\]\n]*\])*)\]" + _HTTP_TARGET
)


def _is_medium_url(url: str) -> bool:
    """Return True when *url* points at medium.com or one of its subdomains."""
    try:
        host = urlsplit(url).hostname or ""
    except ValueError:
        # Unparseable URL: fall back to the plain substring test.
        return "medium.com" in url
    return host == "medium.com" or host.endswith(".medium.com")


def extract_code_vault(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extracts fenced code blocks from markdown content.

    Returns a list of dictionaries with the keys:
      - 'id': "snippet_<n>", where n is the 1-based position of the fence
        among all fences found (empty fences are skipped but still counted,
        so ids may have gaps),
      - 'language': first word of the fence info string, or "text" if absent,
      - 'code': the block body with surrounding whitespace stripped.

    Empty or missing input yields an empty list.
    """
    if not markdown_content:
        return []

    code_blocks = []
    fences = _CODE_FENCE_RE.findall(markdown_content)
    for position, (info, body) in enumerate(fences, start=1):
        code = body.strip()
        if not code:
            continue
        # The info string may carry extra attributes (```python title="x");
        # only its first token names the language.
        info_tokens = info.split()
        code_blocks.append({
            "id": f"snippet_{position}",
            "language": info_tokens[0] if info_tokens else "text",
            "code": code,
        })
    return code_blocks


def extract_external_links(markdown_content: str) -> List[str]:
    """
    Extracts external http(s) links from markdown content.

    Returns the unique URLs in order of first appearance. Links whose host is
    medium.com (or a subdomain of it) are treated as internal and dropped.
    Image links (![alt](url)) also match the link pattern and are included.

    Empty or missing input yields an empty list.
    """
    if not markdown_content:
        return []

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    unique_urls = dict.fromkeys(
        url
        for url in _LINK_RE.findall(markdown_content)
        if not _is_medium_url(url)
    )
    return list(unique_urls)


def extract_media_assets(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extracts media assets (images) from markdown content.

    Returns one dictionary per http(s) image, in document order, with the
    keys 'type' (always "image"), 'url' and 'caption' (the image alt text,
    possibly empty).

    Empty or missing input yields an empty list.
    """
    if not markdown_content:
        return []

    return [
        {"type": "image", "url": url, "caption": alt}
        for alt, url in _IMAGE_RE.findall(markdown_content)
    ]