Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List, Dict, Any | |
def extract_code_vault(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract fenced code blocks from markdown content.

    Args:
        markdown_content: Markdown text to scan for ``` fenced blocks.

    Returns:
        A list of dictionaries, one per non-empty block, with the keys:
        'id'       -- "snippet_N", where N is the 1-based position of the
                      block among all fenced blocks found (blocks skipped
                      for being empty therefore leave gaps),
        'language' -- first word of the fence's info string, or "text"
                      when the fence has none,
        'code'     -- the block content with surrounding whitespace stripped.
    """
    # Opening fence: the language is the first run of non-space characters
    # (so "c++", "objective-c", "shell-session" are accepted, not just \w+).
    # The rest of the info line -- extra attributes or a trailing "\r" from
    # CRLF files -- is consumed and ignored. Without this, a rejected opening
    # fence lets the *closing* fence be taken for an opener, and the prose
    # between two blocks is returned as code.
    # The content is matched non-greedily up to the next fence.
    pattern = r"```([^\s`]*)[^\n`]*\n(.*?)```"

    code_blocks = []
    fenced = re.findall(pattern, markdown_content, re.DOTALL)
    for position, (info_language, raw_body) in enumerate(fenced, start=1):
        code = raw_body.strip()
        if not code:
            # Whitespace-only blocks carry nothing worth storing.
            continue
        code_blocks.append({
            "id": f"snippet_{position}",
            "language": info_language or "text",
            "code": code,
        })
    return code_blocks
def extract_external_links(markdown_content: str) -> List[str]:
    """
    Extract external links from markdown content.

    Args:
        markdown_content: Markdown text to scan for [text](url) links.

    Returns:
        The unique http(s) URLs found, in order of first appearance,
        excluding links whose host is medium.com or one of its subdomains.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not bring in urllib.
    from urllib.parse import urlsplit

    # Markdown inline link: [text](url) with an optional quoted title after
    # the URL, e.g. [text](https://example.com "Title").
    pattern = (
        r"\[.*?\]"
        r"\((https?://[^\s)]+)"
        r'''(?:[ \t]+(?:"[^"]*"|'[^']*'))?'''
        r"[ \t]*\)"
    )

    # A dict is used as an insertion-ordered set so the result is
    # deterministic (first occurrence wins) instead of arbitrary set order.
    unique_links: Dict[str, None] = {}
    for url in re.findall(pattern, markdown_content):
        try:
            host = urlsplit(url).hostname or ""
        except ValueError:
            # Malformed authority (e.g. a stray "["): no host to compare,
            # so the link is treated as external.
            host = ""
        # Compare the host rather than the whole URL, so a mention of
        # "medium.com" in a path or query string does not drop the link.
        if host == "medium.com" or host.endswith(".medium.com"):
            continue
        unique_links.setdefault(url, None)
    return list(unique_links)
def extract_media_assets(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract media assets (images) from markdown content.

    Args:
        markdown_content: Markdown text to scan for ![alt](url) images.

    Returns:
        A list of dictionaries in document order, each with the keys
        'type' (always "image"), 'url' (the http(s) image URL) and
        'caption' (the image's alt text, which may be an empty string).
    """
    # Markdown image: ![alt](url) with an optional quoted title after the
    # URL, e.g. ![alt](https://example.com/a.png "Title"). The title is
    # accepted so such images are not dropped, but it is not captured.
    pattern = (
        r"!\[(.*?)\]"
        r"\((https?://[^\s)]+)"
        r'''(?:[ \t]+(?:"[^"]*"|'[^']*'))?'''
        r"[ \t]*\)"
    )
    return [
        {"type": "image", "url": image_url, "caption": alt_text}
        for alt_text, image_url in re.findall(pattern, markdown_content)
    ]