File size: 1,893 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
from typing import List, Dict, Any
from urllib.parse import urlparse

def extract_code_vault(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract fenced code blocks from markdown content.

    Args:
        markdown_content: Markdown text to scan for triple-backtick fences.

    Returns:
        One dictionary per non-empty code block, in document order, with:
        'id' ("snippet_<n>", where n is the 1-based position of the fence
        among all matched fences), 'language' (the fence's language tag, or
        "text" when the fence has none) and 'code' (the block body with
        surrounding whitespace stripped). Empty blocks are skipped but still
        consume a number, so ids may have gaps.
    """
    # The language tag may contain characters beyond \w ("c++", "c#",
    # "objective-c", "shell-session"), and the fence line may end with
    # trailing blanks or a CRLF line ending. Rejecting such an opening fence
    # would make the scanner treat the *closing* fence as an opener and
    # return the prose between two blocks as code.
    pattern = r"```([\w+#.-]+)?[ \t]*\r?\n(.*?)```"
    fences = re.findall(pattern, markdown_content, re.DOTALL)

    code_blocks = []
    for position, (tag, body) in enumerate(fences, start=1):
        code = body.strip()
        if not code:
            continue
        code_blocks.append({
            "id": f"snippet_{position}",
            "language": tag or "text",
            "code": code,
        })
    return code_blocks

def extract_external_links(markdown_content: str) -> List[str]:
    """
    Extract external (non-Medium) link targets from markdown content.

    Args:
        markdown_content: Markdown text to scan for ``[text](url)`` links.

    Returns:
        The unique http(s) URLs found, in order of first appearance. URLs
        whose host is medium.com or one of its subdomains are excluded.
        An optional quoted title after the URL (``[t](url "title")``) is
        accepted and ignored. Because the pattern does not exclude a leading
        ``!``, image targets written as ``![alt](url)`` are matched as well.
    """
    pattern = (
        r"\[.*?\]\((https?://[^\s\)]+)"
        # Optional link title in double or single quotes.
        r"""(?:\s+(?:"[^"]*"|'[^']*'))?\)"""
    )

    # A dict keeps insertion order, giving a deterministic result while still
    # de-duplicating.
    unique_links: Dict[str, None] = {}
    for url in re.findall(pattern, markdown_content):
        # Decide "internal" from the host only: a substring test on the whole
        # URL would also drop external links that merely mention medium.com
        # in their path or query, and hosts such as notmedium.com.
        try:
            host = (urlparse(url).hostname or "").lower()
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets): it cannot
            # be classified as a Medium host, so keep the link.
            host = ""
        if host == "medium.com" or host.endswith(".medium.com"):
            continue
        unique_links.setdefault(url, None)

    return list(unique_links)

def extract_media_assets(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract image assets from markdown content.

    Args:
        markdown_content: Markdown text to scan for ``![alt](url)`` images.

    Returns:
        One dictionary per image with an http(s) URL, in document order, with
        'type' (always "image"), 'url' and 'caption' (the image's alt text,
        possibly empty). An optional quoted title after the URL is accepted
        and ignored. Images with non-http(s) targets are skipped.
    """
    pattern = (
        # Alt text: anything without brackets, or one level of balanced
        # [...] inside it. A lazy ".*?" here would run past a skipped
        # (non-http) image and swallow it into the next image's caption.
        r"!\[((?:[^\[\]]|\[[^\[\]]*\])*)\]"
        r"\((https?://[^\s\)]+)"
        # Optional image title in double or single quotes.
        r"""(?:\s+(?:"[^"]*"|'[^']*'))?\)"""
    )

    return [
        {"type": "image", "url": url, "caption": alt}
        for alt, url in re.findall(pattern, markdown_content)
    ]