File size: 996 Bytes
22dcdfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import yaml

# Matches a frontmatter block at the very start of a string and captures:
#   group(1) - the text between the opening and closing ``---`` lines
#   group(2) - everything that follows the closing ``---`` line
#
# ``^`` is used without re.MULTILINE, so the opening ``---`` must be the first
# thing in the string. re.DOTALL lets ``.`` cross newlines so both groups can
# span multiple lines; ``(.*?)`` is lazy so the first closing ``---`` line wins.
#
# The closing delimiter may be followed either by a newline or directly by the
# end of the string (optionally after trailing whitespace). The newline
# alternative is tried first, so any input that ends the delimiter line with a
# newline yields exactly the same groups as before; the end-of-string
# alternative only adds support for documents that consist of frontmatter
# alone with no trailing newline, in which case group(2) is the empty string.
FRONTMATTER_REGEX = re.compile(
    r"^---\s*\n(.*?)\n---(?:\s*\n|\s*\Z)(.*)", re.DOTALL
)

def extract_metadata_and_content(text: str):
    """
    Split a string with leading frontmatter into a metadata dict and a body.

    Returns a ``(metadata, content)`` tuple. When FRONTMATTER_REGEX matches
    and the captured block loads via ``yaml.safe_load`` to a dict (or to a
    falsy value, which is treated as empty metadata), ``content`` is the text
    after the frontmatter. In every other case -- no match, a loading error,
    or a truthy non-dict result -- the input is returned unchanged as
    ``({}, text)``.
    """
    found = FRONTMATTER_REGEX.match(text)
    if found is None:
        return {}, text

    raw_block, remainder = found.group(1), found.group(2)

    try:
        parsed = yaml.safe_load(raw_block)
    except Exception:
        # Best-effort: any failure while loading the block means the text is
        # handed back untouched rather than raising to the caller.
        return {}, text

    if not parsed:
        # Empty or falsy frontmatter still counts as frontmatter: strip it.
        return {}, remainder
    if isinstance(parsed, dict):
        return parsed, remainder

    # Truthy but not a mapping (e.g. a list or scalar): leave the text alone.
    return {}, text

def process_document(doc):
    """
    Strip frontmatter from a document and move it into the document metadata.

    Reads ``doc.page_content``, replaces it with the text left after
    frontmatter extraction, and replaces ``doc.metadata`` with the extracted
    frontmatter, where a ``"url"`` entry is stored under ``"source"`` instead.
    The same ``doc`` object is mutated in place and returned.
    """
    frontmatter, content = extract_metadata_and_content(doc.page_content)
    doc.page_content = content

    # Seed with the renamed "url" entry first so "source" leads the key order;
    # a literal "source" key in the frontmatter is applied afterwards and
    # therefore takes precedence over the renamed value.
    renamed = {"source": frontmatter["url"]} if "url" in frontmatter else {}
    renamed.update(
        (key, value) for key, value in frontmatter.items() if key != "url"
    )

    doc.metadata = renamed
    return doc