File size: 776 Bytes
d6703a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from datetime import datetime

KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]


def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
    metadata = {
        key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
    }
    return metadata


def process_metadata(
    metadata: dict[str, any],
) -> dict[str, any]:
    for key, value in metadata.items():
        # Remove large fields
        if key in KEYS_TO_EXCLUDE:
            del metadata[key]

        # Convert non-serializable fields to strings
        if (
            isinstance(value, datetime)
            or isinstance(value, list)
            or isinstance(value, dict)
        ):
            metadata[key] = str(value)
    return metadata