Spaces:
Paused
Paused
| from datetime import datetime | |
# Metadata keys dropped before further handling.
KEYS_TO_EXCLUDE = [
    "content",
    "pages",
    "tables",
    "paragraphs",
    "sections",
    "figures",
]


def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
    """Return a copy of *metadata* without the keys in ``KEYS_TO_EXCLUDE``.

    The input mapping is left untouched; the surviving entries keep their
    original order and their values are passed through unchanged.
    """
    kept = {}
    for name, item in metadata.items():
        if name in KEYS_TO_EXCLUDE:
            continue
        kept[name] = item
    return kept
def process_metadata(
    metadata: dict[str, any],
) -> dict[str, any]:
    """Strip excluded fields from *metadata* and stringify complex values.

    The mapping is modified in place and the same object is returned.

    Args:
        metadata: Mapping of field names to values.

    Returns:
        The same ``metadata`` dict, with every key listed in
        ``KEYS_TO_EXCLUDE`` removed and every remaining ``datetime``,
        ``list`` or ``dict`` value replaced by its ``str()`` form.
    """
    # Iterate over a snapshot: deleting from a dict while iterating its live
    # items view raises "dictionary changed size during iteration".
    for key, value in list(metadata.items()):
        if key in KEYS_TO_EXCLUDE:
            del metadata[key]
            # Skip the conversion below, otherwise a removed list/dict/datetime
            # field would be written straight back as a string.
            continue
        if isinstance(value, (datetime, list, dict)):
            metadata[key] = str(value)
    return metadata