| import re | |
| import yaml | |
# Matches a leading frontmatter block delimited by `---` lines.
#   group(1): the raw text between the opening and closing delimiters
#   group(2): everything after the closing delimiter ("" when nothing follows)
# re.DOTALL lets `.` span newlines so both groups can cover several lines.
# The closing delimiter may be terminated either by a newline or by the end
# of the input (`\Z`), so a document that ends right after the closing `---`
# (no trailing newline) is still recognised as having frontmatter.
FRONTMATTER_REGEX = re.compile(
    r"^---\s*\n(.*?)\n---\s*(?:\n|\Z)(.*)", re.DOTALL
)
def extract_metadata_and_content(text: str):
    """
    Split a string into its frontmatter metadata and the remaining content.

    Returns a ``(metadata, content)`` tuple. When ``text`` has frontmatter
    (as recognised by ``FRONTMATTER_REGEX``) that parses to a mapping,
    ``metadata`` is that mapping and ``content`` is the text after the
    closing delimiter. A frontmatter block that parses to a falsy value
    (e.g. it is empty) yields ``{}`` together with the stripped content.

    In every other case -- no frontmatter, a parsing error, or frontmatter
    that parses to a truthy non-dict value -- the result is ``({}, text)``
    with ``text`` returned unchanged.
    """
    found = FRONTMATTER_REGEX.match(text)
    if found is None:
        return {}, text

    raw_frontmatter = found.group(1)
    remainder = found.group(2)

    try:
        parsed = yaml.safe_load(raw_frontmatter)
    except Exception:
        # Best-effort parsing: unparseable frontmatter is treated the same
        # as no frontmatter at all, leaving the input untouched.
        return {}, text

    # Falsy results (None, "", 0, [], ...) are normalised to an empty dict.
    metadata = parsed or {}
    if not isinstance(metadata, dict):
        return {}, text
    return metadata, remainder
def process_document(doc):
    """
    Strip frontmatter from a document and rebuild its metadata in place.

    Reads ``doc.page_content``, replaces it with the content returned by
    ``extract_metadata_and_content``, and replaces ``doc.metadata`` with a
    new dict built from the extracted frontmatter, in which the ``"url"``
    key is exposed as ``"source"``. Any metadata previously stored on
    ``doc`` is discarded. Returns the same ``doc`` object.
    """
    frontmatter, content = extract_metadata_and_content(doc.page_content)
    doc.page_content = content

    # "source" (taken from "url") is inserted first; a literal "source" key
    # in the frontmatter is applied afterwards and therefore overrides it.
    rebuilt = {"source": frontmatter["url"]} if "url" in frontmatter else {}
    rebuilt.update(
        (key, value) for key, value in frontmatter.items() if key != "url"
    )

    doc.metadata = rebuilt
    return doc