backend / scripts /utils.py
anujjoshi3105's picture
initial
22dcdfd
raw
history blame contribute delete
996 Bytes
import re
import yaml
# Recognises a document that opens with a `---`-fenced header.
#   group 1: the text between the opening and the closing `---` line
#   group 2: everything that follows the closing `---` line
# DOTALL lets `.` cross newlines, so both groups may span several lines.
# The pattern requires at least one line between the two fences and a newline
# after the closing fence; input that lacks either does not match.
FRONTMATTER_REGEX = re.compile(r"^---\s*\n(.*?)\n---\s*\n(.*)", flags=re.DOTALL)
def extract_metadata_and_content(text: str):
    """
    Split a document into its frontmatter metadata and the remaining content.

    The frontmatter is the block located by FRONTMATTER_REGEX at the very
    start of `text`; it is parsed with `yaml.safe_load`.

    Args:
        text: The raw document text, optionally starting with frontmatter.

    Returns:
        A `(metadata, content)` tuple.

        * When the frontmatter parses to a mapping (or to an empty/falsy
          value, which is treated as an empty mapping), `metadata` is that
          dict and `content` is the text after the closing fence.
        * In every other case (no frontmatter found, the YAML cannot be
          parsed, or it parses to something that is not a dict) the result
          is `({}, text)` with `text` returned unchanged.
    """
    import logging

    match = FRONTMATTER_REGEX.match(text)
    if not match:
        return {}, text

    try:
        # Only the parser call can raise, so it is the only thing guarded.
        metadata = yaml.safe_load(match.group(1)) or {}
    except Exception:
        # Best effort by design: a document with a broken header must not
        # abort the caller, so the catch stays broad. The failure is logged
        # instead of being dropped silently so bad headers can be found.
        logging.getLogger(__name__).warning(
            "Ignoring frontmatter that could not be parsed", exc_info=True
        )
        return {}, text

    if isinstance(metadata, dict):
        return metadata, match.group(2)

    # Valid YAML that is not a mapping (e.g. a list or a bare scalar) is not
    # usable as metadata; hand the document back untouched.
    return {}, text
def process_document(doc):
    """
    Move a document's frontmatter out of its text and into its metadata.

    `doc` is any object with a `page_content` attribute holding the text and
    an assignable `metadata` attribute. The object is modified in place:

    * `page_content` becomes the content returned by
      `extract_metadata_and_content` (the original text when no usable
      frontmatter is found).
    * `metadata` is replaced by a new dict built from the frontmatter, with
      the `url` key renamed to `source`. Whatever `metadata` held before is
      not carried over.

    Returns the same `doc` object.
    """
    front_matter, content = extract_metadata_and_content(doc.page_content)
    doc.page_content = content

    # Seed with the renamed key first so "source" leads the resulting dict.
    remapped = {"source": front_matter["url"]} if "url" in front_matter else {}
    # Copy every other key; a frontmatter "source" key overrides the value
    # derived from "url".
    remapped.update(
        (key, value) for key, value in front_matter.items() if key != "url"
    )

    doc.metadata = remapped
    return doc