fhoc / src /utils /parser_utils.py
Francisco Zanartu
feat: add parser utilities for cleaning Markdown and encoding PDFs to base64
462129a
raw
history blame contribute delete
569 Bytes
import re
import base64
def clean_markdown(text: str) -> str:
    """Remove the ```markdown / ``` fence that LLMs often wrap output in.

    Surrounding whitespace is trimmed *before* the fences are matched, so a
    response such as ``"\\n```markdown\\n# Title\\n```  \\n"`` is unwrapped
    correctly instead of keeping its opening fence.

    Args:
        text: Raw text, possibly wrapped in a fenced code block.

    Returns:
        The text without a leading ```markdown (case-insensitive) or bare
        ``` fence, without a trailing ``` fence, and with surrounding
        whitespace stripped. The two fences are removed independently of
        each other.
    """
    # Trim first: the patterns below are anchored to the very start and end
    # of the string, so stray whitespace around the fences would otherwise
    # prevent them from matching.
    text = text.strip()
    # Remove leading ```markdown or ``` (plus the newline that follows it).
    text = re.sub(r"^```(?:markdown)?\n?", "", text, flags=re.IGNORECASE)
    # Remove trailing ``` (plus the newline that precedes it).
    text = re.sub(r"\n?```$", "", text)
    return text.strip()
def encode_pdf_to_base64(file_path):
    """Read a local file and return its contents as a base64 text string.

    The file at ``file_path`` is opened in binary mode, read in full,
    base64-encoded, and the encoded bytes are decoded as UTF-8 so the
    result is a ``str``.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    # Encode after the file is closed; only the bytes are needed from here.
    b64_bytes = base64.b64encode(raw_bytes)
    return b64_bytes.decode("utf-8")