""" MarkdownMuse: A Python application for converting Markdown to beautifully formatted PDFs This module implements the core functionality needed for the MarkdownMuse application. """ import os import re import sys import glob import logging from typing import List, Dict, Any, Optional, Tuple from bs4 import BeautifulSoup import markdown from markdown.extensions.tables import TableExtension from markdown.extensions.fenced_code import FencedCodeExtension from markdown.extensions.toc import TocExtension from reportlab.lib.pagesizes import letter, A4 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import inch from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Preformatted, ListFlowable, ListItem from reportlab.lib.colors import HexColor, black, grey from reportlab.lib import colors from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT import html import base64 import requests from PIL import Image as PilImage import io import tempfile # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) class MarkdownToPDFConverter: """ Class to convert Markdown content to PDF using ReportLab. """ def __init__( self, output_path: str = "output.pdf", page_size: str = "A4", margins: Tuple[float, float, float, float] = (0.75, 0.75, 0.75, 0.75), font_name: str = "Helvetica", base_font_size: int = 10, heading_scale: Dict[int, float] = None, include_toc: bool = True, code_style: str = "github" ): """ Initialize the converter with configuration options. Args: output_path: Path to save the PDF page_size: Page size ("A4" or "letter") margins: Tuple of margins (left, right, top, bottom) in inches font_name: Base font name to use base_font_size: Base font size in points heading_scale: Dictionary of heading levels to font size multipliers include_toc: Whether to include a table of contents code_style: Style to use for code blocks """ self.output_path = output_path self.page_size = A4 if page_size.upper() == "A4" else letter self.margins = margins self.font_name = font_name self.base_font_size = base_font_size self.heading_scale = heading_scale or { 1: 2.0, # H1 is 2.0x base font size 2: 1.7, # H2 is 1.7x base font size 3: 1.4, # H3 is 1.4x base font size 4: 1.2, # H4 is 1.2x base font size 5: 1.1, # H5 is 1.1x base font size 6: 1.0 # H6 is 1.0x base font size } self.include_toc = include_toc self.code_style = code_style # Initialize styles self.styles = getSampleStyleSheet() self._setup_styles() # Initialize document elements self.elements = [] self.toc_entries = [] def _setup_styles(self) -> None: """Set up custom paragraph styles for the document.""" # Modify existing Normal style self.styles['Normal'].fontName = self.font_name self.styles['Normal'].fontSize = self.base_font_size self.styles['Normal'].leading = self.base_font_size * 1.2 self.styles['Normal'].spaceAfter = self.base_font_size * 0.8 # Heading styles for level in range(1, 7): size_multiplier = self.heading_scale.get(level, 1.0) heading_name = f'Heading{level}' # Check if the heading style already exists if heading_name in self.styles: # Modify existing style self.styles[heading_name].parent = self.styles['Normal'] self.styles[heading_name].fontName = f'{self.font_name}-Bold' self.styles[heading_name].fontSize = int(self.base_font_size * size_multiplier) self.styles[heading_name].leading = int(self.base_font_size * size_multiplier * 1.2) self.styles[heading_name].spaceAfter = self.base_font_size self.styles[heading_name].spaceBefore = self.base_font_size * (1 + (0.2 * (7 - level))) else: # Create new style self.styles.add( ParagraphStyle( name=heading_name, parent=self.styles['Normal'], fontName=f'{self.font_name}-Bold', fontSize=int(self.base_font_size * size_multiplier), leading=int(self.base_font_size * size_multiplier * 1.2), spaceAfter=self.base_font_size, spaceBefore=self.base_font_size * (1 + (0.2 * (7 - level))), ) ) # Code block style self.styles.add( ParagraphStyle( name='CodeBlock', fontName='Courier', fontSize=self.base_font_size * 0.9, leading=self.base_font_size * 1.1, spaceAfter=self.base_font_size, spaceBefore=self.base_font_size, leftIndent=self.base_font_size, backgroundColor=HexColor('#EEEEEE'), borderWidth=0, borderPadding=self.base_font_size * 0.5, ) ) # List item style self.styles.add( ParagraphStyle( name='ListItem', parent=self.styles['Normal'], leftIndent=self.base_font_size * 2, firstLineIndent=-self.base_font_size, ) ) # Table of contents styles self.styles.add( ParagraphStyle( name='TOCHeading', parent=self.styles['Heading1'], fontSize=int(self.base_font_size * 1.5), spaceAfter=self.base_font_size * 1.5, ) ) for level in range(1, 4): # Create styles for TOC levels self.styles.add( ParagraphStyle( name=f'TOC{level}', parent=self.styles['Normal'], leftIndent=self.base_font_size * (level - 1) * 2, fontSize=self.base_font_size - (level - 1), leading=self.base_font_size * 1.4, ) ) def convert_file(self, md_file_path: str) -> None: """ Convert a single markdown file to PDF. Args: md_file_path: Path to the markdown file """ # Read markdown content with open(md_file_path, 'r', encoding='utf-8') as f: md_content = f.read() # Convert markdown to PDF self.convert_content(md_content) def convert_content(self, md_content: str) -> None: """ Convert markdown content string to PDF. Args: md_content: Markdown content as a string """ # Convert markdown to HTML html_content = self._md_to_html(md_content) # Convert HTML to ReportLab elements self._html_to_elements(html_content) # Generate the PDF self._generate_pdf() logger.info(f"PDF created at {self.output_path}") def convert_multiple_files(self, md_file_paths: List[str], merge: bool = True, separate_toc: bool = False) -> None: """ Convert multiple markdown files to PDF. Args: md_file_paths: List of paths to markdown files merge: Whether to merge all files into a single PDF separate_toc: Whether to include a separate TOC for each file """ if merge: all_content = [] for file_path in md_file_paths: logger.info(f"Processing {file_path}") with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Add file name as heading if more than one file if len(md_file_paths) > 1: file_name = os.path.splitext(os.path.basename(file_path))[0] content = f"# {file_name}\n\n{content}" # Add page break between files if all_content: all_content.append("\n\n
\n\n") all_content.append(content) combined_content = "\n".join(all_content) self.convert_content(combined_content) else: # Process each file separately for i, file_path in enumerate(md_file_paths): converter = MarkdownToPDFConverter( output_path=f"{os.path.splitext(file_path)[0]}.pdf", page_size=self.page_size, margins=self.margins, font_name=self.font_name, base_font_size=self.base_font_size, heading_scale=self.heading_scale, include_toc=separate_toc, code_style=self.code_style ) converter.convert_file(file_path) def _md_to_html(self, md_content: str) -> str: """ Convert markdown content to HTML. Args: md_content: Markdown content Returns: HTML content """ # Define extensions for markdown conversion extensions = [ 'markdown.extensions.extra', 'markdown.extensions.smarty', TableExtension(), FencedCodeExtension(), TocExtension(toc_depth=3) if self.include_toc else None ] # Remove None values extensions = [ext for ext in extensions if ext is not None] # Convert markdown to HTML html_content = markdown.markdown(md_content, extensions=extensions) return html_content def _html_to_elements(self, html_content: str) -> None: """ Convert HTML content to ReportLab elements. Args: html_content: HTML content """ soup = BeautifulSoup(html_content, 'html.parser') # Process elements for element in soup.children: if element.name: self._process_element(element) def _process_element(self, element: BeautifulSoup) -> None: """ Process an HTML element and convert it to ReportLab elements. Args: element: BeautifulSoup element """ if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: level = int(element.name[1]) text = element.get_text() # Add to TOC if self.include_toc: self.toc_entries.append((level, text)) # Create heading paragraph self.elements.append( Paragraph(text, self.styles[f'Heading{level}']) ) elif element.name == 'p': text = self._process_inline_elements(element) self.elements.append( Paragraph(text, self.styles['Normal']) ) elif element.name == 'pre': code = element.get_text() self.elements.append( Preformatted(code, self.styles['CodeBlock']) ) elif element.name == 'img': src = element.get('src', '') alt = element.get('alt', 'Image') # Handle different image sources if src.startswith('http'): # Remote image try: response = requests.get(src) img_data = response.content img_stream = io.BytesIO(img_data) image = Image(img_stream, width=4*inch, height=3*inch) # Try to get actual dimensions try: pil_img = PilImage.open(img_stream) width, height = pil_img.size aspect = width / height max_width = 6 * inch if width > max_width: new_width = max_width new_height = new_width / aspect image = Image(img_stream, width=new_width, height=new_height) except: pass # Use default size if image can't be processed self.elements.append(image) except: # If image can't be retrieved, add a placeholder self.elements.append( Paragraph(f"[Image: {alt}]", self.styles['Normal']) ) elif src.startswith('data:image'): # Base64 encoded image try: # Extract base64 data b64_data = src.split(',')[1] img_data = base64.b64decode(b64_data) img_stream = io.BytesIO(img_data) image = Image(img_stream, width=4*inch, height=3*inch) self.elements.append(image) except: # If image can't be processed, add a placeholder self.elements.append( Paragraph(f"[Image: {alt}]", self.styles['Normal']) ) else: # Local image if os.path.exists(src): image = Image(src, width=4*inch, height=3*inch) self.elements.append(image) else: # If image can't be found, add a placeholder self.elements.append( Paragraph(f"[Image: {alt}]", self.styles['Normal']) ) elif element.name == 'ul' or element.name == 'ol': list_items = [] bullet_type = 'bullet' if element.name == 'ul' else 'numbered' for item in element.find_all('li', recursive=False): text = self._process_inline_elements(item) list_items.append( ListItem( Paragraph(text, self.styles['ListItem']), leftIndent=20 ) ) self.elements.append( ListFlowable( list_items, bulletType=bullet_type, start=1 if bullet_type == 'numbered' else None, bulletFormat='%s.' if bullet_type == 'numbered' else '%s' ) ) elif element.name == 'table': self._process_table(element) elif element.name == 'div' and 'page-break' in element.get('class', []): self.elements.append(PageBreak()) elif element.name == 'hr': self.elements.append(Spacer(1, 0.25*inch)) # Process children for complex elements elif element.name in ['div', 'blockquote', 'section', 'article']: for child in element.children: if hasattr(child, 'name') and child.name: self._process_element(child) def _process_inline_elements(self, element: BeautifulSoup) -> str: """ Process inline HTML elements like bold, italic, etc. Args: element: BeautifulSoup element Returns: Formatted text with ReportLab markup """ html_str = str(element) # Convert common HTML tags to ReportLab paragraph markup replacements = [ (r'(.*?)', r'\1'), (r'(.*?)', r'\1'), (r'(.*?)', r'\1'), (r'(.*?)', r'\1'), (r'(.*?)', r'\1'),
(r'(.*?)', r'\2'),
(r'(.*?)', r'\1'),
(r'