Spaces:

SatyamSinghal
/

MarkdownMuse

Running

App Files Files Community

Create app.py

by Prak2005 - opened Mar 17, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+967

-0

Files changed (1) hide show

app.py +967 -0

app.py ADDED Viewed

	@@ -0,0 +1,967 @@

+import os
+import re
+import sys
+import glob
+import argparse
+from typing import List, Dict, Any, Optional, Tuple
+import logging
+import markdown
+from markdown.extensions.tables import TableExtension
+from markdown.extensions.fenced_code import FencedCodeExtension
+from markdown.extensions.toc import TocExtension
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.units import inch
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Preformatted, ListFlowable, ListItem
+from reportlab.lib.colors import HexColor, black, grey
+from reportlab.lib import colors
+from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+import html
+from bs4 import BeautifulSoup
+import re
+from PIL import Image as PilImage
+import io
+import base64
+import requests
+from crewai import Agent, Task, Crew
+from dotenv import load_dotenv
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# Load environment variables
+load_dotenv()
+class MarkdownToPDFConverter:
+    """
+    Class to convert Markdown content to PDF using ReportLab.
+    """
+    def __init__(
+        self,
+        output_path: str = "output.pdf",
+        page_size: str = "A4",
+        margins: Tuple[float, float, float, float] = (0.75, 0.75, 0.75, 0.75),
+        font_name: str = "Helvetica",
+        base_font_size: int = 10,
+        heading_scale: Dict[int, float] = None,
+        include_toc: bool = True,
+        code_style: str = "github"
+    ):
+        """
+        Initialize the converter with configuration options.
+        Args:
+            output_path: Path to save the PDF
+            page_size: Page size ("A4" or "letter")
+            margins: Tuple of margins (left, right, top, bottom) in inches
+            font_name: Base font name to use
+            base_font_size: Base font size in points
+            heading_scale: Dictionary of heading levels to font size multipliers
+            include_toc: Whether to include a table of contents
+            code_style: Style to use for code blocks
+        """
+        self.output_path = output_path
+        self.page_size = A4 if page_size.upper() == "A4" else letter
+        self.margins = margins
+        self.font_name = font_name
+        self.base_font_size = base_font_size
+        self.heading_scale = heading_scale or {
+            1: 2.0,   # H1 is 2.0x base font size
+            2: 1.7,   # H2 is 1.7x base font size
+            3: 1.4,   # H3 is 1.4x base font size
+            4: 1.2,   # H4 is 1.2x base font size
+            5: 1.1,   # H5 is 1.1x base font size
+            6: 1.0    # H6 is 1.0x base font size
+        }
+        self.include_toc = include_toc
+        self.code_style = code_style
+        # Initialize styles
+        self.styles = getSampleStyleSheet()
+        self._setup_styles()
+        # Initialize document elements
+        self.elements = []
+        self.toc_entries = []
+    def _setup_styles(self) -> None:
+        """Set up custom paragraph styles for the document."""
+        # Modify existing Normal style instead of adding a duplicate
+        self.styles['Normal'].fontName = self.font_name
+        self.styles['Normal'].fontSize = self.base_font_size
+        self.styles['Normal'].leading = self.base_font_size * 1.2
+        self.styles['Normal'].spaceAfter = self.base_font_size * 0.8
+        # Heading styles - modify existing ones instead of adding new ones
+        for level in range(1, 7):
+            size_multiplier = self.heading_scale.get(level, 1.0)
+            heading_name = f'Heading{level}'
+            # Check if the heading style already exists
+            if heading_name in self.styles:
+                # Modify existing style
+                self.styles[heading_name].parent = self.styles['Normal']
+                self.styles[heading_name].fontName = f'{self.font_name}-Bold'
+                self.styles[heading_name].fontSize = int(self.base_font_size * size_multiplier)
+                self.styles[heading_name].leading = int(self.base_font_size * size_multiplier * 1.2)
+                self.styles[heading_name].spaceAfter = self.base_font_size
+                self.styles[heading_name].spaceBefore = self.base_font_size * (1 + (0.2 * (7 - level)))
+            else:
+                # Create new style if it doesn't exist
+                self.styles.add(
+                    ParagraphStyle(
+                        name=heading_name,
+                        parent=self.styles['Normal'],
+                        fontName=f'{self.font_name}-Bold',
+                        fontSize=int(self.base_font_size * size_multiplier),
+                        leading=int(self.base_font_size * size_multiplier * 1.2),
+                        spaceAfter=self.base_font_size,
+                        spaceBefore=self.base_font_size * (1 + (0.2 * (7 - level))),
+                    )
+                )
+        # Code block style
+        if 'CodeBlock' in self.styles:
+            # Modify existing style
+            self.styles['CodeBlock'].fontName = 'Courier'
+            self.styles['CodeBlock'].fontSize = self.base_font_size * 0.9
+            self.styles['CodeBlock'].leading = self.base_font_size * 1.1
+            self.styles['CodeBlock'].spaceAfter = self.base_font_size
+            self.styles['CodeBlock'].spaceBefore = self.base_font_size
+            self.styles['CodeBlock'].leftIndent = self.base_font_size
+            self.styles['CodeBlock'].backgroundColor = HexColor('#EEEEEE')
+            self.styles['CodeBlock'].borderWidth = 0
+            self.styles['CodeBlock'].borderPadding = self.base_font_size * 0.5
+        else:
+            # Create new style
+            self.styles.add(
+                ParagraphStyle(
+                    name='CodeBlock',
+                    fontName='Courier',
+                    fontSize=self.base_font_size * 0.9,
+                    leading=self.base_font_size * 1.1,
+                    spaceAfter=self.base_font_size,
+                    spaceBefore=self.base_font_size,
+                    leftIndent=self.base_font_size,
+                    backgroundColor=HexColor('#EEEEEE'),
+                    borderWidth=0,
+                    borderPadding=self.base_font_size * 0.5,
+                )
+            )
+        # List item style
+        if 'ListItem' in self.styles:
+            # Modify existing style
+            self.styles['ListItem'].parent = self.styles['Normal']
+            self.styles['ListItem'].leftIndent = self.base_font_size * 2
+            self.styles['ListItem'].firstLineIndent = -self.base_font_size
+        else:
+            # Create new style
+            self.styles.add(
+                ParagraphStyle(
+                    name='ListItem',
+                    parent=self.styles['Normal'],
+                    leftIndent=self.base_font_size * 2,
+                    firstLineIndent=-self.base_font_size,
+                )
+            )
+        # Table of contents styles
+        if 'TOCHeading' in self.styles:
+            # Modify existing style
+            self.styles['TOCHeading'].parent = self.styles['Heading1']
+            self.styles['TOCHeading'].fontSize = int(self.base_font_size * 1.5)
+            self.styles['TOCHeading'].spaceAfter = self.base_font_size * 1.5
+        else:
+            # Create new style
+            self.styles.add(
+                ParagraphStyle(
+                    name='TOCHeading',
+                    parent=self.styles['Heading1'],
+                    fontSize=int(self.base_font_size * 1.5),
+                    spaceAfter=self.base_font_size * 1.5,
+                )
+            )
+        for level in range(1, 4):  # Create styles for TOC levels
+            toc_name = f'TOC{level}'
+            if toc_name in self.styles:
+                # Modify existing style
+                self.styles[toc_name].parent = self.styles['Normal']
+                self.styles[toc_name].leftIndent = self.base_font_size * (level - 1) * 2
+                self.styles[toc_name].fontSize = self.base_font_size - (level - 1)
+                self.styles[toc_name].leading = self.base_font_size * 1.4
+            else:
+                # Create new style
+                self.styles.add(
+                    ParagraphStyle(
+                        name=toc_name,
+                        parent=self.styles['Normal'],
+                        leftIndent=self.base_font_size * (level - 1) * 2,
+                        fontSize=self.base_font_size - (level - 1),
+                        leading=self.base_font_size * 1.4,
+                    )
+                )
+    def convert_file(self, md_file_path: str) -> None:
+        """
+        Convert a single markdown file to PDF.
+        Args:
+            md_file_path: Path to the markdown file
+        """
+        # Read markdown content
+        with open(md_file_path, 'r', encoding='utf-8') as f:
+            md_content = f.read()
+        # Convert markdown to PDF
+        self.convert_content(md_content)
+    def convert_content(self, md_content: str) -> None:
+        """
+        Convert markdown content string to PDF.
+        Args:
+            md_content: Markdown content as a string
+        """
+        # Convert markdown to HTML
+        html_content = self._md_to_html(md_content)
+        # Convert HTML to ReportLab elements
+        self._html_to_elements(html_content)
+        # Generate the PDF
+        self._generate_pdf()
+        logger.info(f"PDF created at {self.output_path}")
+    def convert_multiple_files(self, md_file_paths: List[str],
+                              merge: bool = True,
+                              separate_toc: bool = False) -> None:
+        """
+        Convert multiple markdown files to PDF.
+        Args:
+            md_file_paths: List of paths to markdown files
+            merge: Whether to merge all files into a single PDF
+            separate_toc: Whether to include a separate TOC for each file
+        """
+        if merge:
+            all_content = []
+            for file_path in md_file_paths:
+                logger.info(f"Processing {file_path}")
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Add file name as heading if more than one file
+                if len(md_file_paths) > 1:
+                    file_name = os.path.splitext(os.path.basename(file_path))[0]
+                    content = f"# {file_name}\n\n{content}"
+                # Add page break between files
+                if all_content:
+                    all_content.append("\n\n<div class='page-break'></div>\n\n")
+                all_content.append(content)
+            combined_content = "\n".join(all_content)
+            self.convert_content(combined_content)
+        else:
+            # Process each file separately
+            for i, file_path in enumerate(md_file_paths):
+                converter = MarkdownToPDFConverter(
+                    output_path=f"{os.path.splitext(file_path)[0]}.pdf",
+                    page_size=self.page_size,
+                    margins=self.margins,
+                    font_name=self.font_name,
+                    base_font_size=self.base_font_size,
+                    heading_scale=self.heading_scale,
+                    include_toc=separate_toc,
+                    code_style=self.code_style
+                )
+                converter.convert_file(file_path)
+    def _md_to_html(self, md_content: str) -> str:
+        """
+        Convert markdown content to HTML.
+        Args:
+            md_content: Markdown content
+        Returns:
+            HTML content
+        """
+        # Define extensions for markdown conversion
+        extensions = [
+            'markdown.extensions.extra',
+            'markdown.extensions.smarty',
+            TableExtension(),
+            FencedCodeExtension(),
+            TocExtension(toc_depth=3) if self.include_toc else None
+        ]
+        # Remove None values
+        extensions = [ext for ext in extensions if ext is not None]
+        # Convert markdown to HTML
+        html_content = markdown.markdown(md_content, extensions=extensions)
+        return html_content
+    def _html_to_elements(self, html_content: str) -> None:
+        """
+        Convert HTML content to ReportLab elements.
+        Args:
+            html_content: HTML content
+        """
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Process elements
+        for element in soup.children:
+            if element.name:
+                self._process_element(element)
+    def _process_element(self, element: BeautifulSoup) -> None:
+        """
+        Process an HTML element and convert it to ReportLab elements.
+        Args:
+            element: BeautifulSoup element
+        """
+        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            level = int(element.name[1])
+            text = element.get_text()
+            # Add to TOC
+            if self.include_toc:
+                self.toc_entries.append((level, text))
+            # Create heading paragraph
+            self.elements.append(
+                Paragraph(text, self.styles[f'Heading{level}'])
+            )
+        elif element.name == 'p':
+            text = self._process_inline_elements(element)
+            self.elements.append(
+                Paragraph(text, self.styles['Normal'])
+            )
+        elif element.name == 'pre':
+            code = element.get_text()
+            self.elements.append(
+                Preformatted(code, self.styles['CodeBlock'])
+            )
+        elif element.name == 'img':
+            src = element.get('src', '')
+            alt = element.get('alt', 'Image')
+            # Handle different image sources
+            if src.startswith('http'):
+                # Remote image
+                try:
+                    response = requests.get(src)
+                    img_data = response.content
+                    img_stream = io.BytesIO(img_data)
+                    image = Image(img_stream, width=4*inch, height=3*inch)
+                    # Try to get actual dimensions
+                    try:
+                        pil_img = PilImage.open(img_stream)
+                        width, height = pil_img.size
+                        aspect = width / height
+                        max_width = 6 * inch
+                        if width > max_width:
+                            new_width = max_width
+                            new_height = new_width / aspect
+                            image = Image(img_stream, width=new_width, height=new_height)
+                    except:
+                        pass  # Use default size if image can't be processed
+                    self.elements.append(image)
+                except:
+                    # If image can't be retrieved, add a placeholder
+                    self.elements.append(
+                        Paragraph(f"[Image: {alt}]", self.styles['Normal'])
+                    )
+            elif src.startswith('data:image'):
+                # Base64 encoded image
+                try:
+                    # Extract base64 data
+                    b64_data = src.split(',')[1]
+                    img_data = base64.b64decode(b64_data)
+                    img_stream = io.BytesIO(img_data)
+                    image = Image(img_stream, width=4*inch, height=3*inch)
+                    self.elements.append(image)
+                except:
+                    # If image can't be processed, add a placeholder
+                    self.elements.append(
+                        Paragraph(f"[Image: {alt}]", self.styles['Normal'])
+                    )
+            else:
+                # Local image
+                if os.path.exists(src):
+                    image = Image(src, width=4*inch, height=3*inch)
+                    self.elements.append(image)
+                else:
+                    # If image can't be found, add a placeholder
+                    self.elements.append(
+                        Paragraph(f"[Image: {alt}]", self.styles['Normal'])
+                    )
+        elif element.name == 'ul' or element.name == 'ol':
+            list_items = []
+            bullet_type = 'bullet' if element.name == 'ul' else 'numbered'
+            for item in element.find_all('li', recursive=False):
+                text = self._process_inline_elements(item)
+                list_items.append(
+                    ListItem(
+                        Paragraph(text, self.styles['ListItem']),
+                        leftIndent=20
+                    )
+                )
+            self.elements.append(
+                ListFlowable(
+                    list_items,
+                    bulletType='bullet',  # Always use 'bullet' as the bulletType
+                    start=None,  # Don't use numeric start to avoid int decode errors
+                    bulletFormat='%s.' if bullet_type == 'numbered' else '%s'  # Use string formatting for numbers
+                )
+            )
+        elif element.name == 'table':
+            self._process_table(element)
+        elif element.name == 'div' and 'page-break' in element.get('class', []):
+            self.elements.append(PageBreak())
+        elif element.name == 'hr':
+            self.elements.append(Spacer(1, 0.25*inch))
+        # Process children for complex elements
+        elif element.name in ['div', 'blockquote', 'section', 'article']:
+            for child in element.children:
+                if hasattr(child, 'name') and child.name:
+                    self._process_element(child)
+    def _process_inline_elements(self, element: BeautifulSoup) -> str:
+        """
+        Process inline HTML elements like bold, italic, etc.
+        Args:
+            element: BeautifulSoup element
+        Returns:
+            Formatted text with ReportLab markup
+        """
+        html_str = str(element)
+        # Convert common HTML tags to ReportLab paragraph markup
+        replacements = [
+            (r'<strong>(.*?)</strong>', r'<b>\1</b>'),
+            (r'<b>(.*?)</b>', r'<b>\1</b>'),
+            (r'<em>(.*?)</em>', r'<i>\1</i>'),
+            (r'<i>(.*?)</i>', r'<i>\1</i>'),
+            (r'<code>(.*?)</code>', r'<font name="Courier">\1</font>'),
+            (r'<a href="(.*?)">(.*?)</a>', r'<link href="\1">\2</link>'),
+            (r'<u>(.*?)</u>', r'<u>\1</u>'),
+            (r'<strike>(.*?)</strike>', r'<strike>\1</strike>'),
+            (r'<del>(.*?)</del>', r'<strike>\1</strike>'),
+        ]
+        for pattern, replacement in replacements:
+            html_str = re.sub(pattern, replacement, html_str, flags=re.DOTALL)
+        # Extract text with our ReportLab markup from the modified HTML
+        soup = BeautifulSoup(html_str, 'html.parser')
+        return soup.get_text()
+    def _process_table(self, table_element: BeautifulSoup) -> None:
+        """
+        Process an HTML table into a ReportLab Table.
+        Args:
+            table_element: BeautifulSoup table element
+        """
+        rows = []
+        # Extract header row
+        thead = table_element.find('thead')
+        if thead:
+            header_cells = []
+            for th in thead.find_all(['th']):
+                text = self._process_inline_elements(th)
+                # Create a paragraph with bold text for headers
+                header_cells.append(Paragraph(f"<b>{text}</b>", self.styles['Normal']))
+            rows.append(header_cells)
+        # Extract body rows
+        tbody = table_element.find('tbody') or table_element
+        for tr in tbody.find_all('tr'):
+            if tr.parent.name == 'thead':
+                continue  # Skip header rows already processed
+            row_cells = []
+            for cell in tr.find_all(['td', 'th']):
+                text = self._process_inline_elements(cell)
+                if cell.name == 'th':
+                    # Headers are bold
+                    row_cells.append(Paragraph(f"<b>{text}</b>", self.styles['Normal']))
+                else:
+                    row_cells.append(Paragraph(text, self.styles['Normal']))
+            if row_cells:  # Only add non-empty rows
+                rows.append(row_cells)
+        if rows:
+            # Create table and style
+            col_widths = [None] * len(rows[0])  # Auto width for columns
+            table = Table(rows, colWidths=col_widths)
+            # Add basic grid and header styling
+            style = TableStyle([
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.Color(0.7, 0.7, 0.7)),
+                ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.8, 0.8, 0.8)),
+                ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
+                ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
+                ('FONTNAME', (0, 0), (-1, 0), f'{self.font_name}-Bold'),
+                ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
+                ('TOPPADDING', (0, 0), (-1, 0), 8),
+                ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
+                ('TOPPADDING', (0, 1), (-1, -1), 6),
+            ])
+            table.setStyle(style)
+            self.elements.append(table)
+            # Add some space after the table
+            self.elements.append(Spacer(1, 0.1*inch))
+    def _generate_toc(self) -> None:
+        """Generate a table of contents."""
+        if not self.toc_entries:
+            return
+        self.elements.append(Paragraph("Table of Contents", self.styles['TOCHeading']))
+        self.elements.append(Spacer(1, 0.2*inch))
+        for level, text in self.toc_entries:
+            if level <= 3:  # Only include headings up to level 3
+                self.elements.append(
+                    Paragraph(text, self.styles[f'TOC{level}'])
+                )
+        self.elements.append(PageBreak())
+    def _generate_pdf(self) -> None:
+        """Generate the PDF document."""
+        # Create the document
+        doc = SimpleDocTemplate(
+            self.output_path,
+            pagesize=self.page_size,
+            leftMargin=self.margins[0]*inch,
+            rightMargin=self.margins[1]*inch,
+            topMargin=self.margins[2]*inch,
+            bottomMargin=self.margins[3]*inch
+        )
+        # Add TOC if requested
+        if self.include_toc and self.toc_entries:
+            self._generate_toc()
+        # Build the PDF
+        doc.build(self.elements)
+class MarkdownToPDFAgent:
+    """
+    AI Agent to convert Markdown files to PDF with enhanced formatting.
+    """
+    def __init__(self, llm=None):
+        """
+        Initialize the agent with optional LLM for content enhancement.
+        Args:
+            llm: Optional language model for content enhancement
+        """
+        self.llm = llm
+        self.converter = MarkdownToPDFConverter()
+        # Try to set up Gemini as the default LLM if no LLM is provided
+        if not self.llm:
+            self.setup_from_gemini()
+    def setup_from_openai(self, api_key=None):
+        """
+        Setup agent with OpenAI LLM.
+        Args:
+            api_key: OpenAI API key (will use env var if not provided)
+        """
+        try:
+            from langchain_openai import ChatOpenAI
+            api_key = api_key or os.getenv("OPENAI_API_KEY")
+            if not api_key:
+                logger.warning("No OpenAI API key provided. Agent will run without LLM enhancement.")
+                return False
+            self.llm = ChatOpenAI(
+                model="gpt-4",
+                temperature=0.1,
+                api_key=api_key
+            )
+            return True
+        except ImportError:
+            logger.warning("LangChain OpenAI package not found. Install with 'pip install langchain-openai'")
+            return False
+    def setup_from_gemini(self, api_key=None):
+        """
+        Setup agent with Google Gemini LLM.
+        Args:
+            api_key: Google Gemini API key (will use env var if not provided)
+        """
+        try:
+            from langchain_google_genai import ChatGoogleGenerativeAI
+            api_key = api_key or os.getenv("GOOGLE_API_KEY")
+            if not api_key:
+                logger.warning("No Google API key provided. Agent will run without LLM enhancement.")
+                return False
+            try:
+                # Use the latest Gemini model version
+                self.llm = ChatGoogleGenerativeAI(
+                    model="gemini-1.5-flash",  # Updated to a valid model name
+                    temperature=0.1,
+                    google_api_key=api_key,
+                    convert_system_message_to_human=True  # Required for Gemini models
+                )
+                logger.info("Successfully set up Google Gemini LLM")
+                return True
+            except Exception as e:
+                logger.error(f"Error setting up Google Gemini LLM: {str(e)}")
+                return False
+        except ImportError:
+            logger.warning("LangChain Google Generative AI package not found. Install with 'pip install langchain-google-genai'")
+            return False
+    def enhance_markdown(self, content: str, instructions: str = None) -> str:
+        """
+        Enhance markdown content using LLM if available.
+        Args:
+            content: Original markdown content
+            instructions: Specific enhancement instructions
+        Returns:
+            Enhanced markdown content
+        """
+        if not self.llm:
+            logger.warning("No LLM available for enhancement. Returning original content.")
+            return content
+        default_instructions = """
+        Enhance this markdown content while preserving its structure and meaning.
+        Make the following improvements:
+        1. Fix any grammar or spelling issues
+        2. Improve formatting for better readability
+        3. Ensure proper markdown syntax is used
+        4. Add appropriate section headings if missing
+        5. Keep the content factually identical to the original
+        """
+        instructions = instructions or default_instructions
+        try:
+            # Create a prompt for the LLM
+            prompt = f"{instructions}\n\nOriginal content:\n\n{content}\n\nPlease provide the enhanced markdown content:"
+            # Use the LLM directly with proper error handling
+            try:
+                from langchain.schema import HumanMessage
+                logger.info(f"Using LLM type: {type(self.llm).__name__}")
+                messages = [HumanMessage(content=prompt)]
+                result = self.llm.invoke(messages).content
+                logger.info("Successfully received response from LLM")
+            except Exception as e:
+                logger.error(f"Error invoking LLM: {str(e)}")
+                return content
+            # Clean up the result (extract just the markdown part)
+            result = self._clean_agent_output(result)
+            return result
+        except Exception as e:
+            logger.error(f"Error enhancing markdown: {str(e)}")
+            return content  # Return original content if enhancement fails
+    def _clean_agent_output(self, output: str) -> str:
+        """
+        Clean up agent output to extract just the markdown content.
+        Args:
+            output: Raw agent output
+        Returns:
+            Cleaned markdown content
+        """
+        # Check if the output is wrapped in markdown code blocks
+        md_pattern = r"```(?:markdown|md)?\s*([\s\S]*?)```"
+        match = re.search(md_pattern, output)
+        if match:
+            return match.group(1).strip()
+        # If no markdown blocks found, remove any agent commentary
+        lines = output.split('\n')
+        result_lines = []
+        capture = False
+        for line in lines:
+            if capture or not (line.startswith("I") or line.startswith("Here") or line.startswith("The")):
+                capture = True
+                result_lines.append(line)
+        return '\n'.join(result_lines)
+    def process_file(self, input_path: str, output_path: str = None, enhance: bool = False,
+                     enhancement_instructions: str = None, page_size: str = "A4") -> str:
+        """
+        Process a single markdown file and convert it to PDF.
+        Args:
+            input_path: Path to input markdown file
+            output_path: Path for output PDF (defaults to input path with .pdf extension)
+            enhance: Whether to enhance the content with LLM
+            enhancement_instructions: Specific instructions for enhancement
+            page_size: Page size for the PDF ("A4" or "letter")
+        Returns:
+            Path to the generated PDF
+        """
+        # Validate input file
+        if not os.path.exists(input_path):
+            logger.error(f"Input file not found: {input_path}")
+            return None
+        # Set default output path if not provided
+        if not output_path:
+            output_path = os.path.splitext(input_path)[0] + ".pdf"
+        # Read markdown content
+        with open(input_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Enhance content if requested
+        if enhance and self.llm:
+            logger.info(f"Enhancing content for {input_path}")
+            content = self.enhance_markdown(content, enhancement_instructions)
+        # Configure converter
+        self.converter = MarkdownToPDFConverter(
+            output_path=output_path,
+            page_size=page_size
+        )
+        # Convert to PDF
+        logger.info(f"Converting {input_path} to PDF")
+        self.converter.convert_content(content)
+        return output_path
+    def process_directory(self, input_dir: str, output_dir: str = None, pattern: str = "*.md",
+                        enhance: bool = False, merge: bool = False,
+                        output_filename: str = "merged_document.pdf",
+                        page_size: str = "A4") -> List[str]:
+        """
+        Process all markdown files in a directory.
+        Args:
+            input_dir: Path to input directory
+            output_dir: Path to output directory (defaults to input directory)
+            pattern: Glob pattern for markdown files
+            enhance: Whether to enhance content with LLM
+            merge: Whether to merge all files into a single PDF
+            output_filename: Filename for merged PDF
+            page_size: Page size for the PDF ("A4" or "letter")
+        Returns:
+            List of paths to generated PDFs
+        """
+        # Validate input directory
+        if not os.path.isdir(input_dir):
+            logger.error(f"Input directory not found: {input_dir}")
+            return []
+        # Set default output directory if not provided
+        if not output_dir:
+            output_dir = input_dir
+        elif not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        # Get all markdown files
+        md_files = glob.glob(os.path.join(input_dir, pattern))
+        if not md_files:
+            logger.warning(f"No markdown files found in {input_dir} with pattern {pattern}")
+            return []
+        # Sort files to ensure consistent ordering
+        md_files.sort()
+        if merge:
+            logger.info(f"Merging {len(md_files)} markdown files into a single PDF")
+            # Process each file for enhancement if requested
+            if enhance and self.llm:
+                enhanced_contents = []
+                for md_file in md_files:
+                    logger.info(f"Enhancing content for {md_file}")
+                    with open(md_file, 'r', encoding='utf-8') as f:
+                        content = f.read()
+                    # Add file name as heading
+                    file_name = os.path.splitext(os.path.basename(md_file))[0]
+                    content = f"# {file_name}\n\n{content}"
+                    enhanced_content = self.enhance_markdown(content)
+                    enhanced_contents.append(enhanced_content)
+                # Merge enhanced contents with page breaks
+                merged_content = "\n\n<div class='page-break'></div>\n\n".join(enhanced_contents)
+                # Convert merged content
+                output_path = os.path.join(output_dir, output_filename)
+                self.converter = MarkdownToPDFConverter(
+                    output_path=output_path,
+                    page_size=page_size
+                )
+                self.converter.convert_content(merged_content)
+                return [output_path]
+            else:
+                # Merge without enhancement
+                output_path = os.path.join(output_dir, output_filename)
+                self.converter = MarkdownToPDFConverter(
+                    output_path=output_path,
+                    page_size=page_size
+                )
+                self.converter.convert_multiple_files(md_files, merge=True)
+                return [output_path]
+        else:
+            # Process each file individually
+            output_files = []
+            for md_file in md_files:
+                output_filename = os.path.splitext(os.path.basename(md_file))[0] + ".pdf"
+                output_path = os.path.join(output_dir, output_filename)
+                processed_file = self.process_file(
+                    md_file,
+                    output_path,
+                    enhance=enhance,
+                    page_size=page_size
+                )
+                if processed_file:
+                    output_files.append(processed_file)
+            return output_files
+def main():
+    """Main function for command-line usage."""
+    parser = argparse.ArgumentParser(description="Convert Markdown files to PDF")
+    # Input arguments
+    parser.add_argument("input", help="Input markdown file or directory")
+    parser.add_argument("-o", "--output", help="Output PDF file or directory")
+    parser.add_argument("-p", "--pattern", default="*.md", help="File pattern for markdown files in directory mode")
+    # Options
+    parser.add_argument("--enhance", action="store_true", help="Enhance markdown content using LLM")
+    parser.add_argument("--merge", action="store_true", help="Merge multiple markdown files into a single PDF")
+    parser.add_argument("--page-size", choices=["A4", "letter"], default="A4", help="Page size (A4 or letter)")
+    parser.add_argument("--toc", action="store_true", default=True, help="Include table of contents")
+    parser.add_argument("--no-toc", action="store_false", dest="toc", help="Exclude table of contents")
+    parser.add_argument("--font-size", type=int, default=10, help="Base font size in points")
+    parser.add_argument("--margins", type=float, nargs=4, default=(0.75, 0.75, 0.75, 0.75),
+                        metavar=("LEFT", "RIGHT", "TOP", "BOTTOM"),
+                        help="Page margins in inches (left right top bottom)")
+    # LLM options
+    parser.add_argument("--llm", choices=["openai", "gemini", "none"], default="none",
+                        help="LLM provider for content enhancement")
+    parser.add_argument("--api-key", help="API key for LLM provider (will use env var if not provided)")
+    args = parser.parse_args()
+    # Initialize agent
+    agent = MarkdownToPDFAgent()
+    # Setup LLM if requested
+    if args.enhance and args.llm != "none":
+        if args.llm == "openai":
+            success = agent.setup_from_openai(args.api_key)
+            if not success:
+                logger.warning("Could not initialize OpenAI LLM. Enhancement disabled.")
+                args.enhance = False
+        elif args.llm == "gemini":
+            success = agent.setup_from_gemini(args.api_key)
+            if not success:
+                logger.warning("Could not initialize Gemini LLM. Enhancement disabled.")
+                args.enhance = False
+    # Process input
+    if os.path.isdir(args.input):
+        # Directory mode
+        output_files = agent.process_directory(
+            args.input,
+            args.output,
+            args.pattern,
+            enhance=args.enhance,
+            merge=args.merge,
+            output_filename=os.path.basename(args.output) if args.output and args.merge else "merged_document.pdf",
+            page_size=args.page_size
+        )
+        if output_files:
+            logger.info(f"Generated {len(output_files)} PDF files:")
+            for output_file in output_files:
+                logger.info(f"  - {output_file}")
+        else:
+            logger.error("No PDFs were generated.")
+    else:
+        # Single file mode
+        output_file = agent.process_file(
+            args.input,
+            args.output,
+            enhance=args.enhance,
+            page_size=args.page_size
+        )
+        if output_file:
+            logger.info(f"Generated PDF: {output_file}")
+        else:
+            logger.error("PDF generation failed.")
+if __name__ == "__main__":
+    main()