Files changed (1) hide show
  1. app.py +967 -0
app.py ADDED
@@ -0,0 +1,967 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import glob
5
+ import argparse
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ import logging
8
+ import markdown
9
+ from markdown.extensions.tables import TableExtension
10
+ from markdown.extensions.fenced_code import FencedCodeExtension
11
+ from markdown.extensions.toc import TocExtension
12
+ from reportlab.lib.pagesizes import letter, A4
13
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
14
+ from reportlab.lib.units import inch
15
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Preformatted, ListFlowable, ListItem
16
+ from reportlab.lib.colors import HexColor, black, grey
17
+ from reportlab.lib import colors
18
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
19
+ from reportlab.pdfbase import pdfmetrics
20
+ from reportlab.pdfbase.ttfonts import TTFont
21
+ import html
22
+ from bs4 import BeautifulSoup
23
+ import re
24
+ from PIL import Image as PilImage
25
+ import io
26
+ import base64
27
+ import requests
28
+ from crewai import Agent, Task, Crew
29
+ from dotenv import load_dotenv
30
+
31
# Configure root logging once, at import time: INFO level, with each record
# showing its timestamp, logger name and severity ahead of the message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the converter, the agent and the CLI below.
logger = logging.getLogger(__name__)

# Read a local .env file (if any) into the process environment; the LLM
# setup helpers later look up their API keys with os.getenv().
load_dotenv()
40
+
41
class MarkdownToPDFConverter:
    """
    Convert Markdown content to a PDF document using ReportLab.

    The pipeline is: Markdown text -> HTML (python-markdown) -> BeautifulSoup
    tree -> list of ReportLab flowables -> PDF written to ``output_path``.
    """

    # Inline HTML tags and the (opening, closing) ReportLab paragraph markup
    # that replaces them when building Paragraph text.
    _INLINE_MARKUP = {
        "strong": ("<b>", "</b>"),
        "b": ("<b>", "</b>"),
        "em": ("<i>", "</i>"),
        "i": ("<i>", "</i>"),
        "u": ("<u>", "</u>"),
        "strike": ("<strike>", "</strike>"),
        "del": ("<strike>", "</strike>"),
        "code": ('<font name="Courier">', "</font>"),
    }

    # Bold variants of the standard PDF fonts; "<name>-Bold" is not a valid
    # font name for every family (e.g. Times-Roman -> Times-Bold).
    _BOLD_FONTS = {
        "Helvetica": "Helvetica-Bold",
        "Helvetica-Oblique": "Helvetica-BoldOblique",
        "Times-Roman": "Times-Bold",
        "Times-Italic": "Times-BoldItalic",
        "Courier": "Courier-Bold",
        "Courier-Oblique": "Courier-BoldOblique",
    }

    # bs4 string node classes that are not document text. Compared by class
    # name because only ``BeautifulSoup`` itself is imported by this module.
    _NON_TEXT_NODES = frozenset(
        {"Comment", "Declaration", "Doctype", "ProcessingInstruction"}
    )

    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    _CONTAINER_TAGS = ("div", "blockquote", "section", "article")

    # Seconds to wait for a remote image before falling back to a placeholder.
    _IMAGE_TIMEOUT = 10
    # Upper bound for rendered image width, in inches.
    _MAX_IMAGE_WIDTH_INCHES = 6

    def __init__(
        self,
        output_path: str = "output.pdf",
        page_size: Any = "A4",
        margins: Tuple[float, float, float, float] = (0.75, 0.75, 0.75, 0.75),
        font_name: str = "Helvetica",
        base_font_size: int = 10,
        heading_scale: Optional[Dict[int, float]] = None,
        include_toc: bool = True,
        code_style: str = "github"
    ):
        """
        Initialize the converter with configuration options.

        Args:
            output_path: Path to save the PDF
            page_size: Page size name ("A4"; any other string means letter)
                or an already-resolved ``(width, height)`` pair in points
            margins: Tuple of margins (left, right, top, bottom) in inches
            font_name: Base font name to use
            base_font_size: Base font size in points
            heading_scale: Dictionary of heading levels to font size multipliers
            include_toc: Whether to include a table of contents
            code_style: Style name for code blocks (stored; not used by the
                rendering code in this class)
        """
        self.output_path = output_path
        self.page_size = self._resolve_page_size(page_size)
        self.margins = margins
        self.font_name = font_name
        self.base_font_size = base_font_size
        self.heading_scale = heading_scale or {
            1: 2.0,  # H1 is 2.0x base font size
            2: 1.7,  # H2 is 1.7x base font size
            3: 1.4,  # H3 is 1.4x base font size
            4: 1.2,  # H4 is 1.2x base font size
            5: 1.1,  # H5 is 1.1x base font size
            6: 1.0,  # H6 is 1.0x base font size
        }
        self.include_toc = include_toc
        self.code_style = code_style

        # Initialize styles
        self.styles = getSampleStyleSheet()
        self._setup_styles()

        # Initialize document elements
        self.elements = []
        self.toc_entries = []

    @staticmethod
    def _resolve_page_size(page_size: Any) -> Tuple[float, float]:
        """
        Turn a page-size argument into a ``(width, height)`` pair.

        Strings are interpreted as before ("A4", case-insensitive, otherwise
        letter). A two-item sequence is passed through, which lets a converter
        hand its own resolved ``page_size`` to a new instance.
        """
        if isinstance(page_size, str):
            return A4 if page_size.upper() == "A4" else letter
        width, height = page_size
        return (width, height)

    @classmethod
    def _bold_font_name(cls, font_name: str) -> str:
        """Return the bold counterpart of ``font_name``."""
        if font_name in cls._BOLD_FONTS:
            return cls._BOLD_FONTS[font_name]
        if "bold" in font_name.lower():
            return font_name  # already a bold face
        return f"{font_name}-Bold"

    @staticmethod
    def _combine_documents(documents: List[Tuple[str, str]]) -> str:
        """
        Merge ``(file_path, markdown)`` pairs into one Markdown string.

        When there is more than one document, each is prefixed with an H1
        built from its file name, and a page-break marker ``div`` is placed
        between consecutive documents.
        """
        add_titles = len(documents) > 1
        parts = []
        for file_path, content in documents:
            if add_titles:
                title = os.path.splitext(os.path.basename(file_path))[0]
                content = f"# {title}\n\n{content}"
            if parts:
                parts.append("\n\n<div class='page-break'></div>\n\n")
            parts.append(content)
        return "\n".join(parts)

    def _upsert_style(self, name: str, **attrs: Any) -> None:
        """Update the named style in place, or add it if it does not exist."""
        if name in self.styles:
            style = self.styles[name]
            for key, value in attrs.items():
                setattr(style, key, value)
        else:
            self.styles.add(ParagraphStyle(name=name, **attrs))

    def _setup_styles(self) -> None:
        """Set up custom paragraph styles for the document."""
        size = self.base_font_size
        bold_font = self._bold_font_name(self.font_name)

        # Modify the existing Normal style instead of adding a duplicate.
        normal = self.styles['Normal']
        normal.fontName = self.font_name
        normal.fontSize = size
        normal.leading = size * 1.2
        normal.spaceAfter = size * 0.8

        # Heading styles: font size scales with the configured multiplier,
        # and higher-level headings get more space before them.
        for level in range(1, 7):
            heading_size = size * self.heading_scale.get(level, 1.0)
            self._upsert_style(
                f'Heading{level}',
                parent=normal,
                fontName=bold_font,
                fontSize=int(heading_size),
                leading=int(heading_size * 1.2),
                spaceAfter=size,
                spaceBefore=size * (1 + (0.2 * (7 - level))),
            )

        # Code block style
        self._upsert_style(
            'CodeBlock',
            fontName='Courier',
            fontSize=size * 0.9,
            leading=size * 1.1,
            spaceAfter=size,
            spaceBefore=size,
            leftIndent=size,
            backgroundColor=HexColor('#EEEEEE'),
            borderWidth=0,
            borderPadding=size * 0.5,
        )

        # List item style
        self._upsert_style(
            'ListItem',
            parent=normal,
            leftIndent=size * 2,
            firstLineIndent=-size,
        )

        # Table of contents styles
        self._upsert_style(
            'TOCHeading',
            parent=self.styles['Heading1'],
            fontSize=int(size * 1.5),
            spaceAfter=size * 1.5,
        )
        for level in range(1, 4):
            self._upsert_style(
                f'TOC{level}',
                parent=normal,
                leftIndent=size * (level - 1) * 2,
                fontSize=size - (level - 1),
                leading=size * 1.4,
            )

    def convert_file(self, md_file_path: str) -> None:
        """
        Convert a single markdown file to PDF.

        Args:
            md_file_path: Path to the markdown file
        """
        with open(md_file_path, 'r', encoding='utf-8') as f:
            md_content = f.read()

        self.convert_content(md_content)

    def convert_content(self, md_content: str) -> None:
        """
        Convert markdown content string to PDF.

        Args:
            md_content: Markdown content as a string
        """
        # Start from a clean slate so a converter instance can be reused
        # without the previous document leaking into the next one.
        self.elements = []
        self.toc_entries = []

        html_content = self._md_to_html(md_content)
        self._html_to_elements(html_content)
        self._generate_pdf()

        logger.info(f"PDF created at {self.output_path}")

    def convert_multiple_files(self, md_file_paths: List[str],
                               merge: bool = True,
                               separate_toc: bool = False) -> None:
        """
        Convert multiple markdown files to PDF.

        Args:
            md_file_paths: List of paths to markdown files
            merge: Whether to merge all files into a single PDF
            separate_toc: Whether to include a separate TOC for each file
                (only used when ``merge`` is False)
        """
        if not md_file_paths:
            logger.warning("No markdown files given; nothing to convert.")
            return

        if merge:
            documents = []
            for file_path in md_file_paths:
                logger.info(f"Processing {file_path}")
                with open(file_path, 'r', encoding='utf-8') as f:
                    documents.append((file_path, f.read()))

            self.convert_content(self._combine_documents(documents))
            return

        # Process each file separately; every PDF is written next to its source.
        for file_path in md_file_paths:
            converter = MarkdownToPDFConverter(
                output_path=f"{os.path.splitext(file_path)[0]}.pdf",
                page_size=self.page_size,  # resolved tuple, accepted by __init__
                margins=self.margins,
                font_name=self.font_name,
                base_font_size=self.base_font_size,
                heading_scale=self.heading_scale,
                include_toc=separate_toc,
                code_style=self.code_style
            )
            converter.convert_file(file_path)

    def _md_to_html(self, md_content: str) -> str:
        """
        Convert markdown content to HTML.

        Args:
            md_content: Markdown content

        Returns:
            HTML content
        """
        extensions = [
            'markdown.extensions.extra',
            'markdown.extensions.smarty',
            TableExtension(),
            FencedCodeExtension(),
        ]
        if self.include_toc:
            extensions.append(TocExtension(toc_depth=3))

        return markdown.markdown(md_content, extensions=extensions)

    def _html_to_elements(self, html_content: str) -> None:
        """
        Convert HTML content to ReportLab elements.

        Args:
            html_content: HTML content
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Only tags are processed; bare strings between blocks have no name.
        for element in soup.children:
            if element.name:
                self._process_element(element)

    def _process_element(self, element: Any) -> None:
        """
        Process a block-level HTML element and append the matching flowables.

        Args:
            element: BeautifulSoup tag
        """
        tag = element.name

        if tag in self._HEADING_TAGS:
            level = int(tag[1])
            if self.include_toc:
                # Plain text is stored; it is escaped when the TOC is built.
                self.toc_entries.append((level, element.get_text()))
            self.elements.append(
                Paragraph(
                    self._process_inline_elements(element),
                    self.styles[f'Heading{level}'],
                )
            )

        elif tag == 'p':
            # Markdown wraps images in <p>, so they have to be picked up here.
            # They are emitted before the paragraph's text.
            images = element.find_all('img')
            for image in images:
                self.elements.append(self._image_flowable(image))
            text = self._process_inline_elements(element)
            if text.strip() or not images:
                self.elements.append(Paragraph(text, self.styles['Normal']))

        elif tag == 'pre':
            # Drop the newline that closes the block so no empty last line
            # is drawn.
            code = element.get_text().rstrip('\n')
            self.elements.append(Preformatted(code, self.styles['CodeBlock']))

        elif tag == 'img':
            self.elements.append(self._image_flowable(element))

        elif tag in ('ul', 'ol'):
            self.elements.append(self._build_list(element))

        elif tag == 'table':
            self._process_table(element)

        elif tag == 'div' and 'page-break' in element.get('class', []):
            self.elements.append(PageBreak())

        elif tag == 'hr':
            self.elements.append(Spacer(1, 0.25 * inch))

        # Process children for container elements
        elif tag in self._CONTAINER_TAGS:
            for child in element.children:
                if getattr(child, 'name', None):
                    self._process_element(child)

    def _image_flowable(self, element: Any) -> Any:
        """
        Build an Image flowable for an ``<img>`` tag.

        Supports http(s) URLs, ``data:image`` URIs and local paths. If the
        image cannot be loaded, a ``[Image: alt]`` placeholder paragraph is
        returned instead.
        """
        src = element.get('src', '')
        alt = element.get('alt', 'Image')

        try:
            if src.startswith(('http://', 'https://')):
                response = requests.get(src, timeout=self._IMAGE_TIMEOUT)
                response.raise_for_status()
                data = response.content
            elif src.startswith('data:image'):
                data = base64.b64decode(src.split(',', 1)[1])
            else:
                with open(src, 'rb') as f:
                    data = f.read()

            width, height = self._scaled_image_size(data)
            return Image(io.BytesIO(data), width=width, height=height)
        except (requests.RequestException, OSError, ValueError, IndexError) as exc:
            logger.warning(f"Could not load image '{alt}': {exc}")
            return Paragraph(
                f"[Image: {html.escape(alt, quote=False)}]",
                self.styles['Normal'],
            )

    def _scaled_image_size(self, data: bytes) -> Tuple[float, float]:
        """
        Return a (width, height) in points that preserves the aspect ratio.

        Pixel dimensions are used as points and only ever scaled down, so the
        image fits the text area and is never wider than the configured
        maximum.
        """
        with PilImage.open(io.BytesIO(data)) as picture:
            width, height = picture.size
        if width <= 0 or height <= 0:
            raise ValueError("image has no drawable area")

        left, right, top, bottom = self.margins
        page_width, page_height = self.page_size
        # 12pt accounts for the default frame padding on both sides.
        max_width = min(
            self._MAX_IMAGE_WIDTH_INCHES * inch,
            page_width - (left + right) * inch - 12,
        )
        max_height = page_height - (top + bottom) * inch - 12

        scale = min(1.0, max_width / width, max_height / height)
        return (width * scale, height * scale)

    def _build_list(self, element: Any) -> Any:
        """
        Build a ListFlowable for a ``<ul>``/``<ol>`` tag.

        Ordered lists are numbered ("1.", "2.", ...). A list nested in an
        ``<li>`` is added as its own ListFlowable right after that item.
        """
        entries = []
        for item in element.find_all('li', recursive=False):
            text = self._render_children(item, skip_lists=True)
            entries.append(
                ListItem(
                    Paragraph(text, self.styles['ListItem']),
                    leftIndent=20
                )
            )
            for nested in item.find_all(['ul', 'ol'], recursive=False):
                entries.append(self._build_list(nested))

        if element.name == 'ol':
            return ListFlowable(entries, bulletType='1', bulletFormat='%s.')
        return ListFlowable(entries, bulletType='bullet')

    def _process_inline_elements(self, element: Any) -> str:
        """
        Render the inline content of an element as ReportLab paragraph markup.

        Text is XML-escaped; bold, italic, underline, strike-through, inline
        code, links and line breaks are kept as ReportLab tags. Any other tag
        contributes only its content.

        Args:
            element: BeautifulSoup tag whose children are rendered

        Returns:
            Formatted text with ReportLab markup
        """
        return self._render_children(element)

    def _render_children(self, element: Any, skip_lists: bool = False) -> str:
        """Concatenate the rendered markup of all children of ``element``."""
        return "".join(
            self._render_inline(child, skip_lists) for child in element.children
        )

    def _render_inline(self, node: Any, skip_lists: bool = False) -> str:
        """Render one node (string or tag) as ReportLab paragraph markup."""
        if isinstance(node, str):  # bs4 string nodes subclass str
            if type(node).__name__ in self._NON_TEXT_NODES:
                return ""
            # Escape so literal '<' and '&' cannot break the paragraph parser.
            return html.escape(str(node), quote=False)

        tag = getattr(node, 'name', None)
        if tag == 'img':
            return ""  # images are block flowables, handled by the caller
        if skip_lists and tag in ('ul', 'ol'):
            return ""  # nested lists are built separately
        if tag == 'br':
            return "<br/>"

        inner = self._render_children(node, skip_lists)
        if tag == 'a':
            href = node.get('href', '')
            if not href:
                return inner
            return f'<link href="{html.escape(href, quote=True)}">{inner}</link>'

        opening, closing = self._INLINE_MARKUP.get(tag, ("", ""))
        return f"{opening}{inner}{closing}"

    def _process_table(self, table_element: Any) -> None:
        """
        Process an HTML table into a ReportLab Table.

        Args:
            table_element: BeautifulSoup table tag
        """
        normal = self.styles['Normal']
        rows = []

        # Extract header row
        thead = table_element.find('thead')
        if thead:
            rows.append([
                Paragraph(f"<b>{self._process_inline_elements(th)}</b>", normal)
                for th in thead.find_all(['th'])
            ])

        # Extract body rows
        tbody = table_element.find('tbody') or table_element
        for tr in tbody.find_all('tr'):
            if tr.parent.name == 'thead':
                continue  # Skip header rows already processed

            row_cells = []
            for cell in tr.find_all(['td', 'th']):
                text = self._process_inline_elements(cell)
                if cell.name == 'th':
                    text = f"<b>{text}</b>"  # Headers are bold
                row_cells.append(Paragraph(text, normal))

            if row_cells:  # Only add non-empty rows
                rows.append(row_cells)

        if not rows:
            return

        # Pad short rows with empty cells so every row has the same length.
        column_count = max(len(row) for row in rows)
        for row in rows:
            row.extend([""] * (column_count - len(row)))

        table = Table(rows, colWidths=[None] * column_count)  # auto widths

        # Add basic grid and header styling
        table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.Color(0.7, 0.7, 0.7)),
            ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.8, 0.8, 0.8)),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
            ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), self._bold_font_name(self.font_name)),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
            ('TOPPADDING', (0, 0), (-1, 0), 8),
            ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
            ('TOPPADDING', (0, 1), (-1, -1), 6),
        ]))

        self.elements.append(table)

        # Add some space after the table
        self.elements.append(Spacer(1, 0.1 * inch))

    def _generate_toc(self) -> None:
        """Insert a table of contents at the front of the document."""
        if not self.toc_entries:
            return

        toc = [
            Paragraph("Table of Contents", self.styles['TOCHeading']),
            Spacer(1, 0.2 * inch),
        ]
        for level, text in self.toc_entries:
            if level <= 3:  # Only include headings up to level 3
                toc.append(
                    Paragraph(
                        html.escape(text, quote=False),
                        self.styles[f'TOC{level}'],
                    )
                )
        toc.append(PageBreak())

        # Prepend, so the TOC comes before the content it lists.
        self.elements[:0] = toc

    def _generate_pdf(self) -> None:
        """Generate the PDF document."""
        output_dir = os.path.dirname(self.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        left, right, top, bottom = self.margins
        doc = SimpleDocTemplate(
            self.output_path,
            pagesize=self.page_size,
            leftMargin=left * inch,
            rightMargin=right * inch,
            topMargin=top * inch,
            bottomMargin=bottom * inch
        )

        # Add TOC if requested
        if self.include_toc and self.toc_entries:
            self._generate_toc()

        # Build from a copy so self.elements stays intact afterwards.
        doc.build(list(self.elements))
585
+
586
+
587
class MarkdownToPDFAgent:
    """
    AI Agent to convert Markdown files to PDF with enhanced formatting.

    Wraps ``MarkdownToPDFConverter`` and can optionally run the Markdown
    through an LLM before conversion.
    """

    # A reply line counts as chatty preamble when it starts with one of these
    # whole words ("I've" and "Here's" match; "In", "Installation", "Then"
    # do not).
    _PREAMBLE = re.compile(r"(?:I|Here|The)\b")

    # Opening fence lines that mark a reply wrapped as a Markdown document.
    _LABELLED_OPENERS = ("```markdown", "```md")

    def __init__(self, llm=None):
        """
        Initialize the agent with optional LLM for content enhancement.

        Args:
            llm: Optional language model for content enhancement
        """
        self.llm = llm
        self.converter = MarkdownToPDFConverter()

        # Try to set up Gemini as the default LLM if no LLM is provided
        if not self.llm:
            self.setup_from_gemini()

    def setup_from_openai(self, api_key=None, model: str = "gpt-4"):
        """
        Setup agent with OpenAI LLM.

        Args:
            api_key: OpenAI API key (will use env var if not provided)
            model: OpenAI chat model name

        Returns:
            True if the LLM was configured, False otherwise
        """
        try:
            from langchain_openai import ChatOpenAI
        except ImportError:
            logger.warning("LangChain OpenAI package not found. Install with 'pip install langchain-openai'")
            return False

        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            logger.warning("No OpenAI API key provided. Agent will run without LLM enhancement.")
            return False

        self.llm = ChatOpenAI(
            model=model,
            temperature=0.1,
            api_key=api_key
        )
        return True

    def setup_from_gemini(self, api_key=None, model: str = "gemini-1.5-flash"):
        """
        Setup agent with Google Gemini LLM.

        Args:
            api_key: Google Gemini API key (will use env var if not provided)
            model: Gemini chat model name

        Returns:
            True if the LLM was configured, False otherwise
        """
        try:
            from langchain_google_genai import ChatGoogleGenerativeAI
        except ImportError:
            logger.warning("LangChain Google Generative AI package not found. Install with 'pip install langchain-google-genai'")
            return False

        api_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not api_key:
            logger.warning("No Google API key provided. Agent will run without LLM enhancement.")
            return False

        try:
            self.llm = ChatGoogleGenerativeAI(
                model=model,
                temperature=0.1,
                google_api_key=api_key,
                convert_system_message_to_human=True
            )
        except Exception as e:
            # Construction failures must not abort agent creation.
            logger.error(f"Error setting up Google Gemini LLM: {str(e)}")
            return False

        logger.info("Successfully set up Google Gemini LLM")
        return True

    @staticmethod
    def _build_llm_input(prompt: str) -> Any:
        """
        Wrap the prompt in a HumanMessage when LangChain provides one.

        Falls back to the plain prompt string if neither import location is
        available.
        """
        try:
            from langchain_core.messages import HumanMessage
        except ImportError:
            try:
                from langchain.schema import HumanMessage
            except ImportError:
                return prompt
        return [HumanMessage(content=prompt)]

    def enhance_markdown(self, content: str, instructions: str = None) -> str:
        """
        Enhance markdown content using LLM if available.

        Never raises for LLM problems: on any failure, or when the reply is
        not usable text, the original content is returned.

        Args:
            content: Original markdown content
            instructions: Specific enhancement instructions

        Returns:
            Enhanced markdown content
        """
        if not self.llm:
            logger.warning("No LLM available for enhancement. Returning original content.")
            return content

        default_instructions = """
        Enhance this markdown content while preserving its structure and meaning.
        Make the following improvements:
        1. Fix any grammar or spelling issues
        2. Improve formatting for better readability
        3. Ensure proper markdown syntax is used
        4. Add appropriate section headings if missing
        5. Keep the content factually identical to the original
        """

        instructions = instructions or default_instructions
        prompt = f"{instructions}\n\nOriginal content:\n\n{content}\n\nPlease provide the enhanced markdown content:"

        try:
            logger.info(f"Using LLM type: {type(self.llm).__name__}")
            response = self.llm.invoke(self._build_llm_input(prompt))
            result = getattr(response, "content", response)
            logger.info("Successfully received response from LLM")
        except Exception as e:
            logger.error(f"Error invoking LLM: {str(e)}")
            return content

        # Some providers return structured (non-string) content.
        if not isinstance(result, str) or not result.strip():
            logger.error("Error enhancing markdown: LLM returned no usable text")
            return content

        # Clean up the result (extract just the markdown part); never replace
        # the document with an empty string.
        return self._clean_agent_output(result) or content

    def _unwrap_fence(self, lines: List[str], start: int) -> Optional[str]:
        """
        Return the document inside a wrapping code fence, or None.

        ``lines[start]`` must be the opening fence. A fence labelled
        ``markdown``/``md`` may be followed by trailing commentary; a bare
        fence only counts as a wrapper when its closing fence is the last
        non-blank line. In both cases the fences in between must pair up,
        otherwise the outer fences belong to separate code blocks.
        """
        opener = lines[start].strip().lower()
        labelled = opener in self._LABELLED_OPENERS
        if not labelled and opener != "```":
            return None

        closers = [
            index for index in range(len(lines) - 1, start, -1)
            if lines[index].strip() == "```"
        ]
        if not labelled:
            last_text = max(i for i, line in enumerate(lines) if line.strip())
            closers = [index for index in closers if index == last_text]

        for end in closers:
            inner = lines[start + 1:end]
            fences = sum(1 for line in inner if line.lstrip().startswith("```"))
            if fences % 2 == 0:
                return "\n".join(inner).strip()
        return None

    def _clean_agent_output(self, output: str) -> str:
        """
        Clean up agent output to extract just the markdown content.

        If the reply (after any chatty preamble) is wrapped in a code fence,
        the wrapped document is returned with its own code blocks intact.
        Otherwise leading preamble lines are dropped and the rest is returned
        unchanged.

        Args:
            output: Raw agent output

        Returns:
            Cleaned markdown content
        """
        lines = output.split('\n')

        # First line that is neither blank nor preamble: the only place a
        # wrapping fence can start.
        body_start = next(
            (
                index for index, line in enumerate(lines)
                if line.strip() and not self._PREAMBLE.match(line)
            ),
            None,
        )
        if body_start is not None:
            unwrapped = self._unwrap_fence(lines, body_start)
            if unwrapped is not None:
                return unwrapped

        # No wrapper: keep everything from the first non-preamble line on.
        for index, line in enumerate(lines):
            if not self._PREAMBLE.match(line):
                return '\n'.join(lines[index:])
        return ''

    def process_file(self, input_path: str, output_path: str = None, enhance: bool = False,
                     enhancement_instructions: str = None, page_size: str = "A4") -> Optional[str]:
        """
        Process a single markdown file and convert it to PDF.

        Args:
            input_path: Path to input markdown file
            output_path: Path for output PDF (defaults to input path with .pdf extension)
            enhance: Whether to enhance the content with LLM
            enhancement_instructions: Specific instructions for enhancement
            page_size: Page size for the PDF ("A4" or "letter")

        Returns:
            Path to the generated PDF, or None if the input file is missing
        """
        # Validate input file
        if not os.path.exists(input_path):
            logger.error(f"Input file not found: {input_path}")
            return None

        # Set default output path if not provided
        if not output_path:
            output_path = os.path.splitext(input_path)[0] + ".pdf"

        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Enhance content if requested
        if enhance and self.llm:
            logger.info(f"Enhancing content for {input_path}")
            content = self.enhance_markdown(content, enhancement_instructions)

        # A fresh converter per document keeps output path and state separate.
        self.converter = MarkdownToPDFConverter(
            output_path=output_path,
            page_size=page_size
        )

        logger.info(f"Converting {input_path} to PDF")
        self.converter.convert_content(content)

        return output_path

    def process_directory(self, input_dir: str, output_dir: str = None, pattern: str = "*.md",
                          enhance: bool = False, merge: bool = False,
                          output_filename: str = "merged_document.pdf",
                          page_size: str = "A4") -> List[str]:
        """
        Process all markdown files in a directory.

        Args:
            input_dir: Path to input directory
            output_dir: Path to output directory (defaults to input directory)
            pattern: Glob pattern for markdown files
            enhance: Whether to enhance content with LLM
            merge: Whether to merge all files into a single PDF
            output_filename: Filename for merged PDF
            page_size: Page size for the PDF ("A4" or "letter")

        Returns:
            List of paths to generated PDFs
        """
        # Validate input directory
        if not os.path.isdir(input_dir):
            logger.error(f"Input directory not found: {input_dir}")
            return []

        # Set default output directory if not provided
        if not output_dir:
            output_dir = input_dir
        else:
            os.makedirs(output_dir, exist_ok=True)

        # Sorted so the result does not depend on directory listing order.
        md_files = sorted(glob.glob(os.path.join(input_dir, pattern)))

        if not md_files:
            logger.warning(f"No markdown files found in {input_dir} with pattern {pattern}")
            return []

        if merge:
            output_path = os.path.join(output_dir, output_filename)
            self._merge_files(md_files, output_path, enhance, page_size)
            return [output_path]

        # Process each file individually
        output_files = []
        for md_file in md_files:
            pdf_name = os.path.splitext(os.path.basename(md_file))[0] + ".pdf"
            processed_file = self.process_file(
                md_file,
                os.path.join(output_dir, pdf_name),
                enhance=enhance,
                page_size=page_size
            )
            if processed_file:
                output_files.append(processed_file)

        return output_files

    def _merge_files(self, md_files: List[str], output_path: str,
                     enhance: bool, page_size: str) -> None:
        """
        Merge several markdown files into one PDF at ``output_path``.

        With more than one file, each gets an H1 built from its file name.
        Files are enhanced one by one when requested and an LLM is
        configured, and a page-break marker separates consecutive files.
        """
        logger.info(f"Merging {len(md_files)} markdown files into a single PDF")
        use_llm = bool(enhance and self.llm)
        add_titles = len(md_files) > 1

        sections = []
        for md_file in md_files:
            logger.info(f"Processing {md_file}")
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            if add_titles:
                file_name = os.path.splitext(os.path.basename(md_file))[0]
                content = f"# {file_name}\n\n{content}"

            if use_llm:
                logger.info(f"Enhancing content for {md_file}")
                content = self.enhance_markdown(content)

            sections.append(content)

        merged_content = "\n\n<div class='page-break'></div>\n\n".join(sections)

        self.converter = MarkdownToPDFConverter(
            output_path=output_path,
            page_size=page_size
        )
        self.converter.convert_content(merged_content)
887
+
888
+
889
def _resolve_directory_output(output: Optional[str], merge: bool) -> Tuple[Optional[str], str]:
    """
    Split the ``--output`` value into (output directory, merged file name).

    In merge mode, an output ending in ``.pdf`` is a file path: its directory
    part (or "." if it has none) is the output directory and its base name
    the merged file name. Anything else is taken as a directory and the
    default merged file name is used.
    """
    default_name = "merged_document.pdf"
    if merge and output and output.lower().endswith(".pdf"):
        return os.path.dirname(output) or ".", os.path.basename(output)
    return output, default_name


def main():
    """
    Main function for command-line usage.

    Returns:
        0 if at least one PDF was generated, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Convert Markdown files to PDF")

    # Input arguments
    parser.add_argument("input", help="Input markdown file or directory")
    parser.add_argument("-o", "--output", help="Output PDF file or directory")
    parser.add_argument("-p", "--pattern", default="*.md", help="File pattern for markdown files in directory mode")

    # Options
    parser.add_argument("--enhance", action="store_true", help="Enhance markdown content using LLM")
    parser.add_argument("--merge", action="store_true", help="Merge multiple markdown files into a single PDF")
    parser.add_argument("--page-size", choices=["A4", "letter"], default="A4", help="Page size (A4 or letter)")
    parser.add_argument("--toc", action="store_true", default=True, help="Include table of contents")
    parser.add_argument("--no-toc", action="store_false", dest="toc", help="Exclude table of contents")
    parser.add_argument("--font-size", type=int, default=10, help="Base font size in points")
    parser.add_argument("--margins", type=float, nargs=4, default=(0.75, 0.75, 0.75, 0.75),
                        metavar=("LEFT", "RIGHT", "TOP", "BOTTOM"),
                        help="Page margins in inches (left right top bottom)")

    # LLM options
    parser.add_argument("--llm", choices=["openai", "gemini", "none"], default="none",
                        help="LLM provider for content enhancement")
    parser.add_argument("--api-key", help="API key for LLM provider (will use env var if not provided)")

    args = parser.parse_args()

    # The agent's process_* methods take no TOC, font-size or margin
    # parameters, so say so instead of silently ignoring the options.
    ignored = []
    if not args.toc:
        ignored.append("--no-toc")
    if args.font_size != 10:
        ignored.append("--font-size")
    if tuple(args.margins) != (0.75, 0.75, 0.75, 0.75):
        ignored.append("--margins")
    if ignored:
        logger.warning(f"Options not applied by this version (defaults are used): {', '.join(ignored)}")

    # Initialize agent
    agent = MarkdownToPDFAgent()

    # Setup LLM if requested
    if args.enhance and args.llm != "none":
        if args.llm == "openai":
            if not agent.setup_from_openai(args.api_key):
                logger.warning("Could not initialize OpenAI LLM. Enhancement disabled.")
                args.enhance = False
        elif args.llm == "gemini":
            if not agent.setup_from_gemini(args.api_key):
                logger.warning("Could not initialize Gemini LLM. Enhancement disabled.")
                args.enhance = False

    # Process input
    if os.path.isdir(args.input):
        # Directory mode
        output_dir, output_filename = _resolve_directory_output(args.output, args.merge)
        output_files = agent.process_directory(
            args.input,
            output_dir,
            args.pattern,
            enhance=args.enhance,
            merge=args.merge,
            output_filename=output_filename,
            page_size=args.page_size
        )

        if not output_files:
            logger.error("No PDFs were generated.")
            return 1

        logger.info(f"Generated {len(output_files)} PDF files:")
        for output_file in output_files:
            logger.info(f"  - {output_file}")
        return 0

    # Single file mode
    output_file = agent.process_file(
        args.input,
        args.output,
        enhance=args.enhance,
        page_size=args.page_size
    )

    if not output_file:
        logger.error("PDF generation failed.")
        return 1

    logger.info(f"Generated PDF: {output_file}")
    return 0


if __name__ == "__main__":
    # Propagate the result so shells and CI can detect a failed run.
    sys.exit(main())