Create appgradio.py

#5
by Prak2005 - opened
Files changed (1) hide show
  1. appgradio.py +1052 -0
appgradio.py ADDED
@@ -0,0 +1,1052 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import glob
5
+ import logging
6
+ import gradio as gr
7
+ from typing import List, Dict, Any, Optional, Tuple
8
+ from bs4 import BeautifulSoup
9
+ import markdown
10
+ from markdown.extensions.tables import TableExtension
11
+ from markdown.extensions.fenced_code import FencedCodeExtension
12
+ from markdown.extensions.toc import TocExtension
13
+ from reportlab.lib.pagesizes import letter, A4
14
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
15
+ from reportlab.lib.units import inch
16
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Preformatted, ListFlowable, ListItem
17
+ from reportlab.lib.colors import HexColor, black, grey
18
+ from reportlab.lib import colors
19
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
20
+ import html
21
+ import base64
22
+ import requests
23
+ from PIL import Image as PilImage
24
+ import io
25
+ import tempfile
26
+ from datetime import datetime
27
+
28
# Configure logging when the module is imported: each record shows a timestamp,
# the emitting logger's name, the severity and the message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
34
+
35
class MarkdownToPDFConverter:
    """
    Convert Markdown content to PDF using ReportLab.

    Markdown is rendered to HTML with python-markdown, the HTML is walked with
    BeautifulSoup and translated into ReportLab flowables, and the flowables
    are laid out by a ``SimpleDocTemplate``.
    """

    # ReportLab's standard fonts do not name their bold faces uniformly
    # ("Times-Roman" pairs with "Times-Bold"), so appending "-Bold" to the
    # base name is only used as a fallback for fonts not listed here.
    _BOLD_FONTS = {
        "Helvetica": "Helvetica-Bold",
        "Times-Roman": "Times-Bold",
        "Courier": "Courier-Bold",
    }

    # HTML inline tag -> (opening, closing) ReportLab paragraph markup.
    _INLINE_MARKUP = {
        "strong": ("<b>", "</b>"),
        "b": ("<b>", "</b>"),
        "em": ("<i>", "</i>"),
        "i": ("<i>", "</i>"),
        "u": ("<u>", "</u>"),
        "strike": ("<strike>", "</strike>"),
        "del": ("<strike>", "</strike>"),
        "code": ('<font name="Courier">', "</font>"),
    }

    # Only hrefs with one of these prefixes are emitted as links. ReportLab
    # treats "#anchor" and scheme-less hrefs as in-document destinations, and
    # an undefined destination aborts the build.
    _EXTERNAL_LINK_PREFIXES = ("http://", "https://", "mailto:")

    # Seconds to wait for a remote image before falling back to a placeholder.
    _IMAGE_TIMEOUT = 10

    def __init__(
        self,
        output_path: str = "output.pdf",
        page_size: str = "A4",
        margins: Tuple[float, float, float, float] = (0.75, 0.75, 0.75, 0.75),
        font_name: str = "Helvetica",
        base_font_size: int = 10,
        heading_scale: Optional[Dict[int, float]] = None,
        include_toc: bool = True,
        code_style: str = "github"
    ):
        """
        Initialize the converter with configuration options.

        Args:
            output_path: Path to save the PDF
            page_size: Page size name ("A4", anything else selects letter), or
                an already resolved ``(width, height)`` tuple in points
            margins: Tuple of margins (left, right, top, bottom) in inches
            font_name: Base font name to use
            base_font_size: Base font size in points
            heading_scale: Dictionary of heading levels to font size multipliers
            include_toc: Whether to include a table of contents
            code_style: Style to use for code blocks (stored, currently unused)
        """
        self.output_path = output_path
        # convert_multiple_files() forwards self.page_size, which is already a
        # (width, height) tuple, so both forms have to be accepted here.
        if isinstance(page_size, str):
            self.page_size = A4 if page_size.upper() == "A4" else letter
        else:
            self.page_size = page_size
        self.margins = margins
        self.font_name = font_name
        self.bold_font_name = self._BOLD_FONTS.get(font_name, f"{font_name}-Bold")
        self.base_font_size = base_font_size
        self.heading_scale = heading_scale or {
            1: 2.0,  # H1 is 2.0x base font size
            2: 1.7,  # H2 is 1.7x base font size
            3: 1.4,  # H3 is 1.4x base font size
            4: 1.2,  # H4 is 1.2x base font size
            5: 1.1,  # H5 is 1.1x base font size
            6: 1.0   # H6 is 1.0x base font size
        }
        self.include_toc = include_toc
        self.code_style = code_style

        # Initialize styles
        self.styles = getSampleStyleSheet()
        self._setup_styles()

        # Initialize document elements
        self.elements = []
        self.toc_entries = []

    def _setup_styles(self) -> None:
        """Set up custom paragraph styles for the document."""
        size = self.base_font_size

        # Modify existing Normal style
        normal = self.styles['Normal']
        normal.fontName = self.font_name
        normal.fontSize = size
        normal.leading = size * 1.2
        normal.spaceAfter = size * 0.8

        # Heading styles: update the stylesheet's own entry when present,
        # otherwise register a new one.
        for level in range(1, 7):
            heading_size = size * self.heading_scale.get(level, 1.0)
            attrs = {
                'fontName': self.bold_font_name,
                'fontSize': int(heading_size),
                'leading': int(heading_size * 1.2),
                'spaceAfter': size,
                'spaceBefore': size * (1 + (0.2 * (7 - level))),
            }
            heading_name = f'Heading{level}'
            if heading_name in self.styles:
                style = self.styles[heading_name]
                style.parent = normal
                for attr, value in attrs.items():
                    setattr(style, attr, value)
            else:
                self.styles.add(
                    ParagraphStyle(name=heading_name, parent=normal, **attrs)
                )

        # Code block style
        self.styles.add(
            ParagraphStyle(
                name='CodeBlock',
                fontName='Courier',
                fontSize=size * 0.9,
                leading=size * 1.1,
                spaceAfter=size,
                spaceBefore=size,
                leftIndent=size,
                backgroundColor=HexColor('#EEEEEE'),
                borderWidth=0,
                borderPadding=size * 0.5,
            )
        )

        # List item style
        self.styles.add(
            ParagraphStyle(
                name='ListItem',
                parent=normal,
                leftIndent=size * 2,
                firstLineIndent=-size,
            )
        )

        # Table of contents styles
        self.styles.add(
            ParagraphStyle(
                name='TOCHeading',
                parent=self.styles['Heading1'],
                fontSize=int(size * 1.5),
                spaceAfter=size * 1.5,
            )
        )

        for level in range(1, 4):  # Create styles for TOC levels
            self.styles.add(
                ParagraphStyle(
                    name=f'TOC{level}',
                    parent=normal,
                    leftIndent=size * (level - 1) * 2,
                    fontSize=size - (level - 1),
                    leading=size * 1.4,
                )
            )

    def convert_file(self, md_file_path: str) -> None:
        """
        Convert a single markdown file to PDF.

        Args:
            md_file_path: Path to the markdown file
        """
        with open(md_file_path, 'r', encoding='utf-8') as f:
            md_content = f.read()

        self.convert_content(md_content)

    def convert_content(self, md_content: str) -> None:
        """
        Convert markdown content string to PDF.

        Args:
            md_content: Markdown content as a string
        """
        # Start from a clean slate so that reusing one converter instance does
        # not repeat the previous document's flowables and TOC entries.
        self.elements = []
        self.toc_entries = []

        html_content = self._md_to_html(md_content)
        self._html_to_elements(html_content)
        self._generate_pdf()

        logger.info(f"PDF created at {self.output_path}")

    def convert_multiple_files(self, md_file_paths: List[str],
                               merge: bool = True,
                               separate_toc: bool = False) -> None:
        """
        Convert multiple markdown files to PDF.

        Args:
            md_file_paths: List of paths to markdown files
            merge: Whether to merge all files into a single PDF
            separate_toc: Whether to include a separate TOC for each file
                (only used when ``merge`` is False)
        """
        if merge:
            all_content = []

            for file_path in md_file_paths:
                logger.info(f"Processing {file_path}")
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Add file name as heading if more than one file
                if len(md_file_paths) > 1:
                    file_name = os.path.splitext(os.path.basename(file_path))[0]
                    content = f"# {file_name}\n\n{content}"

                # Add page break between files
                if all_content:
                    all_content.append("\n\n<div class='page-break'></div>\n\n")

                all_content.append(content)

            self.convert_content("\n".join(all_content))
        else:
            # Each file gets its own converter (and therefore its own PDF next
            # to the source file) configured like this one.
            for file_path in md_file_paths:
                converter = MarkdownToPDFConverter(
                    output_path=f"{os.path.splitext(file_path)[0]}.pdf",
                    page_size=self.page_size,
                    margins=self.margins,
                    font_name=self.font_name,
                    base_font_size=self.base_font_size,
                    heading_scale=self.heading_scale,
                    include_toc=separate_toc,
                    code_style=self.code_style
                )
                converter.convert_file(file_path)

    def _md_to_html(self, md_content: str) -> str:
        """
        Convert markdown content to HTML.

        Args:
            md_content: Markdown content

        Returns:
            HTML content
        """
        extensions = [
            'markdown.extensions.extra',
            'markdown.extensions.smarty',
            TableExtension(),
            FencedCodeExtension(),
        ]
        if self.include_toc:
            extensions.append(TocExtension(toc_depth=3))

        return markdown.markdown(md_content, extensions=extensions)

    def _html_to_elements(self, html_content: str) -> None:
        """
        Convert HTML content to ReportLab elements.

        Args:
            html_content: HTML content
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Text nodes between block elements have no tag name and are skipped.
        for element in soup.children:
            if element.name:
                self._process_element(element)

    def _process_element(self, element: BeautifulSoup) -> None:
        """
        Process an HTML element and convert it to ReportLab elements.

        Args:
            element: BeautifulSoup element
        """
        tag = element.name

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            level = int(tag[1])
            text = element.get_text()

            if self.include_toc:
                self.toc_entries.append((level, text))

            # Paragraph parses its text as markup, so literal '&' and '<'
            # must be escaped.
            self.elements.append(
                Paragraph(html.escape(text, quote=False), self.styles[f'Heading{level}'])
            )

        elif tag == 'p':
            text = self._process_inline_elements(element)
            if text.strip():
                self.elements.append(Paragraph(text, self.styles['Normal']))
            # python-markdown wraps images in <p>, so they would never reach
            # the 'img' branch below on their own.
            for img in element.find_all('img'):
                self._process_image(img)

        elif tag == 'pre':
            self.elements.append(
                Preformatted(element.get_text(), self.styles['CodeBlock'])
            )

        elif tag == 'img':
            self._process_image(element)

        elif tag in ('ul', 'ol'):
            self._process_list(element)

        elif tag == 'table':
            self._process_table(element)

        elif tag == 'div' and 'page-break' in element.get('class', []):
            self.elements.append(PageBreak())

        elif tag == 'hr':
            self.elements.append(Spacer(1, 0.25*inch))

        # Process children for container elements
        elif tag in ('div', 'blockquote', 'section', 'article'):
            for child in element.children:
                if getattr(child, 'name', None):
                    self._process_element(child)

    def _process_image(self, element: BeautifulSoup) -> None:
        """
        Append an image flowable for an <img> element, or a text placeholder
        when the image cannot be loaded.

        Args:
            element: BeautifulSoup <img> element
        """
        src = element.get('src', '')
        alt = element.get('alt', 'Image')

        flowable = None
        try:
            stream = self._load_image_stream(src)
            if stream is not None:
                flowable = self._build_image(stream)
        except Exception as exc:
            # Best effort: a broken image must not abort the whole document.
            logger.warning(f"Could not load image {src[:80]!r}: {exc}")

        if flowable is None:
            flowable = Paragraph(
                f"[Image: {html.escape(alt, quote=False)}]", self.styles['Normal']
            )
        self.elements.append(flowable)

    def _load_image_stream(self, src: str):
        """
        Read the raw bytes of an image reference into memory.

        Args:
            src: Remote URL, ``data:image`` URI or local file path

        Returns:
            ``io.BytesIO`` with the image bytes, or None if a local file does
            not exist
        """
        if src.startswith('http'):
            response = requests.get(src, timeout=self._IMAGE_TIMEOUT)
            response.raise_for_status()
            return io.BytesIO(response.content)

        if src.startswith('data:image'):
            b64_data = src.split(',', 1)[1]
            return io.BytesIO(base64.b64decode(b64_data))

        if os.path.exists(src):
            with open(src, 'rb') as f:
                return io.BytesIO(f.read())
        return None

    def _build_image(self, stream) -> Image:
        """
        Create an Image flowable scaled to fit the page, keeping aspect ratio.

        Args:
            stream: Seekable binary stream holding the image

        Returns:
            ReportLab Image flowable
        """
        width, height = PilImage.open(stream).size
        if width <= 0 or height <= 0:
            raise ValueError("image has no drawable area")
        stream.seek(0)  # PIL advanced the stream while reading the header

        # Pixels are treated as points. The 12pt allowance keeps the image
        # inside the page frame (assumed to cover the frame padding).
        allowance = 12
        usable_width = self.page_size[0] - (self.margins[0] + self.margins[1]) * inch - allowance
        usable_height = self.page_size[1] - (self.margins[2] + self.margins[3]) * inch - allowance
        max_width = max(1.0, min(6 * inch, usable_width))
        max_height = max(1.0, usable_height)

        scale = min(1.0, max_width / width, max_height / height)
        return Image(stream, width=width * scale, height=height * scale)

    def _process_list(self, element: BeautifulSoup) -> None:
        """
        Convert a <ul> or <ol> element into a ListFlowable.

        Args:
            element: BeautifulSoup list element
        """
        list_items = [
            ListItem(
                Paragraph(self._process_inline_elements(item), self.styles['ListItem']),
                leftIndent=20
            )
            for item in element.find_all('li', recursive=False)
        ]
        if not list_items:
            return

        if element.name == 'ol':
            # '1' is ReportLab's arabic-numeral bullet type.
            flowable = ListFlowable(list_items, bulletType='1', start=1, bulletFormat='%s.')
        else:
            flowable = ListFlowable(list_items, bulletType='bullet')
        self.elements.append(flowable)

    def _process_inline_elements(self, element: BeautifulSoup) -> str:
        """
        Render the children of an element as ReportLab paragraph markup.

        Text is escaped so that literal '&', '<' and '>' survive Paragraph's
        markup parser; bold, italic, underline, strike-through, inline code
        and external links are translated to the corresponding ReportLab tags.
        Other tags contribute only their text.

        Args:
            element: BeautifulSoup element

        Returns:
            Formatted text with ReportLab markup
        """
        # Local import: only the BeautifulSoup class is imported at file level.
        from bs4 import Comment

        parts = []
        for child in element.children:
            tag = getattr(child, 'name', None)

            if tag is None:  # text node
                if not isinstance(child, Comment):
                    parts.append(html.escape(str(child), quote=False))
                continue
            if tag == 'br':
                parts.append('<br/>')
                continue
            if tag == 'img':
                continue  # images are emitted as separate flowables

            inner = self._process_inline_elements(child)
            if tag == 'a':
                href = child.get('href', '')
                if href.lower().startswith(self._EXTERNAL_LINK_PREFIXES):
                    inner = f'<link href="{html.escape(href, quote=True)}">{inner}</link>'
            elif tag in self._INLINE_MARKUP:
                opening, closing = self._INLINE_MARKUP[tag]
                inner = f'{opening}{inner}{closing}'
            parts.append(inner)

        return ''.join(parts)

    def _process_table(self, table_element: BeautifulSoup) -> None:
        """
        Process an HTML table into a ReportLab Table.

        Args:
            table_element: BeautifulSoup table element
        """
        normal = self.styles['Normal']

        def cell_paragraph(cell, bold):
            text = self._process_inline_elements(cell)
            return Paragraph(f"<b>{text}</b>" if bold else text, normal)

        rows = []

        # Extract header row
        thead = table_element.find('thead')
        if thead:
            header_cells = [cell_paragraph(th, True) for th in thead.find_all('th')]
            if header_cells:
                rows.append(header_cells)

        # Extract body rows
        tbody = table_element.find('tbody') or table_element
        for tr in tbody.find_all('tr'):
            if tr.parent.name == 'thead':
                continue  # Skip header rows already processed

            row_cells = [
                cell_paragraph(cell, cell.name == 'th')
                for cell in tr.find_all(['td', 'th'])
            ]
            if row_cells:  # Only add non-empty rows
                rows.append(row_cells)

        if not rows:
            return

        # Pad short rows so hand-written HTML tables with ragged rows still
        # form a rectangular grid.
        column_count = max(len(row) for row in rows)
        for row in rows:
            row.extend([''] * (column_count - len(row)))

        table = Table(rows, colWidths=[None] * column_count)  # Auto width for columns

        # Add basic grid and header styling
        table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.Color(0.7, 0.7, 0.7)),
            ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.8, 0.8, 0.8)),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
            ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), self.bold_font_name),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
            ('TOPPADDING', (0, 0), (-1, 0), 8),
            ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
            ('TOPPADDING', (0, 1), (-1, -1), 6),
        ]))
        self.elements.append(table)

        # Add some space after the table
        self.elements.append(Spacer(1, 0.1*inch))

    def _generate_toc(self) -> None:
        """Insert a table of contents page at the front of the document."""
        if not self.toc_entries:
            return

        toc = [
            Paragraph("Table of Contents", self.styles['TOCHeading']),
            Spacer(1, 0.2*inch),
        ]
        toc.extend(
            Paragraph(html.escape(text, quote=False), self.styles[f'TOC{level}'])
            for level, text in self.toc_entries
            if level <= 3  # Only include headings up to level 3
        )
        toc.append(PageBreak())

        # The content flowables already exist at this point, so the TOC has to
        # be placed in front of them rather than appended.
        self.elements[:0] = toc

    def _generate_pdf(self) -> None:
        """Generate the PDF document."""
        doc = SimpleDocTemplate(
            self.output_path,
            pagesize=self.page_size,
            leftMargin=self.margins[0]*inch,
            rightMargin=self.margins[1]*inch,
            topMargin=self.margins[2]*inch,
            bottomMargin=self.margins[3]*inch
        )

        # Add TOC if requested
        if self.include_toc and self.toc_entries:
            self._generate_toc()

        doc.build(self.elements)
543
+
544
+
545
class MarkdownToPDFAgent:
    """
    AI Agent to convert Markdown files to PDF with enhanced formatting.

    The agent optionally rewrites the markdown with a chat LLM before handing
    it to a ``MarkdownToPDFConverter``.
    """

    # A fence explicitly labelled markdown/md that wraps the document. The
    # greedy body runs to the LAST bare closing fence so that code blocks
    # nested inside the wrapped document are kept.
    _LABELLED_FENCE = re.compile(
        r"^```(?:markdown|md)[ \t]*\r?\n(.*)^```[ \t]*\r?$",
        re.DOTALL | re.MULTILINE,
    )

    # A conversational lead-in line such as "Here is the enhanced markdown:".
    _PREAMBLE = re.compile(
        r"^(?:here(?:'s| is| are)|sure|certainly|of course|okay|below is|i have|i've)\b.*:\s*$",
        re.IGNORECASE,
    )

    def __init__(self, llm=None):
        """
        Initialize the agent with optional LLM for content enhancement.

        Args:
            llm: Optional language model for content enhancement
        """
        self.llm = llm
        self.converter = MarkdownToPDFConverter()

    def setup_from_openai(self, api_key=None):
        """
        Setup agent with OpenAI LLM.

        Args:
            api_key: OpenAI API key (will use env var if not provided)

        Returns:
            True if the LLM was configured, False otherwise
        """
        try:
            from langchain_openai import ChatOpenAI
        except ImportError:
            logger.warning("LangChain OpenAI package not found. Install with 'pip install langchain-openai'")
            return False

        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            logger.warning("No OpenAI API key provided. Agent will run without LLM enhancement.")
            return False

        try:
            self.llm = ChatOpenAI(
                model="gpt-4",
                temperature=0.1,
                api_key=api_key
            )
        except Exception as e:
            logger.error(f"Error setting up OpenAI LLM: {str(e)}")
            return False
        return True

    def setup_from_gemini(self, api_key=None):
        """
        Setup agent with Google Gemini LLM.

        Args:
            api_key: Google Gemini API key (will use env var if not provided)

        Returns:
            True if the LLM was configured, False otherwise
        """
        try:
            from langchain_google_genai import ChatGoogleGenerativeAI
        except ImportError:
            logger.warning("LangChain Google Generative AI package not found. Install with 'pip install langchain-google-genai'")
            return False

        api_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not api_key:
            logger.warning("No Google API key provided. Agent will run without LLM enhancement.")
            return False

        try:
            self.llm = ChatGoogleGenerativeAI(
                model="gemini-1.5-flash",
                temperature=0.1,
                google_api_key=api_key,
                convert_system_message_to_human=True
            )
        except Exception as e:
            logger.error(f"Error setting up Google Gemini LLM: {str(e)}")
            return False

        logger.info("Successfully set up Google Gemini LLM")
        return True

    def enhance_markdown(self, content: str, instructions: str = None) -> str:
        """
        Enhance markdown content using LLM if available.

        Any failure (no LLM, import error, API error, empty or non-text
        reply) returns the original content unchanged.

        Args:
            content: Original markdown content
            instructions: Specific enhancement instructions

        Returns:
            Enhanced markdown content
        """
        if not self.llm:
            logger.warning("No LLM available for enhancement. Returning original content.")
            return content

        default_instructions = """
        Enhance this markdown content while preserving its structure and meaning.
        Make the following improvements:
        1. Fix any grammar or spelling issues
        2. Improve formatting for better readability
        3. Ensure proper markdown syntax is used
        4. Add appropriate section headings if missing
        5. Keep the content factually identical to the original
        """

        instructions = instructions or default_instructions
        prompt = f"{instructions}\n\nOriginal content:\n\n{content}\n\nPlease provide the enhanced markdown content:"

        try:
            # langchain_core ships with the provider packages used above; the
            # older location is kept as a fallback.
            try:
                from langchain_core.messages import HumanMessage
            except ImportError:
                from langchain.schema import HumanMessage

            logger.info(f"Using LLM type: {type(self.llm).__name__}")
            result = self.llm.invoke([HumanMessage(content=prompt)]).content
            logger.info("Successfully received response from LLM")
        except Exception as e:
            logger.error(f"Error invoking LLM: {str(e)}")
            return content

        if not isinstance(result, str) or not result.strip():
            logger.warning("LLM returned no usable text. Returning original content.")
            return content

        # Clean up the result (extract just the markdown part)
        return self._clean_agent_output(result) or content

    def _clean_agent_output(self, output: str) -> str:
        """
        Clean up agent output to extract just the markdown content.

        Handles three shapes of reply:
        1. the document wrapped in a fence labelled ``markdown``/``md``
           (optionally preceded or followed by commentary);
        2. the whole reply wrapped in a single unlabelled fence;
        3. plain markdown, possibly after one conversational lead-in line.
        Fenced code blocks that belong to the document itself are preserved.

        Args:
            output: Raw agent output

        Returns:
            Cleaned markdown content
        """
        text = output.strip()

        match = self._LABELLED_FENCE.search(text)
        if match:
            return match.group(1).strip()

        lines = text.split('\n')
        fence_rows = [i for i, line in enumerate(lines) if line.lstrip().startswith('```')]
        # Unwrap an unlabelled fence only when it is the sole pair of fence
        # lines; otherwise the fences are the document's own code blocks.
        if (
            len(lines) >= 2
            and fence_rows == [0, len(lines) - 1]
            and lines[0].strip() == '```'
            and lines[-1].strip() == '```'
        ):
            return '\n'.join(lines[1:-1]).strip()

        if self._PREAMBLE.match(lines[0]):
            return '\n'.join(lines[1:]).strip()

        return text

    def _new_converter(self, output_path: str, page_size: str):
        """
        Create a fresh converter that keeps the current converter's layout
        settings (fonts, margins, TOC) but writes to ``output_path``.

        Args:
            output_path: Path for the output PDF
            page_size: Page size for the PDF ("A4" or "letter")

        Returns:
            A new MarkdownToPDFConverter
        """
        current = getattr(self, 'converter', None)
        if current is None:
            return MarkdownToPDFConverter(output_path=output_path, page_size=page_size)

        return MarkdownToPDFConverter(
            output_path=output_path,
            page_size=page_size,
            margins=current.margins,
            font_name=current.font_name,
            base_font_size=current.base_font_size,
            heading_scale=current.heading_scale,
            include_toc=current.include_toc,
            code_style=current.code_style
        )

    def process_file(self, input_path: str, output_path: str = None, enhance: bool = False,
                     enhancement_instructions: str = None, page_size: str = "A4") -> Optional[str]:
        """
        Process a single markdown file and convert it to PDF.

        Args:
            input_path: Path to input markdown file
            output_path: Path for output PDF (defaults to input path with .pdf extension)
            enhance: Whether to enhance the content with LLM
            enhancement_instructions: Specific instructions for enhancement
            page_size: Page size for the PDF ("A4" or "letter")

        Returns:
            Path to the generated PDF, or None if the input file does not exist
        """
        # Validate input file
        if not os.path.exists(input_path):
            logger.error(f"Input file not found: {input_path}")
            return None

        # Set default output path if not provided
        if not output_path:
            output_path = os.path.splitext(input_path)[0] + ".pdf"

        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Enhance content if requested
        if enhance and self.llm:
            logger.info(f"Enhancing content for {input_path}")
            content = self.enhance_markdown(content, enhancement_instructions)

        # A new converter per document, carrying over the configured layout.
        self.converter = self._new_converter(output_path, page_size)

        logger.info(f"Converting {input_path} to PDF")
        self.converter.convert_content(content)

        return output_path

    def process_directory(self, input_dir: str, output_dir: str = None, pattern: str = "*.md",
                          enhance: bool = False, merge: bool = False,
                          output_filename: str = "merged_document.pdf",
                          page_size: str = "A4") -> List[str]:
        """
        Process all markdown files in a directory.

        Args:
            input_dir: Path to input directory
            output_dir: Path to output directory (defaults to input directory)
            pattern: Glob pattern for markdown files
            enhance: Whether to enhance content with LLM
            merge: Whether to merge all files into a single PDF
            output_filename: Filename for merged PDF
            page_size: Page size for the PDF ("A4" or "letter")

        Returns:
            List of paths to generated PDFs
        """
        # Validate input directory
        if not os.path.isdir(input_dir):
            logger.error(f"Input directory not found: {input_dir}")
            return []

        # Set default output directory if not provided
        if not output_dir:
            output_dir = input_dir
        else:
            os.makedirs(output_dir, exist_ok=True)

        # Sorted so the merge order and the returned list are deterministic.
        md_files = sorted(glob.glob(os.path.join(input_dir, pattern)))

        if not md_files:
            logger.warning(f"No markdown files found in {input_dir} with pattern {pattern}")
            return []

        if not merge:
            # Process each file individually
            output_files = []
            for md_file in md_files:
                pdf_name = os.path.splitext(os.path.basename(md_file))[0] + ".pdf"
                processed_file = self.process_file(
                    md_file,
                    os.path.join(output_dir, pdf_name),
                    enhance=enhance,
                    page_size=page_size
                )
                if processed_file:
                    output_files.append(processed_file)
            return output_files

        logger.info(f"Merging {len(md_files)} markdown files into a single PDF")
        output_path = os.path.join(output_dir, output_filename)
        self.converter = self._new_converter(output_path, page_size)

        if enhance and self.llm:
            enhanced_contents = []
            for md_file in md_files:
                logger.info(f"Enhancing content for {md_file}")
                with open(md_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Add file name as heading
                file_name = os.path.splitext(os.path.basename(md_file))[0]
                content = f"# {file_name}\n\n{content}"

                enhanced_contents.append(self.enhance_markdown(content))

            # Merge enhanced contents with page breaks
            merged_content = "\n\n<div class='page-break'></div>\n\n".join(enhanced_contents)
            self.converter.convert_content(merged_content)
        else:
            # Merge without enhancement
            self.converter.convert_multiple_files(md_files, merge=True)

        return [output_path]
841
+
842
+ # Gradio Interface for Hugging Face
843
def load_sample():
    """
    Load a sample markdown document.

    Returns:
        Markdown text with headings, a bullet list, a fenced code block, a
        table and a block quote, used to pre-fill the input box.
    """
    return """# Sample Markdown Document

## Introduction
This is a sample markdown document to demonstrate the capabilities of **MarkdownMuse**. You can use this as a starting point for your own documents.

## Features
- Convert markdown to PDF
- Support for tables and code blocks
- AI enhancement options

### Code Example
```python
def hello_world():
    print("Hello from MarkdownMuse!")
    return True
```

## Table Example
| Feature | Description | Status |
|---------|-------------|---------|
| Markdown Conversion | Convert MD to PDF | ✅ |
| AI Enhancement | Improve content with AI | ✅ |
| Custom Styling | Apply custom styles | ✅ |

> **Note:** This is just a sample document. Feel free to modify it or create your own!
"""
871
+
872
def process_markdown(markdown_text, page_size, font_size, font_name,
                     margin_size, include_toc, use_ai, api_key, enhancement_instructions):
    """
    Process markdown text and generate a PDF.

    Args:
        markdown_text: Markdown source entered by the user
        page_size: Page size name ("A4" or "Letter")
        font_size: Base font size in points
        font_name: Base font family name
        margin_size: Margin applied to all four sides, in inches
        include_toc: Whether to include a table of contents
        use_ai: Whether AI enhancement was requested
        api_key: Google Gemini API key; enhancement is skipped without one
        enhancement_instructions: Optional custom instructions for the LLM

    Returns:
        Path to generated PDF file, or None if there was nothing to convert
        or the conversion failed
    """
    if not markdown_text or not markdown_text.strip():
        logger.warning("No markdown content provided; nothing to convert.")
        return None

    # Reserve a path for the output; the converter reopens it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        output_path = temp_file.name

    try:
        agent = MarkdownToPDFAgent()

        # Configure converter
        agent.converter = MarkdownToPDFConverter(
            output_path=output_path,
            page_size=page_size,
            base_font_size=font_size,
            font_name=font_name,
            margins=(margin_size, margin_size, margin_size, margin_size),
            include_toc=include_toc
        )

        # Setup AI enhancement if requested
        content = markdown_text
        if use_ai and api_key and agent.setup_from_gemini(api_key):
            content = agent.enhance_markdown(content, enhancement_instructions or None)

        # Convert with the converter configured above. Going through
        # agent.process_file() would replace it with a default-configured one
        # and drop the font, margin and TOC choices.
        agent.converter.convert_content(content)
        return output_path
    except Exception as e:
        logger.error(f"Error processing markdown: {e}")
        # Do not leave an empty or partial PDF behind.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        return None
930
+
931
+ # Define the Gradio interface
932
# Two-column layout: markdown input and settings on the left, the generated
# PDF on the right.
with gr.Blocks(title="MarkdownMuse", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        "# 📝 MarkdownMuse\n"
        "\n"
        "Transform your Markdown files into beautifully formatted PDFs with ease.\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")

            markdown_input = gr.TextArea(
                placeholder="Enter your markdown content here...",
                label="Markdown Content",
                lines=15
            )

            sample_btn = gr.Button("📋 Load Sample")

            with gr.Accordion("⚙️ PDF Settings", open=False):
                page_size = gr.Radio(
                    ["A4", "Letter"],
                    label="Page Size",
                    value="A4"
                )

                font_size = gr.Slider(
                    minimum=8,
                    maximum=14,
                    value=10,
                    step=1,
                    label="Base Font Size (pt)"
                )

                font_name = gr.Dropdown(
                    ["Helvetica", "Times-Roman", "Courier"],
                    label="Font Family",
                    value="Helvetica"
                )

                margin_size = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=0.75,
                    step=0.25,
                    label="Margins (inches)"
                )

                include_toc = gr.Checkbox(
                    value=True,
                    label="Include Table of Contents"
                )

            with gr.Accordion("🧠 AI Enhancement", open=False):
                use_ai = gr.Checkbox(
                    value=False,
                    label="Enable AI Enhancement"
                )

                # Hidden until the checkbox above is ticked, matching the
                # initial unchecked state; use_ai.change() toggles them.
                api_key = gr.Textbox(
                    placeholder="Enter your Google Gemini API key...",
                    label="Google Gemini API Key",
                    type="password",
                    visible=False
                )

                enhancement_instructions = gr.TextArea(
                    placeholder="Optional: Provide specific instructions for how the AI should enhance your markdown...",
                    label="Enhancement Instructions",
                    lines=3,
                    visible=False
                )

            convert_btn = gr.Button("🔄 Convert to PDF", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### 📋 Output")

            output_pdf = gr.File(label="Generated PDF")

    # Set up event handlers
    sample_btn.click(load_sample, outputs=markdown_input)

    # Show the API key and instruction fields only while AI enhancement is on
    use_ai.change(
        lambda x: [gr.update(visible=x), gr.update(visible=x)],
        inputs=[use_ai],
        outputs=[api_key, enhancement_instructions]
    )

    # The input order must match process_markdown's parameter order.
    convert_btn.click(
        process_markdown,
        inputs=[
            markdown_input,
            page_size,
            font_size,
            font_name,
            margin_size,
            include_toc,
            use_ai,
            api_key,
            enhancement_instructions
        ],
        outputs=output_pdf
    )

    gr.Markdown(
        "### 📚 About MarkdownMuse\n"
        "\n"
        "This tool allows you to convert Markdown documents to beautifully formatted PDFs. "
        "Use the options to customize your output.\n"
        "\n"
        "Made with ❤️ using Gradio and ReportLab | © 2024\n"
    )

# Launch the app
if __name__ == "__main__":
    app.launch()