"""Conversion result class for handling different output formats.""" import csv import io import json import logging import re from typing import Any, Dict, List, Optional, Union logger = logging.getLogger(__name__) class MarkdownToJSONParser: """Comprehensive markdown to structured JSON parser.""" def __init__(self): """Initialize the parser.""" # Compile regex patterns for better performance self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE) self.list_item_pattern = re.compile(r'^(\s*)[*\-+]\s+(.+)$', re.MULTILINE) self.ordered_list_pattern = re.compile(r'^(\s*)\d+\.\s+(.+)$', re.MULTILINE) self.code_block_pattern = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL) self.inline_code_pattern = re.compile(r'`([^`]+)`') self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)') self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)') self.table_pattern = re.compile(r'\|(.+)\|\s*\n\|[-\s|:]+\|\s*\n((?:\|.+\|\s*\n?)*)', re.MULTILINE) self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE) self.bold_pattern = re.compile(r'\*\*(.+?)\*\*') self.italic_pattern = re.compile(r'\*(.+?)\*') def parse(self, markdown_text: str) -> Dict[str, Any]: """Parse markdown text into structured JSON. Args: markdown_text: The markdown content to parse Returns: Structured JSON representation """ if not markdown_text or not markdown_text.strip(): return { "document": { "sections": [], "metadata": {"total_sections": 0} } } lines = markdown_text.split('\n') sections = [] current_section = None current_content = [] for line in lines: line = line.rstrip() # Check if this is a header header_match = self.header_pattern.match(line) if header_match: # Save previous section if exists if current_section is not None: current_section['content'] = self._parse_content('\n'.join(current_content)) sections.append(current_section) # Start new section header_level = len(header_match.group(1)) header_text = header_match.group(2).strip() current_section = { "title": header_text, "level": header_level, "type": "section", "content": {} } current_content = [] else: # Add to current content if line.strip() or current_content: # Keep empty lines only if we have content current_content.append(line) # Don't forget the last section if current_section is not None: current_section['content'] = self._parse_content('\n'.join(current_content)) sections.append(current_section) elif current_content: # Handle content without any headers sections.append({ "title": "Content", "level": 1, "type": "section", "content": self._parse_content('\n'.join(current_content)) }) # Create hierarchical structure structured_sections = self._create_hierarchy(sections) return { "document": { "sections": structured_sections, "metadata": { "total_sections": len(sections), "max_heading_level": max([s.get('level', 1) for s in sections]) if sections else 0, "has_tables": any('tables' in s.get('content', {}) for s in sections), "has_code_blocks": any('code_blocks' in s.get('content', {}) for s in sections), "has_lists": any('lists' in s.get('content', {}) for s in sections), "has_images": any('images' in s.get('content', {}) for s in sections) } } } def _parse_content(self, content: str) -> Dict[str, Any]: """Parse content within a section into structured components.""" if not content.strip(): return {} result = {} # Extract and parse different content types paragraphs = self._extract_paragraphs(content) if paragraphs: result['paragraphs'] = paragraphs lists = self._extract_lists(content) if lists: result['lists'] = lists code_blocks = self._extract_code_blocks(content) if code_blocks: result['code_blocks'] = code_blocks tables = self._extract_tables(content) if tables: result['tables'] = tables images = self._extract_images(content) if images: result['images'] = images links = self._extract_links(content) if links: result['links'] = links blockquotes = self._extract_blockquotes(content) if blockquotes: result['blockquotes'] = blockquotes return result def _extract_paragraphs(self, content: str) -> List[str]: """Extract paragraphs from content.""" # Remove code blocks, tables, lists, etc. to get clean paragraphs clean_content = content # Remove code blocks clean_content = self.code_block_pattern.sub('', clean_content) # Remove tables (simplified) clean_content = re.sub(r'\|.*\|', '', clean_content) # Remove list items clean_content = self.list_item_pattern.sub('', clean_content) clean_content = self.ordered_list_pattern.sub('', clean_content) # Remove blockquotes clean_content = self.blockquote_pattern.sub('', clean_content) # Split into paragraphs and clean paragraphs = [] for para in clean_content.split('\n\n'): para = para.strip() if para and not para.startswith('#'): # Clean up markdown formatting for paragraphs para = self._clean_inline_formatting(para) paragraphs.append(para) return paragraphs def _extract_lists(self, content: str) -> List[Dict[str, Any]]: """Extract lists from content.""" lists = [] lines = content.split('\n') current_list = None for line in lines: line = line.rstrip() # Check for unordered list unordered_match = self.list_item_pattern.match(line) if unordered_match: indent_level = len(unordered_match.group(1)) // 2 item_text = self._clean_inline_formatting(unordered_match.group(2)) if current_list is None or current_list['type'] != 'unordered': if current_list: lists.append(current_list) current_list = {'type': 'unordered', 'items': []} current_list['items'].append({ 'text': item_text, 'level': indent_level }) continue # Check for ordered list ordered_match = self.ordered_list_pattern.match(line) if ordered_match: indent_level = len(ordered_match.group(1)) // 2 item_text = self._clean_inline_formatting(ordered_match.group(2)) if current_list is None or current_list['type'] != 'ordered': if current_list: lists.append(current_list) current_list = {'type': 'ordered', 'items': []} current_list['items'].append({ 'text': item_text, 'level': indent_level }) continue # If we hit a non-list line and have a current list, save it if current_list and line.strip(): lists.append(current_list) current_list = None # Don't forget the last list if current_list: lists.append(current_list) return lists def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]: """Extract code blocks from content.""" code_blocks = [] for match in self.code_block_pattern.finditer(content): language = match.group(1) or 'text' code = match.group(2).strip() code_blocks.append({ 'language': language, 'code': code }) return code_blocks def _extract_tables(self, content: str) -> List[Dict[str, Any]]: """Extract tables from content.""" tables = [] for match in self.table_pattern.finditer(content): header_row = match.group(1).strip() body_rows = match.group(2).strip() # Parse header headers = [cell.strip() for cell in header_row.split('|') if cell.strip()] # Parse body rows rows = [] for row_line in body_rows.split('\n'): if row_line.strip() and '|' in row_line: cells = [cell.strip() for cell in row_line.split('|') if cell.strip()] if cells: rows.append(cells) if headers and rows: tables.append({ 'headers': headers, 'rows': rows, 'columns': len(headers) }) return tables def _extract_images(self, content: str) -> List[Dict[str, str]]: """Extract images from content.""" images = [] for match in self.image_pattern.finditer(content): alt_text = match.group(1) url = match.group(2) images.append({ 'alt_text': alt_text, 'url': url }) return images def _extract_links(self, content: str) -> List[Dict[str, str]]: """Extract links from content.""" links = [] for match in self.link_pattern.finditer(content): text = match.group(1) url = match.group(2) links.append({ 'text': text, 'url': url }) return links def _extract_blockquotes(self, content: str) -> List[str]: """Extract blockquotes from content.""" blockquotes = [] for match in self.blockquote_pattern.finditer(content): quote_text = match.group(1).strip() blockquotes.append(quote_text) return blockquotes def _clean_inline_formatting(self, text: str) -> str: """Clean inline markdown formatting from text.""" # Remove bold text = self.bold_pattern.sub(r'\1', text) # Remove italic text = self.italic_pattern.sub(r'\1', text) # Remove inline code text = self.inline_code_pattern.sub(r'\1', text) return text.strip() def _create_hierarchy(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Create hierarchical structure from flat sections list.""" if not sections: return [] result = [] stack = [] for section in sections: level = section['level'] # Pop from stack until we find a parent at appropriate level while stack and stack[-1]['level'] >= level: stack.pop() # If we have a parent, add this section as a subsection if stack: parent = stack[-1] if 'subsections' not in parent: parent['subsections'] = [] parent['subsections'].append(section) else: # This is a top-level section result.append(section) # Add this section to the stack stack.append(section) return result class MarkdownToHTMLConverter: """Comprehensive markdown to HTML extractor.""" def __init__(self): """Initialize the extractor.""" # Compile regex patterns for better performance self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE) self.bold_pattern = re.compile(r'\*\*(.+?)\*\*') self.italic_pattern = re.compile(r'\*(.+?)\*') self.bold_italic_pattern = re.compile(r'\*\*\*(.+?)\*\*\*') self.strikethrough_pattern = re.compile(r'~~(.+?)~~') self.inline_code_pattern = re.compile(r'`([^`]+)`') self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)') self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)') self.horizontal_rule_pattern = re.compile(r'^---+$', re.MULTILINE) self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE) def extract(self, markdown_text: str) -> str: """Convert markdown text to HTML. Args: markdown_text: The markdown content to extract Returns: HTML string """ html = markdown_text # Process code blocks first (before other inline processing) html = self._process_code_blocks(html) # Process tables html = self._process_tables(html) # Process horizontal rules html = self._process_horizontal_rules(html) # Process blockquotes html = self._process_blockquotes(html) # Process headers html = self._process_headers(html) # Process lists html = self._process_lists(html) # Process inline elements html = self._process_inline_elements(html) # Process paragraphs html = self._process_paragraphs(html) return html def _process_code_blocks(self, text: str) -> str: """Process fenced code blocks.""" # Handle ```code blocks``` def replace_code_block(match): language = match.group(1) or '' code = match.group(2) lang_class = f' class="language-{language}"' if language else '' return f'
{self._escape_html(code)}'
text = re.sub(r'```(\w+)?\n(.*?)\n```', replace_code_block, text, flags=re.DOTALL)
# Handle indented code blocks (4 spaces or tab)
lines = text.split('\n')
in_code_block = False
code_lines = []
result_lines = []
for line in lines:
if line.startswith(' ') or line.startswith('\t'):
if not in_code_block:
in_code_block = True
code_lines = [line.lstrip()]
else:
code_lines.append(line.lstrip())
else:
if in_code_block:
# End code block
code_content = '\n'.join(code_lines)
result_lines.append(f'{self._escape_html(code_content)}')
code_lines = []
in_code_block = False
result_lines.append(line)
if in_code_block:
code_content = '\n'.join(code_lines)
result_lines.append(f'{self._escape_html(code_content)}')
return '\n'.join(result_lines)
def _process_tables(self, text: str) -> str:
"""Process markdown tables."""
lines = text.split('\n')
result_lines = []
i = 0
while i < len(lines):
line = lines[i]
# Check if this line looks like a table header
if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
# Check if next line is separator
next_line = lines[i + 1]
if re.match(r'^\s*\|[\s\-:|]+\|\s*$', next_line):
# This is a table
table_lines = [line]
j = i + 1
# Collect all table rows
while j < len(lines) and '|' in lines[j]:
table_lines.append(lines[j])
j += 1
# Convert table to HTML
html_table = self._convert_table_to_html(table_lines)
result_lines.append(html_table)
i = j
continue
result_lines.append(line)
i += 1
return '\n'.join(result_lines)
def _convert_table_to_html(self, table_lines: List[str]) -> str:
"""Convert table lines to HTML table."""
if len(table_lines) < 2:
return table_lines[0] if table_lines else ''
html_parts = ['| {self._escape_html(cell)} | ') html_parts.append('
|---|
| {self._escape_html(cell)} | ') html_parts.append('
{quote_html}') i = j continue result_lines.append(line) i += 1 return '\n'.join(result_lines) def _process_headers(self, text: str) -> str: """Process markdown headers.""" def replace_header(match): level = len(match.group(1)) content = match.group(2) return f'
\1', text)
# Process links
text = self.link_pattern.sub(r'\1', text)
# Process images
text = self.image_pattern.sub(r'tags.""" lines = text.split('\n') result_lines = [] current_paragraph = [] for line in lines: if line.strip() == '': if current_paragraph: # End current paragraph paragraph_content = ' '.join(current_paragraph) result_lines.append(f'
{paragraph_content}
') current_paragraph = [] else: # Check if line is already an HTML block element if re.match(r'^<(h[1-6]|p|div|blockquote|pre|table|ul|ol|li|hr)', line.strip()): # Flush current paragraph if any if current_paragraph: paragraph_content = ' '.join(current_paragraph) result_lines.append(f'{paragraph_content}
') current_paragraph = [] result_lines.append(line) else: current_paragraph.append(line) # Handle any remaining paragraph if current_paragraph: paragraph_content = ' '.join(current_paragraph) result_lines.append(f'{paragraph_content}
') return '\n'.join(result_lines) def _escape_html(self, text: str) -> str: """Escape HTML special characters.""" return (text.replace('&', '&') .replace('<', '<') .replace('>', '>') .replace('"', '"') .replace("'", ''')) class ConversionResult: """Result object with methods to export to different formats.""" def __init__(self, content: str, metadata: Optional[Dict[str, Any]] = None): """Initialize the conversion result. Args: content: The converted content as string metadata: Optional metadata about the conversion """ self.content = content self.metadata = metadata or {} self._html_converter = MarkdownToHTMLConverter() self._json_parser = MarkdownToJSONParser() def extract_markdown(self) -> str: """Export as markdown. Returns: The content formatted as markdown """ return self.content def extract_html(self) -> str: """Export as HTML. Returns: The content formatted as HTML """ # Convert markdown content to HTML using the comprehensive extractor html_content = self._html_converter.extract(self.content) # Wrap in HTML structure with Nanonets design system return f"""