Spaces:
Sleeping
Sleeping
| // src/markdown-transformer/docsToMarkdown.ts | |
/**
 * Monospace font families that identify a text run as inline code.
 *
 * The markdown-to-docs direction styles code with these fonts, so a run whose
 * font family appears here is rendered back as a backtick code span.
 */
const CODE_FONT_FAMILIES = new Set<string>([
  'Roboto Mono',
  'Courier New',
  'Consolas',
  'monospace',
]);
| // --- Main Conversion --- | |
/**
 * Converts Google Docs JSON structure to a markdown string.
 *
 * Accepts the raw response from `docs.documents.get()`, or a subset with
 * `{ body, lists }` (e.g. when extracting a specific tab).
 *
 * Handles headings, paragraphs, text formatting (bold, italic, strikethrough,
 * underline, links, code), ordered & unordered lists with nesting, tables,
 * and section breaks.
 *
 * Two structural details are handled here rather than in the per-element
 * converters, because they depend on neighbouring elements:
 * - A section break is only rendered as a horizontal rule once some content
 *   has been emitted. A Docs body begins with a section break, which would
 *   otherwise put a stray `---` at the top of every document.
 * - A blank line is inserted when a list is followed directly by non-list
 *   content, so that content is not parsed as a lazy continuation of the
 *   last list item.
 *
 * @param docData - Document (or tab) data containing `body` and optionally `lists`.
 * @returns The markdown text, trimmed; an empty string when there is no body content.
 */
export function docsJsonToMarkdown(docData: { body?: any; lists?: any }): string {
  const content: any[] | undefined = docData.body?.content;
  if (!content) {
    return '';
  }
  const lists: Record<string, any> = docData.lists ?? {};

  let markdown = '';
  let hasContent = false;
  let previousWasListItem = false;

  for (const element of content) {
    let piece: string;
    let isListItem = false;

    if (element.paragraph) {
      piece = convertParagraph(element.paragraph, lists);
      // List items are the only non-empty paragraph output that ends with a
      // single newline; headings and plain paragraphs end with a blank line.
      isListItem =
        Boolean(element.paragraph.bullet) && piece.trim() !== '' && !piece.endsWith('\n\n');
    } else if (element.table) {
      piece = convertTable(element.table);
    } else if (element.sectionBreak) {
      // Nothing precedes this break, so there is nothing to separate.
      if (!hasContent) continue;
      piece = '\n---\n\n';
    } else {
      continue;
    }

    if (piece === '') continue;

    // Close the list with a blank line unless the next piece already starts with one.
    if (previousWasListItem && !isListItem && !piece.startsWith('\n')) {
      markdown += '\n';
    }

    markdown += piece;
    if (piece.trim() !== '') {
      hasContent = true;
    }
    previousWasListItem = isListItem;
  }

  return markdown.trim();
}
| // --- Paragraph Conversion --- | |
/**
 * Renders one Docs paragraph as markdown.
 *
 * Output shape by paragraph type (checked in this order):
 * - heading:   `#`-prefixed line followed by a blank line (level capped at 6)
 * - list item: indented `-` or `1.` line ending in a single newline, so
 *              consecutive items form one list
 * - plain:     the text followed by a blank line
 * - empty:     a single newline
 *
 * @param paragraph - A Docs `paragraph` structural element.
 * @param lists - The document's `lists` map, used to resolve list glyph types.
 * @returns The markdown for this paragraph, including its trailing newline(s).
 */
function convertParagraph(paragraph: any, lists: Record<string, any>): string {
  // Four spaces per level nests correctly under both `- ` (content offset 2)
  // and `1. ` (content offset 3) parents in CommonMark; fewer spaces leave
  // nested items as siblings of their parent.
  const listIndentUnit = '    ';

  const elements: any[] = paragraph.elements ?? [];
  const text = extractFormattedText(elements).trim();
  if (!text) {
    return '\n';
  }

  const headingLevel = getHeadingLevel(paragraph);
  if (headingLevel) {
    const hashes = '#'.repeat(Math.min(headingLevel, 6));
    return `${hashes} ${text}\n\n`;
  }

  const listInfo = getListInfo(paragraph, lists);
  if (listInfo) {
    // Clamp so a malformed negative level cannot make repeat() throw.
    const indent = listIndentUnit.repeat(Math.max(0, listInfo.nestingLevel));
    // Every ordered item uses `1.`; markdown renderers number the items themselves.
    const marker = listInfo.ordered ? '1.' : '-';
    return `${indent}${marker} ${text}\n`;
  }

  return `${text}\n\n`;
}
| // --- Heading Detection --- | |
/**
 * Maps a paragraph's named style to a markdown heading level.
 *
 * `TITLE` maps to 1, `SUBTITLE` to 2, and `HEADING_<digit>` to that digit.
 *
 * @param paragraph - A Docs `paragraph` structural element.
 * @returns The heading level, or `null` when the style is missing or is not a
 *   title/subtitle/heading style.
 */
function getHeadingLevel(paragraph: any): number | null {
  const namedStyle = paragraph.paragraphStyle?.namedStyleType;
  if (!namedStyle) {
    return null;
  }

  switch (namedStyle) {
    case 'TITLE':
      return 1;
    case 'SUBTITLE':
      return 2;
    default:
      break;
  }

  const headingMatch = namedStyle.match(/^HEADING_(\d)$/);
  if (!headingMatch) {
    return null;
  }
  return parseInt(headingMatch[1], 10);
}
| // --- List Detection --- | |
/** List membership details for a paragraph that is a list item. */
interface ListInfo {
  /** True when the item's nesting level uses a numbering glyph. */
  ordered: boolean;
  /** Zero-based nesting depth taken from the paragraph's bullet. */
  nestingLevel: number;
}

/**
 * Determines whether a paragraph is a list item and, if so, its kind and depth.
 *
 * The glyph definition is looked up in `lists[listId].listProperties.nestingLevels`
 * at the paragraph's nesting level. When the list or the level cannot be
 * resolved, the item is reported as unordered.
 *
 * @param paragraph - A Docs `paragraph` structural element.
 * @param lists - The document's `lists` map keyed by list id.
 * @returns List info, or `null` when the paragraph has no `bullet`.
 */
function getListInfo(paragraph: any, lists: Record<string, any>): ListInfo | null {
  const bullet = paragraph.bullet;
  if (!bullet) return null;

  const nestingLevel: number = bullet.nestingLevel ?? 0;
  const listId: string | undefined = bullet.listId;

  const level = listId
    ? lists[listId]?.listProperties?.nestingLevels?.[nestingLevel]
    : undefined;
  const glyphType: unknown = level?.glyphType;

  // Only numbering glyph types (DECIMAL, ALPHA, ROMAN, ...) make a list ordered.
  // GLYPH_TYPE_UNSPECIFIED accompanies symbol bullets, and NONE is an empty
  // glyph with no numbering to preserve, so both are treated as unordered.
  const ordered =
    typeof glyphType === 'string' &&
    glyphType !== '' &&
    glyphType !== 'GLYPH_TYPE_UNSPECIFIED' &&
    glyphType !== 'NONE';

  return { ordered, nestingLevel };
}
| // --- Text Run Conversion --- | |
/**
 * Concatenates the markdown for every text run in a paragraph's elements.
 * Elements without a `textRun` contribute nothing to the result.
 *
 * @param elements - The paragraph's `elements` array.
 * @returns The combined, inline-formatted text.
 */
function extractFormattedText(elements: any[]): string {
  const pieces: string[] = [];
  for (const paragraphElement of elements) {
    if (!paragraphElement.textRun) {
      continue;
    }
    pieces.push(convertTextRun(paragraphElement.textRun));
  }
  return pieces.join('');
}
/**
 * Converts a single Docs text run to markdown, applying inline formatting.
 *
 * - Runs without a `textStyle` are returned unchanged.
 * - Code-styled (monospace) runs become backtick code spans; no other
 *   formatting is applied because code spans cannot contain nested markup.
 * - Otherwise bold/italic, strikethrough, underline (`<u>`, skipped for links)
 *   and link markup are applied, in that order from innermost to outermost.
 *
 * A trailing newline and any leading/trailing whitespace are kept outside the
 * markers. Markdown only recognises emphasis delimiters that touch
 * non-whitespace text, so `**bold **` would not render as bold.
 *
 * @param textRun - A Docs `textRun` with `content` and optional `textStyle`.
 * @returns The markdown for the run, preserving its surrounding whitespace.
 */
function convertTextRun(textRun: any): string {
  const text: string = textRun.content ?? '';
  const style = textRun.textStyle;
  if (!style) return text;

  // Split off the trailing newline so no marker ever wraps it (e.g. "**text\n**").
  const trailingNewline = text.endsWith('\n');
  const content = trailingNewline ? text.slice(0, -1) : text;
  const newline = trailingNewline ? '\n' : '';

  if (isCodeStyled(style)) {
    return content ? `\`${content}\`${newline}` : text;
  }

  // Whitespace-only runs carry no visible formatting; emit them untouched
  // rather than producing empty markers such as "** **".
  const core = content.trim();
  if (!core) return text;
  const leading = content.slice(0, content.length - content.trimStart().length);
  const trailing = content.slice(content.trimEnd().length);

  let formatted = core;

  // Bold and italic combine into a single triple-asterisk marker.
  if (style.bold && style.italic) {
    formatted = `***${formatted}***`;
  } else if (style.bold) {
    formatted = `**${formatted}**`;
  } else if (style.italic) {
    formatted = `*${formatted}*`;
  }

  if (style.strikethrough) {
    formatted = `~~${formatted}~~`;
  }

  // No explicit underline is emitted for links.
  if (style.underline && !style.link) {
    formatted = `<u>${formatted}</u>`;
  }

  if (style.link?.url) {
    formatted = `[${formatted}](${style.link.url})`;
  }

  return leading + formatted + trailing + newline;
}
/**
 * Reports whether a text style uses one of the known code font families.
 *
 * @param style - A Docs `textStyle` object.
 * @returns True when `weightedFontFamily.fontFamily` is a string listed in
 *   `CODE_FONT_FAMILIES`.
 */
function isCodeStyled(style: any): boolean {
  const family: unknown = style.weightedFontFamily?.fontFamily;
  if (typeof family !== 'string') {
    return false;
  }
  return CODE_FONT_FAMILIES.has(family);
}
| // --- Table Conversion --- | |
/**
 * Renders a Docs table as a markdown pipe table.
 *
 * A table recognised as a code block is delegated to `convertCodeBlockTable`.
 * Otherwise the first row that has cells becomes the header and is followed
 * by a `---` separator with one column per header cell. Rows without
 * `tableCells` are skipped.
 *
 * @param table - A Docs `table` structural element.
 * @returns The markdown table surrounded by blank lines, or an empty string
 *   when the table has no rows.
 */
function convertTable(table: any): string {
  const rows = table.tableRows;
  if (!rows || rows.length === 0) {
    return '';
  }

  if (isCodeBlockTable(table)) {
    return convertCodeBlockTable(table);
  }

  const lines: string[] = [];
  let headerPending = true;

  for (const row of rows) {
    const cells = row.tableCells;
    if (!cells) continue;

    const renderedCells: string[] = [];
    for (const cell of cells) {
      renderedCells.push(` ${extractCellText(cell)} |`);
    }
    lines.push('|' + renderedCells.join(''));

    // The separator directly follows the first rendered row.
    if (headerPending) {
      lines.push('|' + ' --- |'.repeat(cells.length));
      headerPending = false;
    }
  }

  const tableBody = lines.map((line) => line + '\n').join('');
  return '\n' + tableBody + '\n';
}
/**
 * Decides whether a table should be treated as a code block.
 *
 * Google Docs "Code Block" building blocks are represented as styled 1x1
 * tables. A table qualifies when it has exactly one row with exactly one cell
 * and either:
 * - the cell background is a light, non-white colour (every RGB channel
 *   strictly between 0.85 and 1; missing channels count as 0), or
 * - any text run inside the cell uses a code font family.
 *
 * @param table - A Docs `table` structural element.
 * @returns True when the table looks like a code block.
 */
function isCodeBlockTable(table: any): boolean {
  // Only 1x1 tables can be code blocks.
  const rows = table.tableRows;
  if (!rows || rows.length !== 1) return false;
  const cells = rows[0].tableCells;
  if (!cells || cells.length !== 1) return false;
  const onlyCell = cells[0];

  // Background check: light gray-ish fill, excluding pure white.
  const rgb = onlyCell.tableCellStyle?.backgroundColor?.color?.rgbColor;
  if (rgb) {
    const channels: number[] = [rgb.red ?? 0, rgb.green ?? 0, rgb.blue ?? 0];
    const isLightNonWhite = channels.every((channel) => channel > 0.85 && channel < 1);
    if (isLightNonWhite) {
      return true;
    }
  }

  // Font check: any styled run in the cell using a monospace family.
  if (!onlyCell.content) {
    return false;
  }
  for (const structuralElement of onlyCell.content) {
    const runs = structuralElement.paragraph?.elements;
    if (!runs) continue;
    for (const run of runs) {
      const runStyle = run.textRun?.textStyle;
      if (runStyle && isCodeStyled(runStyle)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Converts a code block table (1x1 table) to a fenced markdown code block.
 *
 * The raw content of every text run in the cell is concatenated without any
 * inline formatting, and the final trailing newline is dropped. The fence is
 * normally three backticks, but is lengthened to exceed the longest backtick
 * run in the code so that code containing ``` cannot close the block early.
 *
 * @param table - A table with at least one row and one cell; only the first
 *   cell of the first row is read.
 * @returns The fenced code block surrounded by blank lines.
 */
function convertCodeBlockTable(table: any): string {
  const cell = table.tableRows[0].tableCells[0];

  let codeText = '';
  for (const element of cell.content || []) {
    for (const pe of element.paragraph?.elements || []) {
      if (pe.textRun?.content) {
        codeText += pe.textRun.content;
      }
    }
  }

  // Remove trailing newline (cells always end with one)
  if (codeText.endsWith('\n')) {
    codeText = codeText.slice(0, -1);
  }

  // A closing fence must be at least as long as the opening one, so a fence
  // longer than any backtick run in the code can never be matched by it.
  const backtickRuns: string[] = codeText.match(/`+/g) ?? [];
  const longestRun = Math.max(0, ...backtickRuns.map((run) => run.length));
  const fence = '`'.repeat(Math.max(3, longestRun + 1));

  return `\n${fence}\n${codeText}\n${fence}\n\n`;
}
/**
 * Flattens a table cell to a single line of plain text for a markdown table.
 *
 * The raw content of every text run in the cell's paragraphs is joined first
 * and normalised afterwards. Trimming each run separately would drop the
 * spaces between adjacent runs and between paragraphs. Newlines become
 * spaces, and literal `|` characters are escaped so they are not read as
 * column separators.
 *
 * @param cell - A Docs `tableCell`.
 * @returns The cell text on one line, trimmed; an empty string when the cell
 *   has no content.
 */
function extractCellText(cell: any): string {
  if (!cell.content) return '';

  let raw = '';
  for (const element of cell.content) {
    for (const pe of element.paragraph?.elements || []) {
      if (pe.textRun?.content) {
        raw += pe.textRun.content;
      }
    }
  }

  return raw
    .replace(/\n/g, ' ')
    .trim()
    .replace(/\|/g, '\\|');
}