sdlc-agent / src /data /convert_to_word.py
Veeru-c's picture
initial commit
06bd253
#!/usr/bin/env python3
"""
Convert markdown to Word document using python-docx
"""
import importlib
import re
import subprocess
import sys

# python-docx is the only third-party dependency of this script. If it is
# missing, install it with pip for the running interpreter and retry the import.
try:
    from docx import Document
    from docx.shared import Pt, Inches, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
except ImportError:
    print("python-docx not installed. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-docx", "-q"])
    # The import system caches directory listings; refresh them so a package
    # installed while this process is running is guaranteed to be found.
    importlib.invalidate_caches()
    from docx import Document
    from docx.shared import Pt, Inches, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
# Inline markdown constructs that are flattened to their visible text.
_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
_LINK_RE = re.compile(r'\[(.*?)\]\(.*?\)')
_INLINE_CODE_RE = re.compile(r'`(.*?)`')
# ATX heading: one to six '#', whitespace, then the heading text.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
# One cell of a table delimiter row, e.g. '---', ':---', ':---:'.
_SEPARATOR_CELL_RE = re.compile(r'^:?-{3,}:?$')
_BULLET_PREFIXES = ('- ', '* ')
_CODE_FENCE = '```'
# A '# ' heading is treated as the document title only this close to the top.
_TITLE_SEARCH_LINES = 5


def _clean_inline(text):
    """Strip bold, link and inline-code markup, keeping the visible text."""
    text = _BOLD_RE.sub(r'\1', text)
    text = _LINK_RE.sub(r'\1', text)
    return _INLINE_CODE_RE.sub(r'\1', text)


def _is_table_line(line):
    """Return True if the line has enough pipes to be read as a table row."""
    return line.count('|') >= 2


def _split_table_row(row):
    """Split a table row into stripped cells, tolerating missing outer pipes."""
    row = row.strip()
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def _is_separator_row(row):
    """Return True for a header/body delimiter row such as '| --- | :---: |'."""
    stripped = row.strip()
    if stripped.startswith('|---'):
        return True
    return all(_SEPARATOR_CELL_RE.match(cell) for cell in _split_table_row(stripped))


def _add_table(doc, rows):
    """Append a table to doc; rows[0] is the (bold) header, the rest is data."""
    header, body = rows[0], rows[1:]
    # Size the table for the widest row so ragged rows are kept, not dropped.
    width = max(len(row) for row in rows)
    table = doc.add_table(rows=1, cols=width)
    table.style = 'Light Grid Accent 1'

    padded_header = header + [''] * (width - len(header))
    for cell, text in zip(table.rows[0].cells, padded_header):
        cell.text = text
        cell.paragraphs[0].runs[0].font.bold = True

    for row in body:
        for cell, text in zip(table.add_row().cells, row):
            cell.text = text


def _add_code_block(doc, lines, start):
    """Emit the fenced code block opening at lines[start]; return the next index."""
    idx = start + 1
    code_lines = []
    while idx < len(lines) and not lines[idx].strip().startswith(_CODE_FENCE):
        # Keep leading indentation: it is significant inside code.
        code_lines.append(lines[idx].rstrip())
        idx += 1
    if code_lines:
        run = doc.add_paragraph().add_run('\n'.join(code_lines))
        run.font.name = 'Courier New'
    # Step over the closing fence; harmless if the fence was never closed.
    return idx + 1


def markdown_to_word(md_file, docx_file):
    """Convert a markdown file to a Word document.

    Supported constructs: ATX headings, pipe tables, '-'/'*' bullet lists,
    fenced code blocks and plain paragraphs. Bold, link and inline-code
    markup is flattened to plain text rather than rendered.

    Heading mapping: a '# ' heading within the first five lines becomes the
    centred document title (level 0); a later '# ' heading becomes level 1;
    '##' through '######' become levels 1 through 5.

    Args:
        md_file: Path of the UTF-8 markdown file to read.
        docx_file: Path the .docx document is saved to.
    """
    with open(md_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    doc = Document()

    # Default body font for the whole document.
    normal_font = doc.styles['Normal'].font
    normal_font.name = 'Calibri'
    normal_font.size = Pt(11)

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if not line:
            i += 1
            continue

        # Fenced code goes first so its contents are never parsed as markdown.
        if line.startswith(_CODE_FENCE):
            i = _add_code_block(doc, lines, i)
            continue

        heading_match = _HEADING_RE.match(line)
        if heading_match:
            hashes, text = heading_match.groups()
            text = _clean_inline(text.strip())
            if len(hashes) == 1 and i < _TITLE_SEARCH_LINES:
                title = doc.add_heading(text, level=0)
                title.alignment = WD_ALIGN_PARAGRAPH.CENTER
            else:
                doc.add_heading(text, level=max(1, len(hashes) - 1))
            i += 1
            continue

        if _is_table_line(line):
            rows = []
            j = i
            while j < len(lines) and _is_table_line(lines[j]):
                if not _is_separator_row(lines[j]):
                    rows.append([_clean_inline(cell) for cell in _split_table_row(lines[j])])
                j += 1
            if rows:
                _add_table(doc, rows)
            i = j
            continue

        if line.startswith(_BULLET_PREFIXES):
            while i < len(lines) and lines[i].strip().startswith(_BULLET_PREFIXES):
                item = lines[i].strip()[2:].strip()
                doc.add_paragraph(_clean_inline(item), style='List Bullet')
                i += 1
            continue

        # Anything else is a regular paragraph.
        text = _clean_inline(line)
        if text.strip():
            doc.add_paragraph(text)
        i += 1

    doc.save(docx_file)
    print(f"\u2713 Word document created: {docx_file}")
if __name__ == "__main__":
    # Script entry point: convert the product design markdown to .docx, and
    # fall back to a plain headings-and-paragraphs conversion if the full
    # converter fails on the document's content.
    md_file = "docs/tokyo_auto_insurance_product_design_filled.md"
    docx_file = "docs/tokyo_auto_insurance_product_design.docx"

    try:
        markdown_to_word(md_file, docx_file)
    except OSError as e:
        # Unreadable input or unwritable output: the fallback uses the same
        # paths and would hit the same error, so stop here.
        print(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        print("\nTrying alternative: Creating a simpler Word document...")

        # Fallback: create a basic Word doc
        doc = Document()
        doc.add_heading('Auto Insurance Product Design Document for Tokyo, Japan', 0)

        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # A heading needs whitespace and text after its '#' run, so lines like
        # '#tag' or a bare '#' stay ordinary paragraphs.
        heading_re = re.compile(r'^(#+)\s+(.+)$')

        # Simple conversion - just headings and paragraphs
        for line in content.split('\n'):
            line = line.strip()
            if not line:
                continue

            heading_match = heading_re.match(line)
            if heading_match:
                hashes, text = heading_match.groups()
                # Heading levels deeper than 3 are clamped to level 3.
                doc.add_heading(text.strip(), level=min(len(hashes), 3))
                continue

            # Remove markdown (bold markers, links, inline code)
            line = line.replace('**', '')
            line = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', line)
            line = re.sub(r'`(.*?)`', r'\1', line)
            if line:
                doc.add_paragraph(line)

        doc.save(docx_file)
        print(f"\u2713 Word document created (simple format): {docx_file}")