# verolabz / docx_builder.py
# omgy's picture
# Update docx_builder.py
# 18a9e0d verified
from io import BytesIO
import re, tempfile, os, mimetypes, urllib.parse, requests
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_UNDERLINE, WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.opc.constants import RELATIONSHIP_TYPE as RT
# ───────────────────────── INLINE TOKEN REGEX ──────────────────────────
# Matches one inline Markdown-like token.  The pattern has exactly ONE capture
# group (the whole token) so that ``INLINE_RE.split(text)`` yields only
# "plain text" and "whole token" parts; extra inner groups would make
# ``re.split`` also return the link text / URL as separate parts.
# Alternatives are ordered longest-delimiter-first so that e.g. ``***`` wins
# over ``**`` and ``*``.
INLINE_RE = re.compile(
    r"""
    (
        ```[^`]+?```            |   # inline code wrapped in triple backticks
        \$\$[^$]+?\$\$          |   # display math
        \$[^$\n]+?\$            |   # inline math
        \*\*\*.+?\*\*\*         |   # bold + italic
        \*\*.+?\*\*             |   # bold
        __.+?__                 |   # underline
        \*[^ *].*?\*            |   # italic (single-character bodies allowed)
        ~~.+?~~                 |   # strikethrough
        ==.+?==                 |   # highlight
        `[^`]+?`                |   # inline code
        !\[[^\]]*\]\([^)]+\)    |   # image: ![alt](src)
        \[[^\]]+\]\([^)]+\)         # link: [text](url)
    )
    """,
    flags=re.VERBOSE | re.DOTALL,
)
# ─────────────────────── CUSTOM STYLE REGISTRY ────────────────────────
# Registry of paragraph styles, keyed by a short internal alias.
# Each entry carries the Word style "name" plus optional font overrides
# ("font_size", "font_name", "bold", "italic", "color").
CUSTOM_STYLES = {
    "heading1": {
        "name": "Heading 1",
        "font_size": Pt(16),
        "bold": True,
    },
    "heading2": {
        "name": "Heading 2",
        "font_size": Pt(14),
        "bold": True,
    },
    "heading3": {
        "name": "Heading 3",
        "font_size": Pt(12),
        "bold": True,
    },
    "quote": {
        "name": "Quote",
        "font_size": Pt(12),
        "italic": True,
        "color": RGBColor(50, 50, 50),
    },
    "small": {
        "name": "SmallText",
        "font_size": Pt(10),
    },
    "large": {
        "name": "LargeText",
        "font_size": Pt(14),
    },
    "code": {
        "name": "CodeBlock",
        "font_size": Pt(10),
        "font_name": "Courier New",
    },
    "toc": {
        "name": "TOC",
        "font_size": Pt(12),
    },
}
# ───────────────────── helpers for hyperlinks / images ──────────────────
def add_hyperlink(paragraph, url, text, color="0000EE"):
    """Append an external hyperlink to *paragraph*.

    The link is built directly as ``w:hyperlink`` XML containing a single run
    that is coloured and single-underlined.

    Args:
        paragraph: python-docx paragraph that receives the link.
        url: Link target; registered as an external relationship on the
            paragraph's part.
        text: Visible link text.
        color: Hex RGB string written to ``w:color`` (default ``"0000EE"``).
    """
    r_id = paragraph.part.relate_to(url, RT.HYPERLINK, is_external=True)

    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)

    run = OxmlElement("w:r")

    run_props = OxmlElement("w:rPr")
    color_el = OxmlElement("w:color")
    color_el.set(qn("w:val"), color)
    run_props.append(color_el)
    underline_el = OxmlElement("w:u")
    underline_el.set(qn("w:val"), "single")
    run_props.append(underline_el)
    run.append(run_props)

    text_el = OxmlElement("w:t")
    text_el.text = text
    if text != text.strip():
        # Without xml:space="preserve" leading/trailing whitespace in a w:t
        # element is not kept when the document is rendered.
        text_el.set(qn("xml:space"), "preserve")
    run.append(text_el)

    hyperlink.append(run)
    paragraph._p.append(hyperlink)
def add_image(paragraph, src, width_inches=4.0, alt="image"):
    """Insert the image at *src* into *paragraph*, or a textual placeholder.

    An ``http``/``https`` source is downloaded (10 s timeout) and handed to
    python-docx as an in-memory stream, so no temporary file is created or
    left behind.  Any other source is passed through as a local path.

    Args:
        paragraph: python-docx paragraph that receives the picture run.
        src: URL or local path of the image.
        width_inches: Rendered picture width in inches.
        alt: Label used in the placeholder text when insertion fails.

    Insertion is best-effort: any exception is caught and reported inline as
    ``[Image failed: <alt> - <error>]`` instead of aborting the document.
    """
    # NOTE(review): `src` originates from the document text. Remote fetches and
    # local path reads are not restricted here; confirm the input is trusted.
    try:
        if urllib.parse.urlparse(src).scheme in {"http", "https"}:
            rsp = requests.get(src, timeout=10)
            rsp.raise_for_status()
            image = BytesIO(rsp.content)
        else:
            image = src
        paragraph.add_run().add_picture(image, width=Inches(width_inches))
    except Exception as e:  # deliberate catch-all: degrade to placeholder text
        paragraph.add_run(f"[Image failed: {alt} - {e}]")
# ─────────────────────── Register Custom Styles ────────────────────────
def register_custom_styles(doc):
    """Create or update every paragraph style listed in ``CUSTOM_STYLES``.

    A style that already exists in *doc* is reused; a missing one is added as
    a paragraph style.  Font name, size, bold and italic are always written
    (defaults: Calibri, 11 pt, not bold, not italic); colour only when the
    registry entry provides one.
    """
    registry = doc.styles
    for spec in CUSTOM_STYLES.values():
        name = spec["name"]
        try:
            target = registry[name]
        except KeyError:
            target = registry.add_style(name, WD_STYLE_TYPE.PARAGRAPH)

        font = target.font
        font.name = spec.get("font_name", "Calibri")
        font.size = spec.get("font_size", Pt(11))
        font.bold = spec.get("bold", False)
        font.italic = spec.get("italic", False)
        if "color" in spec:
            font.color.rgb = spec["color"]
# ─────────────────────── inline-token dispatcher ────────────────────────
def _add_run(par, token: str, font_size=None):
    """Render a single inline token into paragraph *par*.

    Images and links are delegated to ``add_image`` / ``add_hyperlink``.
    Every other token has exactly one pair of delimiters removed and the
    matching character formatting applied to a new run.

    Args:
        par: python-docx paragraph that receives the run.
        token: One inline token, delimiters included (e.g. ``"**bold**"``).
        font_size: Optional font size applied to the run when truthy.

    Only the outermost delimiter pair is interpreted.  In particular the
    content of math (``$…$``, ``$$…$$``) and code (`` `…` ``, triple-backtick)
    spans is emitted literally and is not re-scanned for emphasis markers.
    """
    image = re.fullmatch(r"!\[([^\]]*)\]\(([^)]+)\)", token)
    if image:
        return add_image(par, image.group(2), alt=image.group(1))
    link = re.fullmatch(r"\[([^\]]+)\]\(([^)]+)\)", token)
    if link:
        return add_hyperlink(par, link.group(2), link.group(1))

    # (delimiter, run attributes) — longer delimiters must precede their
    # prefixes ("$$" before "$", "```" before "`", "***" before "**" and "*").
    markers = (
        ("$$", ()),
        ("$", ()),
        ("```", ("mono",)),
        ("***", ("bold", "italic")),
        ("**", ("bold",)),
        ("__", ("underline",)),
        ("*", ("italic",)),
        ("~~", ("strike",)),
        ("==", ("highlight",)),
        ("`", ("mono",)),
    )
    active = set()
    for mark, attrs in markers:
        is_wrapped = (
            len(token) >= 2 * len(mark)  # opening and closing must not overlap
            and token.startswith(mark)
            and token.endswith(mark)
        )
        if is_wrapped:
            token = token[len(mark):-len(mark)]
            active.update(attrs)
            break  # first match wins; inner text stays literal

    run = par.add_run(token)
    run.bold = "bold" in active
    run.italic = "italic" in active
    if "underline" in active:
        run.underline = WD_UNDERLINE.SINGLE
    if "strike" in active:
        run.font.strike = True
    if "highlight" in active:
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
    if "mono" in active:
        run.font.name = "Courier New"
    if font_size:
        run.font.size = font_size
# ─────────────────────── enhanced paragraph builder ───────────────────────
def add_paragraph_with_tokens(doc, raw, style_name="Normal", indent=0, alignment=None, spacing_before=0, spacing_after=0):
    """Add a paragraph to *doc*, rendering inline tokens found in *raw*.

    Args:
        doc: python-docx document the paragraph is appended to.
        raw: Paragraph text, possibly containing inline tokens.
        style_name: Paragraph style; falls back to ``"Normal"`` when the
            style lookup raises ``KeyError``.
        indent: Left indent in inches (ignored when falsy).
        alignment: ``WD_ALIGN_PARAGRAPH`` member; ``None`` forces left
            alignment.
        spacing_before: Space before the paragraph in points (ignored when
            falsy).
        spacing_after: Space after the paragraph in points (ignored when
            falsy).

    Returns:
        The newly created paragraph.
    """
    try:
        p = doc.add_paragraph(style=style_name)
    except KeyError:
        p = doc.add_paragraph(style="Normal")

    # Walk the matches instead of using INLINE_RE.split(): split() returns the
    # content of every capture group, which would emit link text / URLs a
    # second time as plain runs if the pattern has inner groups.
    cursor = 0
    for match in INLINE_RE.finditer(raw):
        if match.start() > cursor:
            p.add_run(raw[cursor:match.start()])
        _add_run(p, match.group(0))
        cursor = match.end()
    if cursor < len(raw):
        p.add_run(raw[cursor:])

    fmt = p.paragraph_format
    if indent:
        fmt.left_indent = Inches(indent)
    if spacing_before:
        fmt.space_before = Pt(spacing_before)
    if spacing_after:
        fmt.space_after = Pt(spacing_after)

    # Alignment is always written explicitly (left by default).  Setting the
    # property is sufficient; appending a hand-built w:jc afterwards would
    # leave two w:jc elements in the paragraph properties.
    p.alignment = alignment if alignment is not None else WD_ALIGN_PARAGRAPH.LEFT
    return p
# ───────────────────────── table helper ─────────────────────
def _flush_table(doc, buf):
    """Emit the pipe-table rows buffered in *buf* as a table, then clear *buf*.

    The first buffered row becomes a bold header row.  A following Markdown
    separator row (cells made only of dashes with optional colons, e.g.
    ``|---|:--:|``) is skipped.  Remaining non-blank rows become body rows;
    cells beyond the header's column count are dropped.

    Args:
        doc: python-docx document the table is appended to.
        buf: List of raw table lines; emptied in place.  Nothing happens when
            it is already empty.
    """
    if not buf:
        return

    def cells(row):
        # Remove only ONE outer pipe per side so that empty first/last cells
        # ("| | b |") keep their column position.
        row = row.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [c.strip() for c in row.split("|")]

    def is_separator(row):
        # Checked per cell: buffered rows always start with "|", so a pattern
        # anchored on a leading dash would never match the raw line.
        return all(re.fullmatch(r":?-+:?", c) for c in cells(row))

    header_cells = cells(buf[0])
    body = buf[1:]
    if body and is_separator(body[0]):
        body = body[1:]

    tbl = doc.add_table(rows=1, cols=len(header_cells))
    tbl.style = "Table Grid"
    for j, txt in enumerate(header_cells):
        cell = tbl.cell(0, j)
        cell.text = txt
        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                run.bold = True

    for line in body:
        if not line.strip():
            continue
        row = tbl.add_row().cells
        # zip() stops at the shorter side, dropping surplus cells.
        for target, txt in zip(row, cells(line)):
            target.text = txt

    buf.clear()
# ─────────────────────── Add Headers/Footers ─────────
def add_headers_footers(doc):
    """Give every section a centred header line and a "Page <n>" footer.

    The header text is a fixed confidentiality notice.  The footer consists
    of the literal text "Page " followed by a run holding a PAGE field
    (begin marker, instruction text, end marker).
    """
    centered = WD_ALIGN_PARAGRAPH.CENTER

    def field_marker(kind):
        # Build a w:fldChar element of the given fldCharType.
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    for sec in doc.sections:
        head_par = sec.header.paragraphs[0]
        head_par.text = "Confidential Draft - DO NOT DISTRIBUTE"
        head_par.alignment = centered

        foot_par = sec.footer.paragraphs[0]
        foot_par.alignment = centered
        foot_par.text = "Page "

        instruction = OxmlElement("w:instrText")
        instruction.set(qn("xml:space"), "preserve")
        instruction.text = "PAGE"

        field_run = foot_par.add_run()._r
        for element in (field_marker("begin"), instruction, field_marker("end")):
            field_run.append(element)
# ───────────────────────────── main entry ‐──────────────────────────────
def _detect_alignment(line):
    """Return ``(alignment, text)`` for a line that may carry alignment markup.

    Recognised forms: ``-> text <-`` and ``:: text ::`` (centre), a trailing
    ``->`` (right), and ``<center>``, ``<right>``, ``<left>``, ``<justify>``
    tag pairs.  Without markup the result is ``(None, line)`` with the line
    untouched; with markup the returned text is stripped of the markers and
    of surrounding whitespace.
    """
    stripped = line.strip()
    arrows = stripped.startswith("->") and stripped.endswith("<-")
    colons = stripped.startswith("::") and stripped.endswith("::")
    if arrows or colons:
        return WD_ALIGN_PARAGRAPH.CENTER, stripped[2:-2].strip()
    if stripped.endswith("->"):
        return WD_ALIGN_PARAGRAPH.RIGHT, stripped[:-2].strip()
    tags = (
        ("center", WD_ALIGN_PARAGRAPH.CENTER),
        ("right", WD_ALIGN_PARAGRAPH.RIGHT),
        ("left", WD_ALIGN_PARAGRAPH.LEFT),
        ("justify", WD_ALIGN_PARAGRAPH.JUSTIFY),
    )
    for tag, value in tags:
        opening, closing = f"<{tag}>", f"</{tag}>"
        if stripped.startswith(opening) and stripped.endswith(closing):
            return value, stripped[len(opening):-len(closing)].strip()
    return None, line


def _emit_code_block(doc, lines):
    """Write each line of a fenced code block as a shaded ``CodeBlock`` paragraph."""
    for code_line in lines:
        p = doc.add_paragraph(style="CodeBlock")
        shd = OxmlElement("w:shd")
        shd.set(qn("w:val"), "clear")  # w:val is a required attribute of w:shd
        shd.set(qn("w:color"), "auto")
        shd.set(qn("w:fill"), "EEEEEE")
        # Shading is appended before the indent is set so that w:shd precedes
        # w:ind, matching the child order the schema defines for w:pPr.
        p._p.get_or_add_pPr().append(shd)
        p.paragraph_format.left_indent = Inches(0.25)
        run = p.add_run(code_line)
        run.font.name = "Courier New"


def _add_horizontal_rule(doc):
    """Add an empty paragraph with a single bottom border as a horizontal rule."""
    p = doc.add_paragraph()
    border = OxmlElement("w:pBdr")
    bottom = OxmlElement("w:bottom")
    bottom.set(qn("w:val"), "single")
    bottom.set(qn("w:sz"), "6")
    bottom.set(qn("w:space"), "1")
    bottom.set(qn("w:color"), "auto")
    border.append(bottom)
    p._p.get_or_add_pPr().append(border)


def create_docx_with_layout(text: str, title=None, author=None) -> BytesIO:
    """Convert Markdown-like *text* into a DOCX file held in memory.

    If the text contains ``"Option 1"``, everything up to and including its
    first occurrence is discarded; if the remainder contains ``"Option 2"``,
    everything from its first occurrence on is discarded.

    Supported line-level constructs: fenced code blocks, pipe tables, blank
    lines, page breaks (``[[PAGEBREAK]]``, ``<PAGEBREAK>`` or a form feed),
    horizontal rules, ``#`` headings, ``Subject:`` lines, upper-case lines
    containing a colon, block quotes, bullet / numbered lists and plain
    paragraphs.  Alignment markup is honoured on everything except code.

    Args:
        text: Source text.
        title: Document title property (default ``"Generated Document"``).
        author: Document author property (default ``"System"``).

    Returns:
        A ``BytesIO`` positioned at 0 containing the saved document.
    """
    if "Option 1" in text:
        text = text.split("Option 1", 1)[1]
    if "Option 2" in text:
        text = text.split("Option 2", 1)[0]

    # str.splitlines() treats a form feed as a line boundary and str.strip()
    # removes it, so a bare "\f" can never be seen as a line.  Turn it into an
    # explicit marker line.  (A form feed inside a fenced code block therefore
    # shows up there as the marker text.)
    text = text.replace("\f", "\n[[PAGEBREAK]]\n")

    doc = Document()
    doc.core_properties.title = title or "Generated Document"
    doc.core_properties.author = author or "System"
    register_custom_styles(doc)
    add_headers_footers(doc)

    sect = doc.sections[0]
    sect.page_height = Inches(11)
    sect.page_width = Inches(8.5)
    sect.top_margin = Inches(1)
    sect.bottom_margin = Inches(1)
    sect.left_margin = Inches(1)
    sect.right_margin = Inches(1)

    table_buf = []
    fenced_buf = []
    in_fenced_code = False

    for raw in text.strip("\n").splitlines():
        # ── fenced code ── handled on the raw line, before alignment
        # detection, so code content is never stripped or rewritten.
        if re.match(r"^```(\w+)?\s*$", raw):
            if in_fenced_code:
                _emit_code_block(doc, fenced_buf)
                fenced_buf = []
                in_fenced_code = False
            else:
                # Emit a pending table first to keep document order.
                _flush_table(doc, table_buf)
                in_fenced_code = True
            continue
        if in_fenced_code:
            fenced_buf.append(raw)
            continue

        alignment, line = _detect_alignment(raw)

        # ── tables ──
        if re.match(r"^\s*\|.*\|\s*$", line):
            table_buf.append(line)
            continue
        _flush_table(doc, table_buf)

        # ── empty / pagebreak / horizontal rule ──
        if not line.strip():
            doc.add_paragraph()
            continue
        if line.strip() in {"[[PAGEBREAK]]", "<PAGEBREAK>"}:
            doc.add_page_break()
            continue
        if re.fullmatch(r"\s*(\*\s*\*\s*\*|---+)\s*", line):
            _add_horizontal_rule(doc)
            continue

        # ── headings ──
        m = re.match(r"^(#{1,6})\s+(.+)$", line)
        if m:
            level = len(m.group(1))
            add_paragraph_with_tokens(doc, m.group(2).strip(), style_name=f"Heading {level}", alignment=alignment)
            continue

        # ── subject / key section ──
        if re.match(r"^subject:", line, flags=re.I):
            add_paragraph_with_tokens(doc, line.split(":", 1)[1].strip(), style_name="Heading 2", alignment=alignment)
            continue
        if line.isupper() and ":" in line:
            add_paragraph_with_tokens(doc, line, style_name="Heading 3", alignment=alignment)
            continue

        # ── blockquote ──
        m = re.match(r"^\s*>\s*(.+)$", line)
        if m:
            add_paragraph_with_tokens(doc, m.group(1), style_name="Quote", indent=0.3, alignment=alignment)
            continue

        # ── list items ── (\u2022 is the bullet character "•")
        bul = re.match(r"^(\s*)[-*\u2022]\s+(.+)$", line)
        num = re.match(r"^(\s*)(\d+)[.)]\s+(.+)$", line)
        if bul or num:
            lead = bul.group(1) if bul else num.group(1)
            level = len(lead) // 4  # four leading whitespace chars per level
            txt = bul.group(2) if bul else num.group(3)
            style = "List Bullet" if bul else "List Number"
            p = add_paragraph_with_tokens(doc, txt, style_name=style, alignment=alignment)
            if level:
                p.paragraph_format.left_indent = Inches(0.25 * level)
            continue

        # ── normal paragraph ──
        add_paragraph_with_tokens(doc, line, style_name="Normal", alignment=alignment, spacing_before=6, spacing_after=3)

    # An unterminated fence still has buffered lines; emit them, don't drop them.
    if in_fenced_code:
        _emit_code_block(doc, fenced_buf)
    _flush_table(doc, table_buf)

    # ── save ──
    buf = BytesIO()
    doc.save(buf)
    buf.seek(0)
    return buf