| from io import BytesIO |
| import re, tempfile, os, mimetypes, urllib.parse, requests |
| from docx import Document |
| from docx.shared import Inches, Pt, RGBColor |
| from docx.enum.text import WD_UNDERLINE, WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX |
| from docx.enum.table import WD_TABLE_ALIGNMENT |
| from docx.enum.style import WD_STYLE_TYPE |
| from docx.oxml import OxmlElement |
| from docx.oxml.ns import qn |
| from docx.opc.constants import RELATIONSHIP_TYPE as RT |
|
|
| |
# Tokenizer for inline markup.  Group 1 is the whole token; groups 2-3 are an
# image's alt text and source, groups 4-5 a link's text and URL.  Alternatives
# are tried left to right, so longer delimiters are listed before the shorter
# ones they contain ("***" before "**" before "*", "$$" before "$", and the
# triple backtick before the single one).
INLINE_RE = re.compile(
    r"""
    (
        ```[^`]+?```                |   # triple-backtick span
        \$\$[^$]+?\$\$              |   # $$...$$ span
        \$[^$\n]+?\$                |   # $...$ span (single line)
        \*\*\*.+?\*\*\*             |   # bold + italic
        \*\*.+?\*\*                 |   # bold
        __.+?__                     |   # underline
        \*[^ *].*?\*                |   # italic; body may be a single character
        ~~.+?~~                     |   # strikethrough
        ==.+?==                     |   # highlight
        `[^`]+?`                    |   # single-backtick span
        !\[([^\]]*)\]\(([^)]+)\)    |   # image:  ![alt](src)
        \[([^\]]+)\]\(([^)]+)\)         # link:   [text](url)
    )
    """,
    flags=re.VERBOSE | re.DOTALL,
)
|
|
| |
# Paragraph-style presets, keyed by a short alias.  Each preset carries the
# style "name" plus optional "font_size", "font_name", "bold", "italic" and
# "color" entries; register_custom_styles() applies them to a document.
CUSTOM_STYLES = dict(
    heading1=dict(name="Heading 1", font_size=Pt(16), bold=True),
    heading2=dict(name="Heading 2", font_size=Pt(14), bold=True),
    heading3=dict(name="Heading 3", font_size=Pt(12), bold=True),
    quote=dict(name="Quote", font_size=Pt(12), italic=True, color=RGBColor(50, 50, 50)),
    small=dict(name="SmallText", font_size=Pt(10)),
    large=dict(name="LargeText", font_size=Pt(14)),
    code=dict(name="CodeBlock", font_size=Pt(10), font_name="Courier New"),
    toc=dict(name="TOC", font_size=Pt(12)),
)
|
|
| |
def add_hyperlink(paragraph, url, text, color="0000EE"):
    """Append an external hyperlink showing ``text`` to the end of ``paragraph``.

    Args:
        paragraph: Paragraph whose part receives the external relationship and
            whose XML element receives the ``w:hyperlink`` child.
        url: Link target, registered as an external hyperlink relationship.
        text: Visible link text.
        color: Value written to the run's ``w:color`` element.
    """
    def valued(tag, value):
        # Build <tag w:val="value"/>.
        node = OxmlElement(tag)
        node.set(qn("w:val"), value)
        return node

    rel_id = paragraph.part.relate_to(url, RT.HYPERLINK, is_external=True)

    # Run properties: coloured and single-underlined.
    run_props = OxmlElement("w:rPr")
    run_props.append(valued("w:color", color))
    run_props.append(valued("w:u", "single"))

    text_node = OxmlElement("w:t")
    text_node.text = text

    run_el = OxmlElement("w:r")
    run_el.append(run_props)
    run_el.append(text_node)

    link_el = OxmlElement("w:hyperlink")
    link_el.set(qn("r:id"), rel_id)
    link_el.append(run_el)
    paragraph._p.append(link_el)
|
|
def add_image(paragraph, src, width_inches=4.0, alt="image"):
    """Insert the picture at ``src`` into ``paragraph``; never raises.

    Args:
        paragraph: Paragraph that receives a new run holding the picture.
        src: ``http``/``https`` URL (downloaded with a 10 second timeout) or
            anything else, which is handed to ``add_picture`` unchanged.
        width_inches: Rendered picture width in inches.
        alt: Label used in the placeholder text when insertion fails.

    On any failure a run reading ``[Image failed: <alt> - <error>]`` is added
    instead, so one bad image does not abort document generation.
    """
    try:
        if urllib.parse.urlparse(src).scheme in {"http", "https"}:
            # NOTE(review): src is taken from document text; if that text is
            # untrusted, this fetch (and the local-path branch) should be
            # restricted by the caller.
            rsp = requests.get(src, timeout=10)
            rsp.raise_for_status()
            # Hand the bytes over in memory: nothing is written to disk, so
            # there is no temporary file left behind to clean up.
            picture = BytesIO(rsp.content)
        else:
            picture = src
        paragraph.add_run().add_picture(picture, width=Inches(width_inches))
    except Exception as e:
        # Deliberate best-effort: degrade to a visible placeholder.
        paragraph.add_run(f"[Image failed: {alt} - {e}]")
|
|
| |
def register_custom_styles(doc):
    """Create or overwrite the paragraph styles described by ``CUSTOM_STYLES``.

    A style that already exists in ``doc`` is reused; a missing one is added
    as a paragraph style.  Font name, size, bold and italic are always
    written (defaults: Calibri, 11pt, not bold, not italic); the colour is
    written only when the preset defines one.
    """
    registry = doc.styles
    for spec in CUSTOM_STYLES.values():
        name = spec["name"]
        try:
            target = registry[name]
        except KeyError:
            target = registry.add_style(name, WD_STYLE_TYPE.PARAGRAPH)

        font = target.font
        font.name = spec.get("font_name", "Calibri")
        font.size = spec.get("font_size", Pt(11))
        font.bold = spec.get("bold", False)
        font.italic = spec.get("italic", False)
        if "color" in spec:
            font.color.rgb = spec["color"]
|
|
| |
# Delimiters whose enclosed text is emitted verbatim (never re-read as
# emphasis).  "$$" must be tried before "$".
_LITERAL_MARKS = ("$$", "$", "```")

# Emphasis delimiters and the run attributes they switch on.  Longer markers
# come first so "***" wins over "**", which wins over "*".
_EMPHASIS_MARKS = (
    ("***", ("bold", "italic")),
    ("**", ("bold",)),
    ("__", ("underline",)),
    ("*", ("italic",)),
    ("~~", ("strike",)),
    ("==", ("highlight",)),
    ("`", ("mono",)),
)


def _unwrap(token, mark):
    """Return ``token`` without ``mark`` at both ends, or ``None`` if not wrapped."""
    # The length guard stops the opening and closing marker from overlapping
    # (e.g. "**" is not an empty bold span).
    if len(token) > 2 * len(mark) and token.startswith(mark) and token.endswith(mark):
        return token[len(mark):-len(mark)]
    return None


def _add_run(par, token: str, font_size=None):
    """Render one inline token into ``par``.

    Args:
        par: Paragraph that receives the run, picture or hyperlink.
        token: One inline token: ``![alt](src)``, ``[text](url)``, a
            ``$``/``$$``/triple-backtick span, an emphasis span, or plain text.
        font_size: Optional size applied to the created text run.

    Returns:
        ``None``.  Image and link tokens are delegated to ``add_image`` and
        ``add_hyperlink``.
    """
    image = re.fullmatch(r"!\[([^\]]*)\]\(([^)]+)\)", token)
    if image:
        return add_image(par, image.group(2), alt=image.group(1))

    link = re.fullmatch(r"\[([^\]]+)\]\(([^)]+)\)", token)
    if link:
        return add_hyperlink(par, link.group(2), link.group(1))

    flags = set()
    literal = False
    for mark in _LITERAL_MARKS:
        inner = _unwrap(token, mark)
        if inner is not None:
            token, literal = inner, True
            if mark == "```":
                # Triple-backtick code gets the same face as `single` code.
                flags.add("mono")
            break

    # Literal spans keep their content as-is: "$*$" must stay "*", not turn
    # into an empty italic run.
    if not literal:
        for mark, attrs in _EMPHASIS_MARKS:
            inner = _unwrap(token, mark)
            if inner is not None:
                token = inner
                flags.update(attrs)
                break

    run = par.add_run(token)
    run.bold = "bold" in flags
    run.italic = "italic" in flags
    if "underline" in flags:
        run.underline = WD_UNDERLINE.SINGLE
    if "strike" in flags:
        run.font.strike = True
    if "highlight" in flags:
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
    if "mono" in flags:
        run.font.name = "Courier New"
    if font_size:
        run.font.size = font_size
|
|
|
|
| |
def add_paragraph_with_tokens(doc, raw, style_name="Normal", indent=0, alignment=None, spacing_before=0, spacing_after=0):
    """Append a paragraph to ``doc``, rendering the inline markup found in ``raw``.

    Args:
        doc: Document receiving the paragraph.
        raw: Paragraph text; spans matched by ``INLINE_RE`` are rendered via
            ``_add_run``, everything between them becomes plain runs.
        style_name: Paragraph style; "Normal" is used instead when adding the
            paragraph with this style raises ``KeyError``.
        indent: Left indent in inches (0 leaves the style's indent alone).
        alignment: ``WD_ALIGN_PARAGRAPH`` member; ``None`` means left-aligned.
        spacing_before: Space before the paragraph in points (0 = unchanged).
        spacing_after: Space after the paragraph in points (0 = unchanged).

    Returns:
        The newly created paragraph.
    """
    try:
        p = doc.add_paragraph(style=style_name)
    except KeyError:
        p = doc.add_paragraph(style="Normal")

    # Walk the matches instead of using INLINE_RE.split(): split() also
    # returns the pattern's inner groups (link text/URL, image alt/src), which
    # would be written out a second time as plain text after the link/image.
    cursor = 0
    for match in INLINE_RE.finditer(raw):
        if match.start() > cursor:
            p.add_run(raw[cursor:match.start()])
        _add_run(p, match.group(0))
        cursor = match.end()
    if cursor < len(raw):
        p.add_run(raw[cursor:])

    fmt = p.paragraph_format
    if indent:
        fmt.left_indent = Inches(indent)
    if spacing_before:
        fmt.space_before = Pt(spacing_before)
    if spacing_after:
        fmt.space_after = Pt(spacing_after)

    # Always write an explicit alignment so the style's own setting cannot
    # override it.  The property setter emits the single <w:jc> element;
    # appending another one by hand would leave two in the paragraph
    # properties.
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT if alignment is None else alignment

    return p
|
|
| |
def _flush_table(doc, buf):
    """Render the buffered pipe-table rows as a table in ``doc`` and empty ``buf``.

    Args:
        doc: Document receiving the table.
        buf: List of raw ``| a | b |`` lines.  The first row is the header
            (rendered bold); a following ``|---|:-:|`` style separator row is
            skipped.  The list is cleared in place.

    Body cells beyond the header's column count are dropped; short rows leave
    their trailing cells empty.  Does nothing when ``buf`` is empty.
    """
    if not buf:
        return

    def cells(row):
        # Remove exactly one outer pipe per side so that empty first/last
        # cells ("| | x |") keep their column position.
        row = row.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [c.strip() for c in row.split("|")]

    rows = [cells(line) for line in buf if line.strip()]
    buf.clear()
    if not rows:
        return

    header_cells, body = rows[0], rows[1:]

    # The separator is recognised cell by cell; matching the raw line would
    # have to account for the outer pipes every buffered row carries.
    if body and all(re.fullmatch(r":?-+:?", c) for c in body[0]):
        body = body[1:]

    tbl = doc.add_table(rows=1, cols=len(header_cells))
    tbl.style = "Table Grid"

    for j, txt in enumerate(header_cells):
        cell = tbl.cell(0, j)
        cell.text = txt
        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                run.bold = True

    for row_cells in body:
        row = tbl.add_row().cells
        for j, txt in enumerate(row_cells):
            if j < len(row):
                row[j].text = txt
|
|
| |
def add_headers_footers(doc):
    """Give every section a centred confidentiality header and a page-number footer.

    The header's first paragraph is set to the fixed banner text.  The
    footer's first paragraph becomes "Page " followed by a run containing a
    PAGE field (begin marker, instruction text, end marker).
    """
    def field_char(kind):
        # Build <w:fldChar w:fldCharType="kind"/>.
        node = OxmlElement('w:fldChar')
        node.set(qn('w:fldCharType'), kind)
        return node

    for section in doc.sections:
        banner = section.header.paragraphs[0]
        banner.text = "Confidential Draft - DO NOT DISTRIBUTE"
        banner.alignment = WD_ALIGN_PARAGRAPH.CENTER

        page_line = section.footer.paragraphs[0]
        page_line.alignment = WD_ALIGN_PARAGRAPH.CENTER
        page_line.text = "Page "

        instruction = OxmlElement('w:instrText')
        instruction.set(qn('xml:space'), 'preserve')
        instruction.text = 'PAGE'

        field_run = page_line.add_run()._r
        for node in (field_char('begin'), instruction, field_char('end')):
            field_run.append(node)
|
|
| |
def _split_alignment(line):
    """Return ``(alignment, content)`` for a line that may carry an alignment marker.

    Recognised wrappers: ``-> x <-`` and ``:: x ::`` (centre), a trailing
    ``->`` (right), and ``<center>``/``<right>``/``<left>``/``<justify>`` tag
    pairs.  Without a marker the result is ``(None, line)`` with the line
    untouched, so leading indentation is still available to later checks.
    """
    marked = line.strip()
    if (marked.startswith("->") and marked.endswith("<-")) or (
        marked.startswith("::") and marked.endswith("::")
    ):
        return WD_ALIGN_PARAGRAPH.CENTER, marked[2:-2].strip()
    if marked.endswith("->"):
        return WD_ALIGN_PARAGRAPH.RIGHT, marked[:-2].strip()
    for tag, value in (
        ("center", WD_ALIGN_PARAGRAPH.CENTER),
        ("right", WD_ALIGN_PARAGRAPH.RIGHT),
        ("left", WD_ALIGN_PARAGRAPH.LEFT),
        ("justify", WD_ALIGN_PARAGRAPH.JUSTIFY),
    ):
        opener, closer = f"<{tag}>", f"</{tag}>"
        if marked.startswith(opener) and marked.endswith(closer):
            return value, marked[len(opener):-len(closer)].strip()
    return None, line


def _add_code_block(doc, lines):
    """Emit one shaded, indented, monospaced "CodeBlock" paragraph per code line."""
    for text in lines:
        p = doc.add_paragraph(style="CodeBlock")
        shd = OxmlElement("w:shd")
        shd.set(qn("w:val"), "clear")  # w:val is a required attribute of w:shd
        shd.set(qn("w:fill"), "EEEEEE")
        # Shading is added before the indent because w:shd precedes w:ind in
        # the paragraph-properties child order.
        p._p.get_or_add_pPr().append(shd)
        p.paragraph_format.left_indent = Inches(0.25)
        run = p.add_run(text)
        run.font.name = "Courier New"


def _add_horizontal_rule(doc):
    """Emit an empty paragraph with a single bottom border, used as a rule."""
    p = doc.add_paragraph()
    border = OxmlElement("w:pBdr")
    bottom = OxmlElement("w:bottom")
    bottom.set(qn("w:val"), "single")
    bottom.set(qn("w:sz"), "6")
    bottom.set(qn("w:space"), "1")
    bottom.set(qn("w:color"), "auto")
    border.append(bottom)
    p._p.get_or_add_pPr().append(border)


def create_docx_with_layout(text: str, title=None, author=None) -> BytesIO:
    """Convert lightly marked-up text into a DOCX file held in memory.

    If ``text`` contains "Option 1", only what follows its first occurrence is
    kept; if the remainder contains "Option 2", only what precedes it is kept.
    The text is then processed line by line, recognising: fenced code blocks,
    pipe tables, blank lines, page-break markers, horizontal rules,
    ``#`` headings, ``Subject:`` lines, upper-case lines containing a colon,
    ``>`` quotes, bullet and numbered lists, and plain paragraphs.  Lines
    outside code fences may carry an alignment marker.

    Args:
        text: Source text.
        title: Document title property (default "Generated Document").
        author: Document author property (default "System").

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the saved document.
    """
    if "Option 1" in text:
        text = text.split("Option 1", 1)[1]
    if "Option 2" in text:
        text = text.split("Option 2", 1)[0]

    doc = Document()
    doc.core_properties.title = title or "Generated Document"
    doc.core_properties.author = author or "System"

    register_custom_styles(doc)
    add_headers_footers(doc)

    # US Letter with one-inch margins.
    sect = doc.sections[0]
    sect.page_height = Inches(11)
    sect.page_width = Inches(8.5)
    sect.top_margin = Inches(1)
    sect.bottom_margin = Inches(1)
    sect.left_margin = Inches(1)
    sect.right_margin = Inches(1)

    table_buf = []
    code_buf = []
    in_fenced_code = False

    for raw in text.strip("\n").splitlines():
        line = raw.rstrip("\n")

        # Fences are handled on the untouched line, before alignment parsing,
        # so code keeps its indentation and any trailing "->".
        if re.match(r"^```(\w+)?\s*$", line):
            if in_fenced_code:
                _add_code_block(doc, code_buf)
                code_buf = []
            else:
                # Emit a pending table first so it stays above the code.
                _flush_table(doc, table_buf)
            in_fenced_code = not in_fenced_code
            continue
        if in_fenced_code:
            code_buf.append(line)
            continue

        alignment, line = _split_alignment(line)

        # Consecutive pipe rows are buffered and rendered as one table.
        if re.match(r"^\s*\|.*\|\s*$", line):
            table_buf.append(line)
            continue
        _flush_table(doc, table_buf)

        if not line.strip():
            doc.add_paragraph()
            continue
        # NOTE: the "\f" entry cannot match here: splitlines() already treats
        # a form feed as a line boundary and strip() removes it.
        if line.strip() in {"[[PAGEBREAK]]", "\f", "<PAGEBREAK>"}:
            doc.add_page_break()
            continue
        if re.fullmatch(r"\s*(\*\s*\*\s*\*|---+)\s*", line):
            _add_horizontal_rule(doc)
            continue

        m = re.match(r"^(#{1,6})\s+(.+)$", line)
        if m:
            add_paragraph_with_tokens(
                doc, m.group(2).strip(), style_name=f"Heading {len(m.group(1))}", alignment=alignment
            )
            continue

        if re.match(r"^subject:", line, flags=re.I):
            add_paragraph_with_tokens(
                doc, line.split(":", 1)[1].strip(), style_name="Heading 2", alignment=alignment
            )
            continue

        if line.isupper() and ":" in line:
            add_paragraph_with_tokens(doc, line, style_name="Heading 3", alignment=alignment)
            continue

        m = re.match(r"^\s*>\s*(.+)$", line)
        if m:
            add_paragraph_with_tokens(doc, m.group(1), style_name="Quote", indent=0.3, alignment=alignment)
            continue

        # \u2022 is the bullet character; written as an escape so the pattern
        # survives re-encoding of the source file.
        bul = re.match(r"^(\s*)[-*\u2022]\s+(.+)$", line)
        num = re.match(r"^(\s*)(\d+)[.)]\s+(.+)$", line)
        if bul or num:
            indent = len(bul.group(1) if bul else num.group(1))
            level = indent // 4  # four leading characters per nesting level
            txt = bul.group(2) if bul else num.group(3)
            style = "List Bullet" if bul else "List Number"
            p = add_paragraph_with_tokens(doc, txt, style_name=style, alignment=alignment)
            if level:
                p.paragraph_format.left_indent = Inches(0.25 * level)
            continue

        add_paragraph_with_tokens(
            doc, line, style_name="Normal", alignment=alignment, spacing_before=6, spacing_after=3
        )

    # An unterminated fence still gets its buffered lines rendered.
    if in_fenced_code:
        _add_code_block(doc, code_buf)
    _flush_table(doc, table_buf)

    buf = BytesIO()
    doc.save(buf)
    buf.seek(0)
    return buf
|
|