Spaces:

omgy
/

verolabz

Sleeping

App Files Files Community

verolabz / docx_builder.py

omgy

Update docx_builder.py

18a9e0d verified 6 months ago

raw

history blame contribute delete

15.4 kB

	from io import BytesIO
	import re, tempfile, os, mimetypes, urllib.parse, requests
	from docx import Document
	from docx.shared import Inches, Pt, RGBColor
	from docx.enum.text import WD_UNDERLINE, WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX
	from docx.enum.table import WD_TABLE_ALIGNMENT
	from docx.enum.style import WD_STYLE_TYPE
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	from docx.opc.constants import RELATIONSHIP_TYPE as RT

	# ───────────────────────── INLINE TOKEN REGEX ──────────────────────────
	# Matches a single inline markup token.  Alternatives are ordered so that
	# longer delimiters win over their own prefixes (``` before `, *** before
	# ** before *, $$ before $).  Only the outer group captures, so
	# ``INLINE_RE.split(text)`` yields nothing but plain-text pieces and whole
	# tokens; image/link tokens are re-parsed by the token dispatcher.
	INLINE_RE = re.compile(
	    r"""
	    (
	        ```[^`]+?```            |   # inline code fence
	        \$\$[^$]+?\$\$          |   # display math
	        \$[^$\n]+?\$            |   # inline math
	        \*\*\*.+?\*\*\*         |   # bold + italic
	        \*\*.+?\*\*             |   # bold
	        __.+?__                 |   # underline
	        \*[^* ].+?\*            |   # italic
	        ~~.+?~~                 |   # strikethrough
	        ==.+?==                 |   # highlight
	        `[^`]+?`                |   # inline code
	        !\[[^\]]*\]\([^)]+\)    |   # image: ![alt](src)
	        \[[^\]]+\]\([^)]+\)         # link: [text](url)
	    )
	    """,
	    flags=re.VERBOSE | re.DOTALL,
	)

	# ─────────────────────── CUSTOM STYLE REGISTRY ────────────────────────
	# Paragraph styles created or overridden on every generated document.
	# Each entry maps an internal key to the Word style name ("name") plus the
	# font properties to apply; properties that are left out fall back to the
	# defaults chosen by the registration routine.
	CUSTOM_STYLES = {
	    "heading1": {
	        "name": "Heading 1",
	        "font_size": Pt(16),
	        "bold": True,
	    },
	    "heading2": {
	        "name": "Heading 2",
	        "font_size": Pt(14),
	        "bold": True,
	    },
	    "heading3": {
	        "name": "Heading 3",
	        "font_size": Pt(12),
	        "bold": True,
	    },
	    "quote": {
	        "name": "Quote",
	        "font_size": Pt(12),
	        "italic": True,
	        "color": RGBColor(50, 50, 50),
	    },
	    "small": {
	        "name": "SmallText",
	        "font_size": Pt(10),
	    },
	    "large": {
	        "name": "LargeText",
	        "font_size": Pt(14),
	    },
	    "code": {
	        "name": "CodeBlock",
	        "font_size": Pt(10),
	        "font_name": "Courier New",
	    },
	    "toc": {
	        "name": "TOC",
	        "font_size": Pt(12),
	    },
	}

	# ───────────────────── helpers for hyperlinks / images ──────────────────
	def add_hyperlink(paragraph, url, text, color="0000EE"):
	    """Append an external hyperlink to the end of *paragraph*.

	    The link is built directly as XML: a ``w:hyperlink`` element bound to an
	    external relationship for *url*, holding one run that is coloured and
	    single-underlined.

	    Args:
	        paragraph: Paragraph receiving the hyperlink.
	        url: Link target, registered as an external relationship.
	        text: Visible link text.
	        color: Hex RGB colour of the link text.
	    """
	    r_id = paragraph.part.relate_to(url, RT.HYPERLINK, is_external=True)

	    hyperlink = OxmlElement("w:hyperlink")
	    hyperlink.set(qn("r:id"), r_id)

	    run_el = OxmlElement("w:r")
	    run_props = OxmlElement("w:rPr")

	    colour_el = OxmlElement("w:color")
	    colour_el.set(qn("w:val"), color)
	    run_props.append(colour_el)

	    underline_el = OxmlElement("w:u")
	    underline_el.set(qn("w:val"), "single")
	    run_props.append(underline_el)

	    run_el.append(run_props)

	    text_el = OxmlElement("w:t")
	    # Without xml:space="preserve" Word drops leading/trailing spaces of the
	    # link text, gluing the link to the neighbouring words.
	    text_el.set(qn("xml:space"), "preserve")
	    text_el.text = text
	    run_el.append(text_el)

	    hyperlink.append(run_el)
	    paragraph._p.append(hyperlink)

	def add_image(paragraph, src, width_inches=4.0, alt="image"):
	    """Insert the picture at *src* into *paragraph*, best-effort.

	    ``http``/``https`` sources are downloaded (10 s timeout) and handed to
	    the document as an in-memory stream; anything else is treated as a local
	    path.  Any failure is reported inline as a placeholder run instead of
	    being raised, so one bad image does not abort document generation.

	    Args:
	        paragraph: Paragraph receiving the picture.
	        src: Image URL or local file path.
	        width_inches: Rendered picture width in inches.
	        alt: Label used in the placeholder text when the image fails.
	    """
	    try:
	        if urllib.parse.urlparse(src).scheme in {"http", "https"}:
	            # NOTE(review): src originates from the document text; if that
	            # text is untrusted, fetching arbitrary URLs (and reading
	            # arbitrary local paths below) may need an allow-list.
	            rsp = requests.get(src, timeout=10)
	            rsp.raise_for_status()
	            # Pass the bytes as a stream rather than via mkstemp(): the
	            # temporary file was never deleted afterwards.
	            picture = BytesIO(rsp.content)
	        else:
	            picture = src
	        paragraph.add_run().add_picture(picture, width=Inches(width_inches))
	    except Exception as e:
	        # Deliberately broad: download, decoding and file errors all end up
	        # as a visible placeholder in the document.
	        paragraph.add_run(f"[Image failed: {alt} - {e}]")

	# ─────────────────────── Register Custom Styles ────────────────────────
	def register_custom_styles(doc):
	    """Create or update every paragraph style described in ``CUSTOM_STYLES``.

	    A style that already exists in *doc* is reused; a missing one is added as
	    a paragraph style.  Font name, size, bold and italic are always written
	    (defaults: Calibri, 11 pt, not bold, not italic); the colour is written
	    only when the entry defines one.
	    """
	    registry = doc.styles
	    for spec in CUSTOM_STYLES.values():
	        name = spec["name"]
	        try:
	            target = registry[name]
	        except KeyError:
	            target = registry.add_style(name, WD_STYLE_TYPE.PARAGRAPH)

	        font = target.font
	        font.name = spec.get("font_name", "Calibri")
	        font.size = spec.get("font_size", Pt(11))
	        font.bold = spec.get("bold", False)
	        font.italic = spec.get("italic", False)
	        if "color" in spec:
	            font.color.rgb = spec["color"]

	# ─────────────────────── inline-token dispatcher ────────────────────────
	def _add_run(par, token: str, font_size=None):
	    """Render one inline markup token into paragraph *par*.

	    Image tokens (``![alt](src)``) and link tokens (``[text](url)``) are
	    delegated to ``add_image`` / ``add_hyperlink``.  For every other token
	    the math or triple-backtick delimiters are removed first, then at most
	    one emphasis marker pair is stripped and applied to the new run.

	    Args:
	        par: Paragraph the run is appended to.
	        token: Raw token text, delimiters included.
	        font_size: Optional font size set on the run when truthy.

	    Returns:
	        The result of ``add_image`` / ``add_hyperlink`` for image and link
	        tokens, otherwise ``None``.
	    """
	    m = re.fullmatch(r"!\[([^\]]*)\]\(([^)]+)\)", token)
	    if m:
	        return add_image(par, m.group(2), alt=m.group(1))

	    m = re.fullmatch(r"\[([^\]]+)\]\(([^)]+)\)", token)
	    if m:
	        return add_hyperlink(par, m.group(2), m.group(1))

	    # Math and code-fence delimiters carry no formatting: just drop them.
	    for delim in ("$$", "$", "```"):
	        if (
	            len(token) >= 2 * len(delim)
	            and token.startswith(delim)
	            and token.endswith(delim)
	        ):
	            token = token[len(delim):-len(delim)]
	            break

	    # Longest markers first so "***" is not mistaken for "**" or "*".
	    markers = (
	        ("***", ("bold", "italic")),
	        ("**", ("bold",)),
	        ("__", ("underline",)),
	        ("*", ("italic",)),
	        ("~~", ("strike",)),
	        ("==", ("highlight",)),
	        ("`", ("mono",)),
	    )
	    active = set()
	    for mark, attrs in markers:
	        # The length guard requires real content between the markers, so a
	        # bare "*" or "**" is never treated as its own opener and closer.
	        if (
	            len(token) > 2 * len(mark)
	            and token.startswith(mark)
	            and token.endswith(mark)
	        ):
	            token = token[len(mark):-len(mark)]
	            active.update(attrs)
	            break

	    run = par.add_run(token)
	    # Properties are only ever switched on.  Writing an explicit False would
	    # override bold/italic inherited from the paragraph style (for example
	    # inside headings or quotes).
	    if "bold" in active:
	        run.bold = True
	    if "italic" in active:
	        run.italic = True
	    if "underline" in active:
	        run.underline = WD_UNDERLINE.SINGLE
	    if "strike" in active:
	        run.font.strike = True
	    if "highlight" in active:
	        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
	    if "mono" in active:
	        run.font.name = "Courier New"
	    if font_size:
	        run.font.size = font_size


	# ─────────────────────── enhanced paragraph builder ───────────────────────
	def add_paragraph_with_tokens(doc, raw, style_name="Normal", indent=0, alignment=None, spacing_before=0, spacing_after=0):
	    """Append a paragraph to *doc*, rendering the inline markup in *raw*.

	    Args:
	        doc: Object exposing ``add_paragraph`` (the document being built).
	        raw: Paragraph text, possibly containing inline tokens.
	        style_name: Paragraph style; "Normal" is used instead when looking
	            the style up raises ``KeyError``.
	        indent: Left indent in inches; ignored when falsy.
	        alignment: ``WD_ALIGN_PARAGRAPH`` member; ``None`` means left.
	        spacing_before: Space before the paragraph in points; ignored when
	            falsy.
	        spacing_after: Space after the paragraph in points; ignored when
	            falsy.

	    Returns:
	        The newly created paragraph.
	    """
	    try:
	        p = doc.add_paragraph(style=style_name)
	    except KeyError:
	        p = doc.add_paragraph(style="Normal")

	    # Walk the matches instead of calling INLINE_RE.split(): split() also
	    # returns the content of any nested capture group, which would then be
	    # written a second time as plain text (e.g. a link's text and URL).
	    pos = 0
	    for match in INLINE_RE.finditer(raw):
	        if match.start() > pos:
	            p.add_run(raw[pos:match.start()])
	        _add_run(p, match.group(0))
	        pos = match.end()
	    if pos < len(raw):
	        p.add_run(raw[pos:])

	    fmt = p.paragraph_format
	    if indent:
	        fmt.left_indent = Inches(indent)
	    if spacing_before:
	        fmt.space_before = Pt(spacing_before)
	    if spacing_after:
	        fmt.space_after = Pt(spacing_after)

	    # Alignment is always written explicitly (left by default).  The
	    # property maintains the paragraph's single <w:jc> element, so no
	    # additional element is appended by hand - that would duplicate it.
	    p.alignment = WD_ALIGN_PARAGRAPH.LEFT if alignment is None else alignment

	    return p

	# ───────────────────────── table helper ─────────────────────
	def _flush_table(doc, buf):
	    """Render the buffered pipe-table lines as a table in *doc* and empty *buf*.

	    The first buffered line is the header row (its runs are made bold).  A
	    Markdown separator row such as ``| --- | :-: |`` directly after the
	    header is skipped.  Every remaining non-blank line becomes a body row;
	    cells beyond the header's column count are dropped.

	    Args:
	        doc: Object exposing ``add_table(rows=..., cols=...)``.
	        buf: List of raw table lines; cleared in place once rendered.
	    """
	    if not buf:
	        return

	    def cells(row):
	        # Remove at most one bounding pipe per side so that an empty first
	        # or last cell is kept instead of being stripped away.
	        row = row.strip()
	        if row.startswith("|"):
	            row = row[1:]
	        if row.endswith("|"):
	            row = row[:-1]
	        return [c.strip() for c in row.split("|")]

	    header, body = buf[0], buf[1:]

	    # The bounding pipes are optional in the pattern: buffered rows normally
	    # start and end with "|", which a pattern anchored on the dashes alone
	    # could never match.
	    separator = r"^\s*\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?\s*$"
	    if body and re.match(separator, body[0]):
	        body = body[1:]

	    header_cells = cells(header)
	    tbl = doc.add_table(rows=1, cols=len(header_cells))
	    tbl.style = "Table Grid"

	    for j, txt in enumerate(header_cells):
	        cell = tbl.cell(0, j)
	        cell.text = txt
	        for paragraph in cell.paragraphs:
	            for run in paragraph.runs:
	                run.bold = True

	    for line in body:
	        if not line.strip():
	            continue
	        row = tbl.add_row().cells
	        # zip() stops at the shorter side, dropping surplus cells.
	        for target, txt in zip(row, cells(line)):
	            target.text = txt

	    buf.clear()

	# ─────────────────────── Add Headers/Footers ─────────
	def add_headers_footers(doc):
	    """Give every section a centered banner header and a "Page N" footer.

	    The footer text "Page " is followed by a run holding a PAGE field
	    (begin marker, instruction text, end marker).
	    """
	    def _field_char(kind):
	        # Build a <w:fldChar> marker of the requested type.
	        marker = OxmlElement('w:fldChar')
	        marker.set(qn('w:fldCharType'), kind)
	        return marker

	    for sec in doc.sections:
	        head_par = sec.header.paragraphs[0]
	        head_par.text = "Confidential Draft - DO NOT DISTRIBUTE"
	        head_par.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        foot_par = sec.footer.paragraphs[0]
	        foot_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
	        foot_par.text = "Page "

	        field_run = foot_par.add_run()._r
	        field_run.append(_field_char('begin'))

	        instruction = OxmlElement('w:instrText')
	        instruction.set(qn('xml:space'), 'preserve')
	        instruction.text = 'PAGE'
	        field_run.append(instruction)

	        field_run.append(_field_char('end'))

	# ───────────────────────────── main entry ──────────────────────────────
	def _split_alignment(line):
	    """Return ``(alignment, text)`` after peeling an alignment wrapper off *line*.

	    Recognised wrappers: ``->text<-`` and ``::text::`` (center), ``text->``
	    (right), and ``<center>``, ``<right>``, ``<left>``, ``<justify>`` tag
	    pairs.  Without a wrapper the result is ``(None, line)`` with the line
	    untouched.
	    """
	    probe = line.strip()
	    if (probe.startswith("->") and probe.endswith("<-")) or (
	        probe.startswith("::") and probe.endswith("::")
	    ):
	        return WD_ALIGN_PARAGRAPH.CENTER, probe[2:-2].strip()
	    if probe.endswith("->"):
	        return WD_ALIGN_PARAGRAPH.RIGHT, probe[:-2].strip()

	    tag_alignments = (
	        ("center", WD_ALIGN_PARAGRAPH.CENTER),
	        ("right", WD_ALIGN_PARAGRAPH.RIGHT),
	        ("left", WD_ALIGN_PARAGRAPH.LEFT),
	        ("justify", WD_ALIGN_PARAGRAPH.JUSTIFY),
	    )
	    for tag, value in tag_alignments:
	        opener, closer = f"<{tag}>", f"</{tag}>"
	        if probe.startswith(opener) and probe.endswith(closer):
	            return value, probe[len(opener):-len(closer)].strip()
	    return None, line


	def _emit_code_block(doc, lines):
	    """Write each line of a fenced code block as a shaded monospace paragraph."""
	    for text in lines:
	        p = doc.add_paragraph(style="CodeBlock")
	        p.paragraph_format.left_indent = Inches(0.25)
	        run = p.add_run(text)
	        run.font.name = "Courier New"
	        shd = OxmlElement("w:shd")
	        # w:val is a required attribute of w:shd; "clear" means a plain
	        # background fill with no pattern.
	        shd.set(qn("w:val"), "clear")
	        shd.set(qn("w:color"), "auto")
	        shd.set(qn("w:fill"), "EEEEEE")
	        p._p.get_or_add_pPr().append(shd)


	def _add_horizontal_rule(doc):
	    """Add an empty paragraph whose bottom border acts as a horizontal rule."""
	    p = doc.add_paragraph()
	    border = OxmlElement("w:pBdr")
	    bottom = OxmlElement("w:bottom")
	    bottom.set(qn("w:val"), "single")
	    bottom.set(qn("w:sz"), "6")
	    bottom.set(qn("w:space"), "1")
	    bottom.set(qn("w:color"), "auto")
	    border.append(bottom)
	    p._p.get_or_add_pPr().append(border)


	def create_docx_with_layout(text: str, title=None, author=None) -> BytesIO:
	    """Convert lightweight-markup *text* into a .docx file held in memory.

	    If *text* contains "Option 1", only what follows its first occurrence is
	    used; if the remainder contains "Option 2", everything from there on is
	    discarded.  The rest is processed line by line: fenced code blocks, pipe
	    tables, blank lines, page breaks, horizontal rules, ``#`` headings,
	    ``Subject:`` lines, upper-case ``LABEL:`` lines, block quotes, bullet and
	    numbered lists, and normal paragraphs.  Alignment wrappers (see
	    ``_split_alignment``) apply to every non-code line.

	    Args:
	        text: Source text.
	        title: Document title property; "Generated Document" when falsy.
	        author: Document author property; "System" when falsy.

	    Returns:
	        A ``BytesIO`` positioned at offset 0 holding the saved document.
	    """
	    if "Option 1" in text:
	        text = text.split("Option 1", 1)[1]
	    if "Option 2" in text:
	        text = text.split("Option 2", 1)[0]

	    doc = Document()
	    doc.core_properties.title = title or "Generated Document"
	    doc.core_properties.author = author or "System"

	    register_custom_styles(doc)
	    add_headers_footers(doc)

	    # US Letter with one-inch margins.
	    sect = doc.sections[0]
	    sect.page_height = Inches(11)
	    sect.page_width = Inches(8.5)
	    sect.top_margin = Inches(1)
	    sect.bottom_margin = Inches(1)
	    sect.left_margin = Inches(1)
	    sect.right_margin = Inches(1)

	    table_buf = []
	    code_buf = []
	    in_fenced_code = False

	    for raw in text.strip("\n").splitlines():
	        # ── fenced code ──
	        # Handled before alignment detection so code lines are stored
	        # verbatim (a line ending in "->" must not lose its arrow).
	        if re.match(r"^```(\w+)?\s*$", raw):
	            if in_fenced_code:
	                _emit_code_block(doc, code_buf)
	                code_buf = []
	            else:
	                # Keep document order: a table buffered just before the
	                # fence has to be written ahead of the code block.
	                _flush_table(doc, table_buf)
	            in_fenced_code = not in_fenced_code
	            continue
	        if in_fenced_code:
	            code_buf.append(raw)
	            continue

	        alignment, line = _split_alignment(raw)

	        # ── tables ──
	        if re.match(r"^\s*\|.*\|\s*$", line):
	            table_buf.append(line)
	            continue
	        _flush_table(doc, table_buf)

	        # ── empty / pagebreak / horizontal rule ──
	        if not line.strip():
	            doc.add_paragraph()
	            continue
	        if line.strip() in {"[[PAGEBREAK]]", "\f", "<PAGEBREAK>"}:
	            doc.add_page_break()
	            continue
	        if re.fullmatch(r"\s*(\*\s*\*\s*\*|---+)\s*", line):
	            _add_horizontal_rule(doc)
	            continue

	        # ── headings ──
	        m = re.match(r"^(#{1,6})\s+(.+)$", line)
	        if m:
	            level = len(m.group(1))
	            add_paragraph_with_tokens(
	                doc, m.group(2).strip(),
	                style_name=f"Heading {level}", alignment=alignment,
	            )
	            continue

	        # ── subject / key section ──
	        if re.match(r"^subject:", line, flags=re.I):
	            add_paragraph_with_tokens(
	                doc, line.split(":", 1)[1].strip(),
	                style_name="Heading 2", alignment=alignment,
	            )
	            continue

	        if line.isupper() and ":" in line:
	            add_paragraph_with_tokens(doc, line, style_name="Heading 3", alignment=alignment)
	            continue

	        # ── blockquote ──
	        m = re.match(r"^\s*>\s*(.+)$", line)
	        if m:
	            add_paragraph_with_tokens(
	                doc, m.group(1), style_name="Quote", indent=0.3, alignment=alignment,
	            )
	            continue

	        # ── list items ──
	        bul = re.match(r"^(\s*)[-*•]\s+(.+)$", line)
	        num = re.match(r"^(\s*)(\d+)[.)]\s+(.+)$", line)
	        if bul or num:
	            indent = len(bul.group(1) if bul else num.group(1))
	            level = indent // 4  # four leading spaces per nesting level
	            txt = bul.group(2) if bul else num.group(3)
	            style = "List Bullet" if bul else "List Number"
	            p = add_paragraph_with_tokens(doc, txt, style_name=style, alignment=alignment)
	            if level:
	                p.paragraph_format.left_indent = Inches(0.25 * level)
	            continue

	        # ── normal paragraph ──
	        add_paragraph_with_tokens(
	            doc, line, style_name="Normal", alignment=alignment,
	            spacing_before=6, spacing_after=3,
	        )

	    # An unterminated fence must not swallow its content.
	    if in_fenced_code:
	        _emit_code_block(doc, code_buf)
	    _flush_table(doc, table_buf)

	    # ── save ──
	    buf = BytesIO()
	    doc.save(buf)
	    buf.seek(0)
	    return buf