Spaces:

ICA-PUC
/

beta-NORM

Sleeping

beta-NORM / utils /conversation_word_export.py

GitHub Actions

Sync from GitHub master

92145af 18 days ago

5.82 kB

	import re
	from io import BytesIO
	from typing import Any, Dict, List, Tuple

	from docx import Document
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	from docx.shared import Pt, RGBColor


	# ---------------------------------------------------------------------------
	# Markdown table helpers
	# ---------------------------------------------------------------------------

	_TABLE_ROW_RE = re.compile(r"^\\|(.+)\\|$")
	_SEPARATOR_RE = re.compile(r"^\\|[-:\| ]+\\|$")
	_INLINE_MD_RE = re.compile(r"\{1,2}([^]+)\*{1,2}\|`([^`]+)`")


	def _strip_inline_md(text: str) -> str:
	    """Return ``text`` with inline markdown wrappers (bold, italic, code) removed.

	    Every match of ``_INLINE_MD_RE`` is replaced by whichever of its two
	    capture groups is non-empty, so only the wrapped content is kept.
	    """

	    def _unwrap(match):
	        # Group 1 is tried first; group 2 is the fallback when it is empty/None.
	        first = match.group(1)
	        return first if first else match.group(2)

	    return _INLINE_MD_RE.sub(_unwrap, text)


	def _is_table_separator(line: str) -> bool:
	    """Tell whether ``line``, minus surrounding whitespace, matches ``_SEPARATOR_RE``."""
	    trimmed = line.strip()
	    return _SEPARATOR_RE.match(trimmed) is not None


	def _parse_table_rows(lines: List[str]) -> List[List[str]]:
	    """Convert markdown table lines into rows of plain-text cells.

	    Args:
	        lines: Raw lines of one markdown table, separator row included.

	    Returns:
	        One list of cell strings per table row, in input order. Separator
	        rows and lines that do not match ``_TABLE_ROW_RE`` are dropped. Each
	        cell is whitespace-trimmed and passed through ``_strip_inline_md``.
	    """
	    rows: List[List[str]] = []
	    for line in lines:
	        if _is_table_separator(line):
	            continue
	        m = _TABLE_ROW_RE.match(line.strip())
	        if m is None:
	            continue
	        # Cells are delimited by a bare pipe. Splitting on the two-character
	        # sequence backslash+pipe would leave the whole row as a single cell.
	        cells = [_strip_inline_md(c.strip()) for c in m.group(1).split("|")]
	        rows.append(cells)
	    return rows


	def _shade_cell(cell, hex_color: str) -> None:
	    """Give a table cell a background fill of ``hex_color``.

	    Builds a ``w:shd`` element with ``w:val="clear"``, ``w:color="auto"`` and
	    ``w:fill`` set to ``hex_color``, then appends it to the cell's ``tcPr``
	    properties element (obtained via ``get_or_add_tcPr``).
	    """
	    shading = OxmlElement("w:shd")
	    attributes = (
	        ("w:val", "clear"),
	        ("w:color", "auto"),
	        ("w:fill", hex_color),
	    )
	    for attr_name, attr_value in attributes:
	        shading.set(qn(attr_name), attr_value)
	    cell._tc.get_or_add_tcPr().append(shading)


	def _add_markdown_table(doc: Document, lines: List[str]) -> None:
	    """Append a Word table built from markdown table lines to ``doc``.

	    The lines are parsed with ``_parse_table_rows``; nothing is added when no
	    rows come back. The table uses the "Table Grid" style and is as wide as
	    the longest parsed row, with shorter rows padded by empty cells. Cells in
	    the first row are written bold, in white, on a blue (2E74B5) fill. An
	    empty paragraph is added after the table as spacing.
	    """
	    parsed = _parse_table_rows(lines)
	    if not parsed:
	        return

	    width = max(map(len, parsed))
	    table = doc.add_table(rows=len(parsed), cols=width)
	    table.style = "Table Grid"

	    for row_no, values in enumerate(parsed):
	        is_header = row_no == 0
	        word_row = table.rows[row_no]
	        # Pad short rows with empty strings so every column receives a run.
	        padded = values + [""] * (width - len(values))
	        for col_no, value in enumerate(padded):
	            cell = word_row.cells[col_no]
	            run = cell.paragraphs[0].add_run(value)
	            if not is_header:
	                continue
	            run.bold = True
	            run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
	            _shade_cell(cell, "2E74B5")  # blue header

	    doc.add_paragraph()  # spacing after table


	# ---------------------------------------------------------------------------
	# Content block splitter
	# ---------------------------------------------------------------------------

	def _split_into_blocks(content: str) -> List[Tuple[str, Any]]:
	"""
	Split markdown content into alternating ("text", str) and ("table", list[str])
	blocks so each can be rendered appropriately.
	"""
	blocks: List[Tuple[str, Any]] = []
	text_lines: List[str] = []
	table_lines: List[str] = []
	in_table = False

	for line in content.split("\n"):
	stripped = line.strip()
	is_table_line = (
	stripped.startswith("\|")
	and stripped.endswith("\|")
	and len(stripped) > 2
	)

	if is_table_line:
	if not in_table:
	if text_lines:
	blocks.append(("text", "\n".join(text_lines)))
	text_lines = []
	in_table = True
	table_lines.append(line)
	else:
	if in_table:
	blocks.append(("table", list(table_lines)))
	table_lines = []
	in_table = False
	text_lines.append(line)

	if in_table and table_lines:
	blocks.append(("table", table_lines))
	elif text_lines:
	blocks.append(("text", "\n".join(text_lines)))

	return blocks


	def _add_content(doc: Document, content: str) -> None:
	    """Write message content into ``doc``, rendering markdown tables as Word tables.

	    Empty content is ignored. Otherwise the content is split with
	    ``_split_into_blocks``: table blocks go to ``_add_markdown_table`` and
	    text blocks are added as a paragraph unless blank after stripping.
	    """
	    if content:
	        for kind, payload in _split_into_blocks(content):
	            if kind == "table":
	                _add_markdown_table(doc, payload)
	                continue
	            paragraph_text = payload.strip()
	            if paragraph_text:
	                doc.add_paragraph(paragraph_text)


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------

	def build_conversation_docx(messages: List[Dict[str, Any]]) -> bytes:
	    """Render a list of chat messages as a .docx file and return its bytes.

	    The document opens with a level-1 title. Each message whose ``content``
	    is non-empty after stripping gets a level-2 heading holding its ``role``,
	    followed by the content (tables converted by ``_add_content``). When the
	    message has non-empty ``references``, a "Referencias:" paragraph and the
	    references text follow, with ``<br>`` replaced by newlines. Messages with
	    no content are skipped entirely.
	    """
	    document = Document()
	    document.add_heading("Conversa Chatbot NORM ⚛", level=1)

	    for entry in messages:
	        speaker = str(entry.get("role") or "")
	        body = str(entry.get("content") or "").strip()
	        if body:
	            document.add_heading(speaker, level=2)
	            _add_content(document, body)

	            refs_text = str(entry.get("references") or "").strip()
	            if refs_text:
	                document.add_paragraph("Referencias:")
	                document.add_paragraph(refs_text.replace("<br>", "\n"))

	    output = BytesIO()
	    document.save(output)
	    output.seek(0)
	    return output.getvalue()


	def build_single_response_docx(message: Dict[str, Any]) -> bytes:
	    """Render one chatbot response as a .docx file and return its bytes.

	    The document opens with a level-1 title, followed by the message's
	    stripped ``content`` (tables converted by ``_add_content``). When the
	    message has non-empty ``references``, a "Referencias:" paragraph and the
	    references text follow, with ``<br>`` replaced by newlines.
	    """
	    document = Document()
	    document.add_heading("Ultima resposta do chatbot ⚛", level=1)

	    body = str(message.get("content") or "").strip()
	    _add_content(document, body)

	    refs_text = str(message.get("references") or "").strip()
	    if refs_text:
	        document.add_paragraph("Referencias:")
	        document.add_paragraph(refs_text.replace("<br>", "\n"))

	    output = BytesIO()
	    document.save(output)
	    output.seek(0)
	    return output.getvalue()