beta-NORM / utils /conversation_word_export.py
GitHub Actions
Sync from GitHub master
92145af
import re
from io import BytesIO
from typing import Any, Dict, List, Tuple
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor
# ---------------------------------------------------------------------------
# Markdown table helpers
# ---------------------------------------------------------------------------
# Matches a single markdown table row: a line that starts and ends with "|".
# Group 1 captures everything between the two outer pipes.
_TABLE_ROW_RE = re.compile(r"^\|(.+)\|$")
# Matches a separator row: a pipe-delimited line whose interior consists only
# of "-", ":", "|" and spaces (e.g. "|---|:--:|"). Note that a row whose cells
# are all blank (e.g. "| |") also matches this pattern.
_SEPARATOR_RE = re.compile(r"^\|[-:| ]+\|$")
# Matches inline markup: text wrapped in one or two asterisks (inner text in
# group 1) or text wrapped in backticks (inner text in group 2).
_INLINE_MD_RE = re.compile(r"\*{1,2}([^*]+)\*{1,2}|`([^`]+)`")
def _strip_inline_md(text: str) -> str:
    """Return *text* with asterisk emphasis and backtick code markers dropped.

    Only the wrapping markers are removed; the wrapped text itself is kept.
    """

    def _inner_text(match):
        # Group 1 holds asterisk-wrapped text, group 2 backtick-wrapped text;
        # exactly one of them participates in any given match.
        emphasised = match.group(1)
        return emphasised if emphasised else match.group(2)

    return _INLINE_MD_RE.sub(_inner_text, text)
def _is_table_separator(line: str) -> bool:
    """Tell whether *line*, ignoring surrounding whitespace, is a separator row."""
    trimmed = line.strip()
    return _SEPARATOR_RE.match(trimmed) is not None
def _parse_table_rows(lines: List[str]) -> List[List[str]]:
    """Turn markdown table lines into rows of cleaned cell strings.

    Separator rows and lines that are not pipe-delimited rows are skipped.
    Each remaining row is split on "|" and every cell is stripped of
    surrounding whitespace and inline markdown markers.
    """
    parsed: List[List[str]] = []
    for raw in lines:
        if _is_table_separator(raw):
            continue
        row_match = _TABLE_ROW_RE.match(raw.strip())
        if row_match is None:
            continue
        inner = row_match.group(1)
        parsed.append(
            [_strip_inline_md(piece.strip()) for piece in inner.split("|")]
        )
    return parsed
def _shade_cell(cell, hex_color: str) -> None:
    """Give a table cell a solid background fill of *hex_color*.

    A new ``w:shd`` element (val="clear", color="auto", fill=*hex_color*) is
    appended to the cell's ``tcPr`` properties element, which is created if
    the cell does not have one yet.
    """
    shading = OxmlElement("w:shd")
    attributes = (
        ("w:val", "clear"),
        ("w:color", "auto"),
        ("w:fill", hex_color),
    )
    for attr_name, attr_value in attributes:
        shading.set(qn(attr_name), attr_value)

    cell_properties = cell._tc.get_or_add_tcPr()
    cell_properties.append(shading)
def _add_markdown_table(doc: Document, lines: List[str]) -> None:
    """Render a markdown table as a formatted Word table.

    Args:
        doc: Document the table is appended to.
        lines: Raw markdown lines making up one table (separator rows are
            ignored by the parser).

    The first parsed row is treated as the header: bold white text on a blue
    ("2E74B5") background. Rows with fewer cells than the widest row are
    padded with empty cells. An empty paragraph is added after the table for
    spacing. Nothing is added when no rows can be parsed from *lines*.
    """
    rows = _parse_table_rows(lines)
    if not rows:
        return

    max_cols = max(len(r) for r in rows)
    table = doc.add_table(rows=len(rows), cols=max_cols)
    table.style = "Table Grid"

    # Walk the Word rows once instead of indexing ``table.rows`` per row, and
    # read ``.cells`` once per row: python-docx rebuilds these sequences on
    # every access, which made the original nested loop needlessly slow.
    for r_idx, (row, tr) in enumerate(zip(rows, table.rows)):
        cells = tr.cells
        is_header = r_idx == 0
        for c_idx in range(max_cols):
            cell_text = row[c_idx] if c_idx < len(row) else ""
            cell = cells[c_idx]
            run = cell.paragraphs[0].add_run(cell_text)
            if is_header:
                run.bold = True
                run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
                _shade_cell(cell, "2E74B5")  # blue header

    doc.add_paragraph()  # spacing after table
# ---------------------------------------------------------------------------
# Content block splitter
# ---------------------------------------------------------------------------
def _split_into_blocks(content: str) -> List[Tuple[str, Any]]:
"""
Split markdown content into alternating ("text", str) and ("table", list[str])
blocks so each can be rendered appropriately.
"""
blocks: List[Tuple[str, Any]] = []
text_lines: List[str] = []
table_lines: List[str] = []
in_table = False
for line in content.split("\n"):
stripped = line.strip()
is_table_line = (
stripped.startswith("|")
and stripped.endswith("|")
and len(stripped) > 2
)
if is_table_line:
if not in_table:
if text_lines:
blocks.append(("text", "\n".join(text_lines)))
text_lines = []
in_table = True
table_lines.append(line)
else:
if in_table:
blocks.append(("table", list(table_lines)))
table_lines = []
in_table = False
text_lines.append(line)
if in_table and table_lines:
blocks.append(("table", table_lines))
elif text_lines:
blocks.append(("text", "\n".join(text_lines)))
return blocks
def _add_content(doc: Document, content: str) -> None:
    """Append message *content* to *doc*.

    Markdown tables are rendered as Word tables; every other run of text is
    added as a single paragraph after stripping surrounding whitespace.
    Empty content and whitespace-only text runs add nothing.
    """
    if not content:
        return

    for kind, payload in _split_into_blocks(content):
        if kind == "table":
            _add_markdown_table(doc, payload)
            continue
        paragraph_text = payload.strip()
        if paragraph_text:
            doc.add_paragraph(paragraph_text)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def build_conversation_docx(messages: List[Dict[str, Any]]) -> bytes:
    """Serialise a chat conversation to a .docx file and return its bytes.

    Each message with non-empty "content" gets a level-2 heading holding its
    "role", followed by the rendered content and, when the message has a
    non-empty "references" value, a references section. Messages whose
    content is empty or whitespace-only are skipped entirely.
    """
    document = Document()
    document.add_heading("Conversa Chatbot NORM ⚛", level=1)

    for entry in messages:
        speaker = str(entry.get("role") or "")
        body = str(entry.get("content") or "").strip()
        if not body:
            continue

        document.add_heading(speaker, level=2)
        _add_content(document, body)

        refs = str(entry.get("references") or "").strip()
        if refs:
            document.add_paragraph("Referencias:")
            # "<br>" markers in the references become line breaks.
            document.add_paragraph(refs.replace("<br>", "\n"))

    # getvalue() returns the whole buffer regardless of the stream position.
    stream = BytesIO()
    document.save(stream)
    return stream.getvalue()
def build_single_response_docx(message: Dict[str, Any]) -> bytes:
    """Serialise one assistant response to a .docx file and return its bytes.

    The document has a level-1 title, the rendered "content" of *message*
    and, when "references" is non-empty, a references section.
    """
    document = Document()
    document.add_heading("Ultima resposta do chatbot ⚛", level=1)

    body = str(message.get("content") or "").strip()
    _add_content(document, body)

    refs = str(message.get("references") or "").strip()
    if refs:
        document.add_paragraph("Referencias:")
        # "<br>" markers in the references become line breaks.
        document.add_paragraph(refs.replace("<br>", "\n"))

    # getvalue() returns the whole buffer regardless of the stream position.
    stream = BytesIO()
    document.save(stream)
    return stream.getvalue()