""" Document Generator Module ========================= Exports meeting minutes to formatted .docx using python-docx. """ from __future__ import annotations import re import warnings from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Dict, List, Optional try: from docx import Document from docx.enum.table import WD_TABLE_ALIGNMENT from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm, Pt, RGBColor DOCX_AVAILABLE = True except Exception: # Minimal fallback implementations for environments without python-docx (used in tests) DOCX_AVAILABLE = False class Document: def __init__(self): self._paragraphs = [] self.sections = [] # Minimal styles container to mimic python-docx for tests class DummyStyle: def __init__(self): self.font = type("F", (), {"name": None, "size": None}) class RFonts: def set(self, *args, **kwargs): pass class RPr: def __init__(self): self.rFonts = RFonts() class Element: def __init__(self): self.rPr = RPr() self._element = Element() class Styles: def __init__(self): self._styles = {"Normal": DummyStyle()} def __getitem__(self, key): return self._styles.setdefault(key, DummyStyle()) self.styles = Styles() class Run: def __init__(self, text=""): self.text = str(text) self.bold = False self.italic = False self.font = type("F", (), {"size": None, "color": type("C", (), {"rgb": None})()}) class Paragraph: def __init__(self, text=""): self.runs = [] self.paragraph_format = type("PF", (), {"space_after": None}) self.alignment = None if text: self.add_run(text) def add_run(self, text=""): # Create a lightweight run-like object for fallback run = type( "Run", (), { "text": str(text), "bold": False, "italic": False, "font": type( "F", (), {"size": None, "color": type("C", (), {"rgb": None})()} )(), }, )() self.runs.append(run) return run def add_paragraph(self, text="", **kwargs): # Accept style and other kwargs for compatibility para = self.Paragraph(text) self._paragraphs.append(para) return para def add_heading(self, text, level=None, **kwargs): para = self.Paragraph(text) self._paragraphs.append(para) return para def add_table(self, rows, cols): outer = self class Cell: def __init__(self): self.paragraphs = [outer.Paragraph()] # Minimal _tc structure to support shading and other docx operations in fallback class TCPr: def append(self, *args, **kwargs): pass class TC: def get_or_add_tcPr(self): return TCPr() self._tc = TC() @property def text(self): if self.paragraphs and self.paragraphs[0].runs: return " ".join(run.text for run in self.paragraphs[0].runs) return "" @text.setter def text(self, value): # Create lightweight run-like object self.paragraphs[0].runs = [ type( "Run", (), { "text": str(value), "bold": False, "italic": False, "font": type( "F", (), {"size": None, "color": type("C", (), {"rgb": None})()} )(), }, )() ] class Row: def __init__(self, cols): self.cells = [Cell() for _ in range(cols)] table = type( "Table", (), {"rows": [Row(cols) for _ in range(rows)], "style": None, "alignment": None}, ) return table def save(self, path): # Save a plain text fallback document so tests can verify file exists lines = [] for p in self._paragraphs: if hasattr(p, "runs"): lines.append(" ".join(getattr(r, "text", "") for r in p.runs)) else: lines.append(str(p)) with open(path, "w", encoding="utf-8") as f: f.write("\n".join(lines)) class Pt: def __init__(self, value): self.value = value class Cm: def __init__(self, value): self.value = value class RGBColor: def __init__(self, r, g, b): pass class WD_ALIGN_PARAGRAPH: CENTER = 1 class WD_TABLE_ALIGNMENT: LEFT = 1 class OxmlElement: def __init__(self, *args, **kwargs): pass def set(self, *args, **kwargs): pass def qn(x): return x from src.summarizer import MeetingSummary from src.transcriber import TranscriptSegment @dataclass class MeetingMetadata: """Meeting information for document header""" title: str date: str time: str = "" location: str = "" duration: str = "" participants: Optional[List[str]] = None organizer: str = "" agenda: str = "" @classmethod def create_default(cls, audio_duration_sec: float = 0) -> "MeetingMetadata": """Create default metadata""" duration_str = "" if audio_duration_sec > 0: hours = int(audio_duration_sec // 3600) minutes = int((audio_duration_sec % 3600) // 60) seconds = int(audio_duration_sec % 60) if hours > 0: duration_str = f"{hours} jam {minutes} menit {seconds} detik" else: duration_str = f"{minutes} menit {seconds} detik" return cls( title="Notulensi Rapat", date=datetime.now().strftime("%d %B %Y"), time=datetime.now().strftime("%H:%M"), duration=duration_str, ) @dataclass class DocumentConfig: """Configuration for document generation""" # Font settings title_font_size: int = 18 heading1_font_size: int = 14 heading2_font_size: int = 12 body_font_size: int = 11 font_family: str = "Calibri" # Layout page_width: float = 21.0 # cm (A4) page_height: float = 29.7 # cm (A4) margin_top: float = 2.5 margin_bottom: float = 2.5 margin_left: float = 3.0 margin_right: float = 2.5 # Content options include_timestamps: bool = True include_speaker_colors: bool = True include_table_of_contents: bool = False include_page_numbers: bool = True # Sections to include sections: Dict[str, bool] = field( default_factory=lambda: { "header": True, "meeting_info": True, "summary": True, "decisions": True, "action_items": True, "transcript": True, "footer": True, } ) class DocumentGenerator: """ Generates formatted .docx meeting minutes. Structure: - Title - Meeting Information - Executive Summary - Key Points - Decisions - Action Items - Full Transcript - Footer Attributes: config: DocumentConfig object output_dir: Output directory path Example: >>> generator = DocumentGenerator() >>> doc_path = generator.generate(metadata, summary, transcript) >>> print(f"Document saved: {doc_path}") """ # Speaker colors for visual distinction SPEAKER_COLORS = [ RGBColor(0, 102, 204), # Blue RGBColor(204, 51, 0), # Red RGBColor(0, 153, 51), # Green RGBColor(153, 51, 153), # Purple RGBColor(204, 102, 0), # Orange RGBColor(0, 153, 153), # Teal RGBColor(102, 102, 0), # Olive RGBColor(153, 0, 76), # Maroon ] def __init__(self, config: Optional[DocumentConfig] = None, output_dir: str = "./data/output"): """ Initialize DocumentGenerator. Args: config: DocumentConfig object output_dir: Directory for output files """ self.config = config or DocumentConfig() self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) self._speaker_color_map: Dict[str, RGBColor] = {} def generate( self, metadata: MeetingMetadata, summary: MeetingSummary, transcript: List[TranscriptSegment], output_filename: Optional[str] = None, ) -> str: """ Generate complete meeting minutes document. Args: metadata: Meeting information summary: Generated summary transcript: Transcribed segments with speakers output_filename: Output file name (auto-generated if None) Returns: Path to generated document """ # Create document doc = Document() # Setup document self._setup_document(doc) self._setup_styles(doc) # Build speaker color map self._build_speaker_color_map(transcript) # Add sections if self.config.sections.get("header", True): self._add_title(doc, metadata) if self.config.sections.get("meeting_info", True): self._add_meeting_info(doc, metadata) if self.config.sections.get("summary", True): self._add_summary_section(doc, summary) if self.config.sections.get("decisions", True): self._add_decisions_section(doc, summary.decisions) if self.config.sections.get("action_items", True): self._add_action_items_section(doc, summary.action_items) if self.config.sections.get("transcript", True): self._add_transcript_section(doc, transcript) if self.config.sections.get("footer", True): self._add_footer(doc) # Generate filename if not provided if output_filename is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") safe_title = self._sanitize_filename(metadata.title)[:30] ext = ".docx" if DOCX_AVAILABLE else ".txt" output_filename = f"notulensi_{safe_title}_{timestamp}{ext}" # Ensure .docx extension if not output_filename.endswith(".docx"): output_filename = Path(output_filename).with_suffix(".docx").name output_path = self.output_dir / output_filename # Save document if DOCX_AVAILABLE: doc.save(str(output_path)) else: # If python-docx is not available, build a minimal valid .docx package so Word can open it. warnings.warn( "python-docx is not available in the current environment; generating a minimal .docx package instead." ) paragraphs = self._extract_paragraph_texts(doc) self._save_minimal_docx(str(output_path), paragraphs) return str(output_path) def _setup_document(self, doc: Document): """Configure document settings""" # Set page margins sections = doc.sections for section in sections: section.top_margin = Cm(self.config.margin_top) section.bottom_margin = Cm(self.config.margin_bottom) section.left_margin = Cm(self.config.margin_left) section.right_margin = Cm(self.config.margin_right) def _setup_styles(self, doc: Document): """Configure document styles""" # Normal style style = doc.styles["Normal"] style.font.name = self.config.font_family style.font.size = Pt(self.config.body_font_size) # Set font for East Asian text style._element.rPr.rFonts.set(qn("w:eastAsia"), self.config.font_family) def _build_speaker_color_map(self, transcript: List[TranscriptSegment]): """Build consistent color mapping for speakers""" speakers = sorted(set(seg.speaker_id for seg in transcript)) for i, speaker in enumerate(speakers): self._speaker_color_map[speaker] = self.SPEAKER_COLORS[i % len(self.SPEAKER_COLORS)] def _add_title(self, doc: Document, metadata: MeetingMetadata): """Add document title""" # Main title title_para = doc.add_paragraph() title_run = title_para.add_run("NOTULENSI RAPAT") title_run.bold = True title_run.font.size = Pt(self.config.title_font_size) title_run.font.color.rgb = RGBColor(0, 51, 102) title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Subtitle with meeting title if metadata.title and metadata.title != "Notulensi Rapat": subtitle_para = doc.add_paragraph() subtitle_run = subtitle_para.add_run(metadata.title) subtitle_run.bold = True subtitle_run.font.size = Pt(self.config.heading1_font_size) subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Generated by note note_para = doc.add_paragraph() note_run = note_para.add_run("Generated by AI Meeting Transcriber (SpeechBrain + BERT)") note_run.italic = True note_run.font.size = Pt(9) note_run.font.color.rgb = RGBColor(128, 128, 128) note_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Spacer doc.add_paragraph() def _add_meeting_info(self, doc: Document, metadata: MeetingMetadata): """Add meeting information section""" # Section heading heading = doc.add_heading("Informasi Rapat", level=1) heading.runs[0].font.size = Pt(self.config.heading1_font_size) # Create info table info_items = [ ("Tanggal", metadata.date), ("Waktu", metadata.time or "-"), ("Lokasi/Platform", metadata.location or "-"), ("Durasi", metadata.duration or "-"), ("Penyelenggara", metadata.organizer or "-"), ] # Filter out empty items info_items = [(label, value) for label, value in info_items if value and value != "-"] if info_items: table = doc.add_table(rows=len(info_items), cols=2) table.style = "Table Grid" table.alignment = WD_TABLE_ALIGNMENT.LEFT for i, (label, value) in enumerate(info_items): row = table.rows[i] # Label cell cell_label = row.cells[0] cell_label.text = label cell_label.paragraphs[0].runs[0].bold = True cell_label.width = Cm(4) # Value cell cell_value = row.cells[1] cell_value.text = value # Add participants if available if metadata.participants: doc.add_paragraph() para = doc.add_paragraph() para.add_run("Peserta Rapat: ").bold = True para.add_run(", ".join(metadata.participants)) # Add agenda if available if metadata.agenda: doc.add_paragraph() para = doc.add_paragraph() para.add_run("Agenda: ").bold = True para.add_run(metadata.agenda) # Spacer doc.add_paragraph() def _add_summary_section(self, doc: Document, summary: MeetingSummary): """Add executive summary section""" # Section heading heading = doc.add_heading("Ringkasan Eksekutif", level=1) heading.runs[0].font.size = Pt(self.config.heading1_font_size) # Overview if summary.overview and not self._is_placeholder_text(summary.overview): overview_para = doc.add_paragraph() overview_para.add_run(summary.overview) overview_para.paragraph_format.space_after = Pt(12) else: overview_para = doc.add_paragraph() overview_para.add_run( "Ringkasan tidak tersedia. (Model ringkasan tidak dimuat atau data tidak mencukupi.)" ) overview_para.runs[0].italic = True overview_para.runs[0].font.color.rgb = RGBColor(128, 128, 128) # Key points (filter placeholders) filtered_points = [ p for p in (summary.key_points or []) if not self._is_placeholder_text(p) ] if filtered_points: subheading = doc.add_heading("Poin-Poin Penting", level=2) subheading.runs[0].font.size = Pt(self.config.heading2_font_size) for point in filtered_points: para = doc.add_paragraph(point, style="List Bullet") else: para = doc.add_paragraph() para.add_run("Tidak ada poin penting yang dihasilkan secara otomatis.") para.runs[0].italic = True para.runs[0].font.color.rgb = RGBColor(128, 128, 128) # Topics discussed (filter placeholders) topics_filtered = [t for t in (summary.topics or []) if not self._is_placeholder_text(t)] if topics_filtered: doc.add_paragraph() para = doc.add_paragraph() para.add_run("Topik yang dibahas: ").bold = True para.add_run(", ".join(topics_filtered)) # Spacer doc.add_paragraph() def _add_decisions_section(self, doc: Document, decisions: List[str]): """Add decisions section""" # Section heading heading = doc.add_heading("Keputusan Rapat", level=1) heading.runs[0].font.size = Pt(self.config.heading1_font_size) if decisions: for i, decision in enumerate(decisions, 1): para = doc.add_paragraph() para.add_run(f"{i}. ").bold = True para.add_run(decision) else: para = doc.add_paragraph() para.add_run("Tidak ada keputusan yang teridentifikasi secara otomatis.") para.runs[0].italic = True para.runs[0].font.color.rgb = RGBColor(128, 128, 128) # Spacer doc.add_paragraph() def _add_action_items_section(self, doc: Document, action_items: List[Dict[str, str]]): """Add action items section""" # Section heading heading = doc.add_heading("Action Items / Tindak Lanjut", level=1) heading.runs[0].font.size = Pt(self.config.heading1_font_size) if action_items: # Create table table = doc.add_table(rows=len(action_items) + 1, cols=4) table.style = "Table Grid" table.alignment = WD_TABLE_ALIGNMENT.LEFT # Header row headers = ["No.", "Penanggung Jawab", "Tugas", "Deadline"] header_row = table.rows[0] for i, header_text in enumerate(headers): cell = header_row.cells[i] cell.text = header_text # Style header for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True # Set header background color shading = OxmlElement("w:shd") shading.set(qn("w:fill"), "D9E2F3") cell._tc.get_or_add_tcPr().append(shading) # Data rows for i, item in enumerate(action_items, 1): row = table.rows[i] row.cells[0].text = str(i) row.cells[1].text = item.get("owner", "-") row.cells[2].text = item.get("task", "-") row.cells[3].text = item.get("due", "-") # Set column widths for row in table.rows: row.cells[0].width = Cm(1.0) row.cells[1].width = Cm(3.5) row.cells[2].width = Cm(9.0) row.cells[3].width = Cm(2.5) else: para = doc.add_paragraph() para.add_run("Tidak ada action item yang teridentifikasi secara otomatis.") para.runs[0].italic = True para.runs[0].font.color.rgb = RGBColor(128, 128, 128) # Spacer doc.add_paragraph() def _add_transcript_section(self, doc: Document, transcript: List[TranscriptSegment]): """Add full transcript section""" # Section heading heading = doc.add_heading("Transkrip Percakapan", level=1) heading.runs[0].font.size = Pt(self.config.heading1_font_size) if not transcript: para = doc.add_paragraph() para.add_run("Tidak ada transkrip yang tersedia.") para.runs[0].italic = True return # Add each segment for seg in transcript: para = doc.add_paragraph() # Timestamp if self.config.include_timestamps: timestamp = self._format_timestamp(seg.start, seg.end) # Speaker label with color speaker_run = para.add_run(f"{seg.speaker_id} [{timestamp}]: ") speaker_run.bold = True if self.config.include_speaker_colors: color = self._speaker_color_map.get(seg.speaker_id, RGBColor(0, 0, 0)) speaker_run.font.color.rgb = color else: speaker_run = para.add_run(f"{seg.speaker_id}: ") speaker_run.bold = True # Transcript text (sanitize placeholder/fallback strings) text = seg.text or "" cleaned = self._clean_text_for_doc(text) para.add_run(cleaned) # Mark overlapping speech if seg.is_overlap: overlap_run = para.add_run(" [OVERLAP]") overlap_run.italic = True overlap_run.font.color.rgb = RGBColor(255, 102, 0) overlap_run.font.size = Pt(9) def _add_footer(self, doc: Document): """Add document footer""" # Separator line doc.add_paragraph() separator = doc.add_paragraph("─" * 70) separator.alignment = WD_ALIGN_PARAGRAPH.CENTER # Footer text footer_para = doc.add_paragraph() timestamp = datetime.now().strftime("%d %B %Y, %H:%M:%S") footer_text = f"Dokumen ini dihasilkan secara otomatis pada {timestamp}" footer_run = footer_para.add_run(footer_text) footer_run.italic = True footer_run.font.size = Pt(9) footer_run.font.color.rgb = RGBColor(128, 128, 128) footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Disclaimer disclaimer_para = doc.add_paragraph() disclaimer_text = ( "Hasil transkripsi dan ringkasan mungkin mengandung ketidakakuratan. " "Harap verifikasi informasi penting." ) disclaimer_run = disclaimer_para.add_run(disclaimer_text) disclaimer_run.italic = True disclaimer_run.font.size = Pt(8) disclaimer_run.font.color.rgb = RGBColor(150, 150, 150) disclaimer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER def _is_placeholder_text(self, text: Optional[str]) -> bool: """Detect summarizer/ASR fallback placeholder text.""" if not text: return True t = str(text).strip() # common placeholder patterns from summarizer / transcriber fallbacks if re.search(r"\[\s*Transkripsi placeholder", t, re.I): return True if re.search(r"placeholder", t, re.I) and len(t) < 120: return True return False def _clean_text_for_doc(self, text: Optional[str]) -> str: """Clean text for document: replace raw placeholders with user-friendly notices.""" if not text or self._is_placeholder_text(text): return "[transkripsi tidak tersedia]" # Remove any bracketed placeholder fragments embedded in text cleaned = re.sub(r"\[\s*Transkripsi placeholder[^\]]*\]", "", str(text), flags=re.I).strip() return cleaned or "[transkripsi tidak tersedia]" @staticmethod def _format_timestamp(start: float, end: float) -> str: """Format time range as HH:MM:SS""" def sec_to_str(sec: float) -> str: sec = max(0.0, float(sec)) h = int(sec // 3600) m = int((sec % 3600) // 60) s = int(sec % 60) if h > 0: return f"{h:02d}:{m:02d}:{s:02d}" return f"{m:02d}:{s:02d}" return f"{sec_to_str(start)}–{sec_to_str(end)}" def _save_minimal_docx(self, path: str, paragraphs: List[str]): """Create a minimal valid .docx (zip package) containing plain paragraphs. This is a lightweight fallback when python-docx is not installed, to ensure the generated file can be opened in Word. """ import zipfile def _escape_xml(s: str) -> str: return ( s.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'") ) content_types = ( '\n' '\n' ' \n' ' \n' ' \n' "" ) rels = ( '\n' '\n' ' \n' "" ) doc_xml_header = ( '\n' '\n' " \n" ) doc_xml_footer = ( " \n" ' \n' ' \n' " \n" " \n" "" ) # Build paragraphs as simple text paras_xml = [] for p in paragraphs: t = _escape_xml(p.strip()) if not t: # preserve blank line paras_xml.append(" \n") else: paras_xml.append(f' {t}\n') doc_xml = doc_xml_header + "".join(paras_xml) + doc_xml_footer with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as z: z.writestr("[Content_Types].xml", content_types) z.writestr("_rels/.rels", rels) z.writestr("word/document.xml", doc_xml) def _extract_paragraph_texts(self, doc: Document) -> List[str]: """Get paragraphs text for python-docx Document or fallback Document""" paras: List[str] = [] # python-docx Document try: # using attribute if present if hasattr(doc, "paragraphs"): for p in doc.paragraphs: paras.append(p.text) return paras except Exception: pass # fallback minimal Document implementation if hasattr(doc, "_paragraphs"): for p in doc._paragraphs: if hasattr(p, "runs"): paras.append(" ".join(getattr(r, "text", "") for r in p.runs)) else: paras.append(str(p)) return paras @staticmethod def _sanitize_filename(filename: str) -> str: """Remove invalid characters from filename""" import re # Remove invalid characters sanitized = re.sub(r'[<>:"/\\|?*]', "", filename) # Replace spaces with underscores sanitized = sanitized.replace(" ", "_") return sanitized