Spaces:
Sleeping
Sleeping
| """ | |
| Document Generator Module | |
| ========================= | |
| Exports meeting minutes to formatted .docx using python-docx. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import warnings | |
| from dataclasses import dataclass, field | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
try:
    from docx import Document
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    from docx.shared import Cm, Pt, RGBColor

    DOCX_AVAILABLE = True
except Exception:
    # Deliberately broad: any failure while importing python-docx (package
    # missing, broken native dependency, ...) switches the module to the
    # lightweight stand-ins below, which are enough for tests and for the
    # minimal-.docx fallback writer.
    DOCX_AVAILABLE = False

    class Document:
        """In-memory stand-in for ``docx.Document``.

        Only the small API surface used by this module is provided:
        ``sections``, ``styles``, ``add_paragraph``, ``add_heading``,
        ``add_table`` and ``save``. Paragraphs are collected in
        ``_paragraphs``; tables are returned to the caller but not recorded.
        """

        class Run:
            """Text run carrying the formatting attributes the generator sets."""

            def __init__(self, text=""):
                self.text = str(text)
                self.bold = False
                self.italic = False
                # Fresh instances per run so formatting is never shared.
                color = type("C", (), {"rgb": None})()
                self.font = type("F", (), {"size": None, "color": color})()

        class Paragraph:
            """Paragraph made of ``Run`` objects."""

            def __init__(self, text=""):
                self.runs = []
                self.paragraph_format = type("PF", (), {"space_after": None})()
                self.alignment = None
                if text:
                    self.add_run(text)

            def add_run(self, text=""):
                """Append a new run with ``text`` and return it."""
                run = Document.Run(text)
                self.runs.append(run)
                return run

        class _Style:
            """Style object exposing ``font`` and ``_element.rPr.rFonts.set``."""

            class _RFonts:
                def set(self, *args, **kwargs):
                    pass

            def __init__(self):
                self.font = type("F", (), {"name": None, "size": None})()
                r_pr = type("RPr", (), {"rFonts": self._RFonts()})()
                self._element = type("Element", (), {"rPr": r_pr})()

        class _Styles:
            """Mapping-like style container; unknown names are created on demand."""

            def __init__(self):
                self._styles = {"Normal": Document._Style()}

            def __getitem__(self, key):
                if key not in self._styles:
                    self._styles[key] = Document._Style()
                return self._styles[key]

        class _Cell:
            """Table cell with a ``text`` property backed by its first paragraph."""

            class _TcPr:
                def append(self, *args, **kwargs):
                    pass

            class _Tc:
                def get_or_add_tcPr(self):
                    return Document._Cell._TcPr()

            def __init__(self):
                self.paragraphs = [Document.Paragraph()]
                # Minimal ``_tc`` so header shading code runs unchanged.
                self._tc = self._Tc()

            @property
            def text(self):
                if self.paragraphs and self.paragraphs[0].runs:
                    return " ".join(run.text for run in self.paragraphs[0].runs)
                return ""

            @text.setter
            def text(self, value):
                # Replace the cell content with a single run, so callers can
                # style ``paragraphs[0].runs[0]`` right after assigning text.
                self.paragraphs[0].runs = [Document.Run(value)]

        class _Row:
            def __init__(self, cols):
                self.cells = [Document._Cell() for _ in range(cols)]

        class _Table:
            def __init__(self, rows, cols):
                self.rows = [Document._Row(cols) for _ in range(rows)]
                self.style = None
                self.alignment = None

        def __init__(self):
            self._paragraphs = []
            self.sections = []
            self.styles = Document._Styles()

        def add_paragraph(self, text="", **kwargs):
            """Append a paragraph; ``style`` and other kwargs are accepted and ignored."""
            para = self.Paragraph(text)
            self._paragraphs.append(para)
            return para

        def add_heading(self, text, level=None, **kwargs):
            """Append a heading, represented as a plain paragraph."""
            para = self.Paragraph(text)
            self._paragraphs.append(para)
            return para

        def add_table(self, rows, cols, style=None):
            """Return a new table instance with ``rows`` x ``cols`` empty cells."""
            table = self._Table(rows, cols)
            table.style = style
            return table

        def save(self, path):
            """Write the paragraphs as plain UTF-8 text, one per line."""
            lines = []
            for p in self._paragraphs:
                if hasattr(p, "runs"):
                    lines.append(" ".join(getattr(r, "text", "") for r in p.runs))
                else:
                    lines.append(str(p))
            with open(path, "w", encoding="utf-8") as f:
                f.write("\n".join(lines))

    class Pt:
        """Stand-in for ``docx.shared.Pt``; just stores the value."""

        def __init__(self, value):
            self.value = value

    class Cm:
        """Stand-in for ``docx.shared.Cm``; just stores the value."""

        def __init__(self, value):
            self.value = value

    class RGBColor:
        """Stand-in for ``docx.shared.RGBColor``; keeps the three components."""

        def __init__(self, r, g, b):
            self.rgb = (r, g, b)

    class WD_ALIGN_PARAGRAPH:
        CENTER = 1

    class WD_TABLE_ALIGNMENT:
        LEFT = 1

    class OxmlElement:
        """Stand-in for ``docx.oxml.OxmlElement``; accepts and ignores everything."""

        def __init__(self, *args, **kwargs):
            pass

        def set(self, *args, **kwargs):
            pass

    def qn(x):
        """Stand-in for ``docx.oxml.ns.qn``; returns the tag unchanged."""
        return x
| from src.summarizer import MeetingSummary | |
| from src.transcriber import TranscriptSegment | |
@dataclass
class MeetingMetadata:
    """Meeting information shown in the document header.

    Only ``title`` and ``date`` are required; every other field defaults to
    an empty value and is omitted from the document when empty.
    """

    title: str
    date: str
    time: str = ""
    location: str = ""
    duration: str = ""
    participants: Optional[List[str]] = None
    organizer: str = ""
    agenda: str = ""

    @classmethod
    def create_default(cls, audio_duration_sec: float = 0) -> "MeetingMetadata":
        """Create metadata with a default title and the current date/time.

        Args:
            audio_duration_sec: Length of the recording in seconds. When
                positive it is rendered as a human-readable duration string;
                otherwise ``duration`` is left empty.

        Returns:
            A new ``MeetingMetadata`` instance.
        """
        duration_str = ""
        if audio_duration_sec > 0:
            total_seconds = int(audio_duration_sec)
            hours, remainder = divmod(total_seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            if hours > 0:
                duration_str = f"{hours} jam {minutes} menit {seconds} detik"
            else:
                duration_str = f"{minutes} menit {seconds} detik"

        # Sample the clock once so date and time cannot straddle midnight.
        now = datetime.now()
        return cls(
            title="Notulensi Rapat",
            date=now.strftime("%d %B %Y"),
            time=now.strftime("%H:%M"),
            duration=duration_str,
        )
@dataclass
class DocumentConfig:
    """Configuration for document generation."""

    # Font settings (sizes in points)
    title_font_size: int = 18
    heading1_font_size: int = 14
    heading2_font_size: int = 12
    body_font_size: int = 11
    font_family: str = "Calibri"

    # Layout (all values in centimetres; page size defaults to A4)
    page_width: float = 21.0
    page_height: float = 29.7
    margin_top: float = 2.5
    margin_bottom: float = 2.5
    margin_left: float = 3.0
    margin_right: float = 2.5

    # Content options
    include_timestamps: bool = True
    include_speaker_colors: bool = True
    include_table_of_contents: bool = False
    include_page_numbers: bool = True

    # Sections to include, keyed by section name. Built per instance via
    # default_factory so configs never share one mutable dict.
    sections: Dict[str, bool] = field(
        default_factory=lambda: {
            "header": True,
            "meeting_info": True,
            "summary": True,
            "decisions": True,
            "action_items": True,
            "transcript": True,
            "footer": True,
        }
    )
class DocumentGenerator:
    """
    Generates formatted .docx meeting minutes.

    Structure (each part can be switched off through ``config.sections``):
        - Title
        - Meeting Information
        - Executive Summary
        - Key Points
        - Decisions
        - Action Items
        - Full Transcript
        - Footer

    Attributes:
        config: DocumentConfig object
        output_dir: Output directory path

    Example:
        >>> generator = DocumentGenerator()
        >>> doc_path = generator.generate(metadata, summary, transcript)
        >>> print(f"Document saved: {doc_path}")
    """

    # Speaker colors for visual distinction (cycled when speakers outnumber colors)
    SPEAKER_COLORS = [
        RGBColor(0, 102, 204),  # Blue
        RGBColor(204, 51, 0),  # Red
        RGBColor(0, 153, 51),  # Green
        RGBColor(153, 51, 153),  # Purple
        RGBColor(204, 102, 0),  # Orange
        RGBColor(0, 153, 153),  # Teal
        RGBColor(102, 102, 0),  # Olive
        RGBColor(153, 0, 76),  # Maroon
    ]

    def __init__(self, config: Optional[DocumentConfig] = None, output_dir: str = "./data/output"):
        """
        Initialize DocumentGenerator.

        Args:
            config: DocumentConfig object (a default one is created when None)
            output_dir: Directory for output files; created if missing
        """
        self.config = config or DocumentConfig()
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._speaker_color_map: Dict[str, RGBColor] = {}

    def generate(
        self,
        metadata: MeetingMetadata,
        summary: MeetingSummary,
        transcript: List[TranscriptSegment],
        output_filename: Optional[str] = None,
    ) -> str:
        """
        Generate complete meeting minutes document.

        Args:
            metadata: Meeting information
            summary: Generated summary
            transcript: Transcribed segments with speakers
            output_filename: Output file name (auto-generated if None or empty)

        Returns:
            Path to generated document
        """
        transcript = transcript or []

        doc = Document()
        self._setup_document(doc)
        self._setup_styles(doc)
        self._build_speaker_color_map(transcript)

        enabled = self.config.sections
        if enabled.get("header", True):
            self._add_title(doc, metadata)
        if enabled.get("meeting_info", True):
            self._add_meeting_info(doc, metadata)
        if enabled.get("summary", True):
            self._add_summary_section(doc, summary)
        if enabled.get("decisions", True):
            self._add_decisions_section(doc, summary.decisions)
        if enabled.get("action_items", True):
            self._add_action_items_section(doc, summary.action_items)
        if enabled.get("transcript", True):
            self._add_transcript_section(doc, transcript)
        if enabled.get("footer", True):
            self._add_footer(doc)

        # The output is always a .docx package: either written by python-docx
        # or by the minimal fallback writer below.
        if not output_filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_title = self._sanitize_filename(metadata.title)[:30]
            output_filename = f"notulensi_{safe_title}_{timestamp}.docx"
        if not output_filename.endswith(".docx"):
            output_filename = Path(output_filename).with_suffix(".docx").name
        output_path = self.output_dir / output_filename

        if DOCX_AVAILABLE:
            doc.save(str(output_path))
        else:
            # If python-docx is not available, build a minimal valid .docx package so Word can open it.
            warnings.warn(
                "python-docx is not available in the current environment; generating a minimal .docx package instead."
            )
            paragraphs = self._extract_paragraph_texts(doc)
            self._save_minimal_docx(str(output_path), paragraphs)
        return str(output_path)

    def _setup_document(self, doc: Document):
        """Apply the configured page size and margins to every section."""
        for section in doc.sections:
            section.page_width = Cm(self.config.page_width)
            section.page_height = Cm(self.config.page_height)
            section.top_margin = Cm(self.config.margin_top)
            section.bottom_margin = Cm(self.config.margin_bottom)
            section.left_margin = Cm(self.config.margin_left)
            section.right_margin = Cm(self.config.margin_right)

    def _setup_styles(self, doc: Document):
        """Configure the "Normal" style with the configured font family and size."""
        style = doc.styles["Normal"]
        style.font.name = self.config.font_family
        style.font.size = Pt(self.config.body_font_size)
        # Set font for East Asian text as well
        style._element.rPr.rFonts.set(qn("w:eastAsia"), self.config.font_family)

    def _build_speaker_color_map(self, transcript: List[TranscriptSegment]):
        """Rebuild the speaker -> color mapping for the given transcript.

        Speakers are sorted first so the same set of speakers always gets
        the same colors.
        """
        palette = self.SPEAKER_COLORS
        speakers = sorted({seg.speaker_id for seg in transcript})
        self._speaker_color_map = {
            speaker: palette[i % len(palette)] for i, speaker in enumerate(speakers)
        }

    def _add_section_heading(self, doc: Document, text: str, level: int = 1):
        """Add a heading sized per config (heading1 for level 1, heading2 otherwise)."""
        heading = doc.add_heading(text, level=level)
        if level == 1:
            size = self.config.heading1_font_size
        else:
            size = self.config.heading2_font_size
        heading.runs[0].font.size = Pt(size)
        return heading

    @staticmethod
    def _add_notice(doc: Document, text: str, muted: bool = True):
        """Add an italic single-run paragraph, optionally colored gray."""
        para = doc.add_paragraph()
        run = para.add_run(text)
        run.italic = True
        if muted:
            run.font.color.rgb = RGBColor(128, 128, 128)
        return para

    def _add_title(self, doc: Document, metadata: MeetingMetadata):
        """Add the centered title block: main title, optional subtitle, generator note."""
        title_para = doc.add_paragraph()
        title_run = title_para.add_run("NOTULENSI RAPAT")
        title_run.bold = True
        title_run.font.size = Pt(self.config.title_font_size)
        title_run.font.color.rgb = RGBColor(0, 51, 102)
        title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Subtitle only when the meeting has its own (non-default) title
        if metadata.title and metadata.title != "Notulensi Rapat":
            subtitle_para = doc.add_paragraph()
            subtitle_run = subtitle_para.add_run(metadata.title)
            subtitle_run.bold = True
            subtitle_run.font.size = Pt(self.config.heading1_font_size)
            subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        note_para = doc.add_paragraph()
        note_run = note_para.add_run("Generated by AI Meeting Transcriber (SpeechBrain + BERT)")
        note_run.italic = True
        note_run.font.size = Pt(9)
        note_run.font.color.rgb = RGBColor(128, 128, 128)
        note_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Spacer
        doc.add_paragraph()

    def _add_meeting_info(self, doc: Document, metadata: MeetingMetadata):
        """Add the meeting information table plus participants and agenda lines."""
        self._add_section_heading(doc, "Informasi Rapat")

        candidates = [
            ("Tanggal", metadata.date),
            ("Waktu", metadata.time),
            ("Lokasi/Platform", metadata.location),
            ("Durasi", metadata.duration),
            ("Penyelenggara", metadata.organizer),
        ]
        # Rows without a value (empty or a literal "-") are left out entirely
        info_items = [(label, value) for label, value in candidates if value and value != "-"]

        if info_items:
            table = doc.add_table(rows=len(info_items), cols=2)
            table.style = "Table Grid"
            table.alignment = WD_TABLE_ALIGNMENT.LEFT
            for row, (label, value) in zip(table.rows, info_items):
                cell_label, cell_value = row.cells[0], row.cells[1]
                cell_label.text = label
                cell_label.paragraphs[0].runs[0].bold = True
                cell_label.width = Cm(4)
                cell_value.text = str(value)

        if metadata.participants:
            doc.add_paragraph()
            para = doc.add_paragraph()
            para.add_run("Peserta Rapat: ").bold = True
            para.add_run(", ".join(str(name) for name in metadata.participants))

        if metadata.agenda:
            doc.add_paragraph()
            para = doc.add_paragraph()
            para.add_run("Agenda: ").bold = True
            para.add_run(metadata.agenda)

        # Spacer
        doc.add_paragraph()

    def _add_summary_section(self, doc: Document, summary: MeetingSummary):
        """Add the executive summary: overview, key points and topics.

        Placeholder strings produced by summarizer fallbacks are replaced by
        an explanatory notice (overview, key points) or dropped (topics).
        """
        self._add_section_heading(doc, "Ringkasan Eksekutif")

        if summary.overview and not self._is_placeholder_text(summary.overview):
            overview_para = doc.add_paragraph()
            overview_para.add_run(summary.overview)
            overview_para.paragraph_format.space_after = Pt(12)
        else:
            self._add_notice(
                doc,
                "Ringkasan tidak tersedia. (Model ringkasan tidak dimuat atau data tidak mencukupi.)",
            )

        key_points = [p for p in (summary.key_points or []) if not self._is_placeholder_text(p)]
        if key_points:
            self._add_section_heading(doc, "Poin-Poin Penting", level=2)
            for point in key_points:
                doc.add_paragraph(point, style="List Bullet")
        else:
            self._add_notice(doc, "Tidak ada poin penting yang dihasilkan secara otomatis.")

        topics = [t for t in (summary.topics or []) if not self._is_placeholder_text(t)]
        if topics:
            doc.add_paragraph()
            para = doc.add_paragraph()
            para.add_run("Topik yang dibahas: ").bold = True
            para.add_run(", ".join(topics))

        # Spacer
        doc.add_paragraph()

    def _add_decisions_section(self, doc: Document, decisions: List[str]):
        """Add the numbered list of decisions, or a notice when there are none."""
        self._add_section_heading(doc, "Keputusan Rapat")

        if decisions:
            for number, decision in enumerate(decisions, 1):
                para = doc.add_paragraph()
                para.add_run(f"{number}. ").bold = True
                para.add_run(decision)
        else:
            self._add_notice(doc, "Tidak ada keputusan yang teridentifikasi secara otomatis.")

        # Spacer
        doc.add_paragraph()

    def _add_action_items_section(self, doc: Document, action_items: List[Dict[str, str]]):
        """Add the action-item table (No. / owner / task / deadline).

        Each item is read through the keys ``owner``, ``task`` and ``due``;
        missing or empty values are shown as "-".
        """
        self._add_section_heading(doc, "Action Items / Tindak Lanjut")

        if action_items:
            table = doc.add_table(rows=len(action_items) + 1, cols=4)
            table.style = "Table Grid"
            table.alignment = WD_TABLE_ALIGNMENT.LEFT

            # Header row: bold text on a light blue background
            headers = ["No.", "Penanggung Jawab", "Tugas", "Deadline"]
            for cell, header_text in zip(table.rows[0].cells, headers):
                cell.text = header_text
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True
                shading = OxmlElement("w:shd")
                shading.set(qn("w:fill"), "D9E2F3")
                cell._tc.get_or_add_tcPr().append(shading)

            # Data rows; values are coerced to str because cell text must be a string
            for number, item in enumerate(action_items, 1):
                cells = table.rows[number].cells
                cells[0].text = str(number)
                cells[1].text = str(item.get("owner") or "-")
                cells[2].text = str(item.get("task") or "-")
                cells[3].text = str(item.get("due") or "-")

            # Column widths are set cell by cell on every row
            widths = (Cm(1.0), Cm(3.5), Cm(9.0), Cm(2.5))
            for row in table.rows:
                for cell, width in zip(row.cells, widths):
                    cell.width = width
        else:
            self._add_notice(doc, "Tidak ada action item yang teridentifikasi secara otomatis.")

        # Spacer
        doc.add_paragraph()

    def _add_transcript_section(self, doc: Document, transcript: List[TranscriptSegment]):
        """Add the full transcript, one paragraph per segment.

        Each paragraph starts with a bold speaker label (with the time range
        when ``include_timestamps`` is on, colored when
        ``include_speaker_colors`` is on), followed by the cleaned text and
        an ``[OVERLAP]`` marker for overlapping speech.
        """
        self._add_section_heading(doc, "Transkrip Percakapan")

        if not transcript:
            self._add_notice(doc, "Tidak ada transkrip yang tersedia.", muted=False)
            return

        for seg in transcript:
            para = doc.add_paragraph()

            if self.config.include_timestamps:
                timestamp = self._format_timestamp(seg.start, seg.end)
                label = f"{seg.speaker_id} [{timestamp}]: "
            else:
                label = f"{seg.speaker_id}: "
            speaker_run = para.add_run(label)
            speaker_run.bold = True
            # Coloring is independent of whether timestamps are shown
            if self.config.include_speaker_colors:
                color = self._speaker_color_map.get(seg.speaker_id, RGBColor(0, 0, 0))
                speaker_run.font.color.rgb = color

            # Transcript text (sanitize placeholder/fallback strings)
            para.add_run(self._clean_text_for_doc(seg.text or ""))

            if seg.is_overlap:
                overlap_run = para.add_run(" [OVERLAP]")
                overlap_run.italic = True
                overlap_run.font.color.rgb = RGBColor(255, 102, 0)
                overlap_run.font.size = Pt(9)

    def _add_footer(self, doc: Document):
        """Add a separator line, the generation timestamp and a disclaimer."""
        doc.add_paragraph()
        separator = doc.add_paragraph("─" * 70)
        separator.alignment = WD_ALIGN_PARAGRAPH.CENTER

        footer_para = doc.add_paragraph()
        timestamp = datetime.now().strftime("%d %B %Y, %H:%M:%S")
        footer_text = f"Dokumen ini dihasilkan secara otomatis pada {timestamp}"
        footer_run = footer_para.add_run(footer_text)
        footer_run.italic = True
        footer_run.font.size = Pt(9)
        footer_run.font.color.rgb = RGBColor(128, 128, 128)
        footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        disclaimer_para = doc.add_paragraph()
        disclaimer_text = (
            "Hasil transkripsi dan ringkasan mungkin mengandung ketidakakuratan. "
            "Harap verifikasi informasi penting."
        )
        disclaimer_run = disclaimer_para.add_run(disclaimer_text)
        disclaimer_run.italic = True
        disclaimer_run.font.size = Pt(8)
        disclaimer_run.font.color.rgb = RGBColor(150, 150, 150)
        disclaimer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def _is_placeholder_text(self, text: Optional[str]) -> bool:
        """Detect summarizer/ASR fallback placeholder text.

        Returns True for empty input, for text containing a
        "[Transkripsi placeholder" marker, and for short text (< 120 chars
        after stripping) that mentions "placeholder" anywhere.
        """
        if not text:
            return True
        t = str(text).strip()
        if re.search(r"\[\s*Transkripsi placeholder", t, re.I):
            return True
        if re.search(r"placeholder", t, re.I) and len(t) < 120:
            return True
        return False

    def _clean_text_for_doc(self, text: Optional[str]) -> str:
        """Clean text for the document: replace raw placeholders with a user-friendly notice."""
        if not text or self._is_placeholder_text(text):
            return "[transkripsi tidak tersedia]"
        # Remove any bracketed placeholder fragments embedded in text
        cleaned = re.sub(r"\[\s*Transkripsi placeholder[^\]]*\]", "", str(text), flags=re.I).strip()
        return cleaned or "[transkripsi tidak tersedia]"

    @staticmethod
    def _format_timestamp(start: float, end: float) -> str:
        """Format a time range as ``MM:SS–MM:SS`` (``HH:MM:SS`` once past an hour).

        Negative values are clamped to zero; fractions of a second are dropped.
        """

        def sec_to_str(sec: float) -> str:
            total = int(max(0.0, float(sec)))
            hours, remainder = divmod(total, 3600)
            minutes, seconds = divmod(remainder, 60)
            if hours > 0:
                return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
            return f"{minutes:02d}:{seconds:02d}"

        return f"{sec_to_str(start)}–{sec_to_str(end)}"

    def _save_minimal_docx(self, path: str, paragraphs: List[str]):
        """Create a minimal valid .docx (zip package) containing plain paragraphs.

        This is a lightweight fallback when python-docx is not installed, to ensure
        the generated file can be opened in Word. All formatting is lost; each
        entry of ``paragraphs`` becomes one unformatted paragraph and empty
        entries become blank paragraphs.

        Args:
            path: Destination file path
            paragraphs: Paragraph texts in document order
        """
        import zipfile
        from xml.sax.saxutils import escape

        def _escape_xml(s: str) -> str:
            # Control characters other than tab/LF/CR are not allowed in XML 1.0
            # and would make the package unreadable, so drop them before escaping
            # the markup characters (&, <, >).
            return escape(re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", s))

        content_types = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">\n'
            ' <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>\n'
            ' <Default Extension="xml" ContentType="application/xml"/>\n'
            ' <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>\n'
            "</Types>"
        )
        rels = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">\n'
            ' <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>\n'
            "</Relationships>"
        )
        doc_xml_header = (
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
            '<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" '
            'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" '
            'xmlns:o="urn:schemas-microsoft-com:office:office" '
            'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" '
            'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" '
            'xmlns:v="urn:schemas-microsoft-com:vml" '
            'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" '
            'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" '
            'xmlns:w10="urn:schemas-microsoft-com:office:word" '
            'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" '
            'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" '
            'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" '
            'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" '
            'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" '
            'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">\n'
            " <w:body>\n"
        )
        doc_xml_footer = (
            " <w:sectPr>\n"
            ' <w:pgSz w:w="11900" w:h="16840"/>\n'
            ' <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0"/>\n'
            " </w:sectPr>\n"
            " </w:body>\n"
            "</w:document>"
        )

        # Build paragraphs as simple <w:p><w:r><w:t>text</w:t></w:r></w:p>
        paras_xml = []
        for p in paragraphs:
            t = _escape_xml(str(p).strip())
            if t:
                paras_xml.append(f' <w:p><w:r><w:t xml:space="preserve">{t}</w:t></w:r></w:p>\n')
            else:
                # preserve blank line
                paras_xml.append(" <w:p/>\n")
        doc_xml = doc_xml_header + "".join(paras_xml) + doc_xml_footer

        with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as z:
            z.writestr("[Content_Types].xml", content_types)
            z.writestr("_rels/.rels", rels)
            z.writestr("word/document.xml", doc_xml)

    def _extract_paragraph_texts(self, doc: Document) -> List[str]:
        """Get paragraph texts from a python-docx Document or the fallback Document.

        A python-docx document is read through ``doc.paragraphs``; the
        fallback document through ``doc._paragraphs``, joining the run texts
        of each paragraph with a space.
        """
        native = getattr(doc, "paragraphs", None)
        if native is not None:
            try:
                return [p.text for p in native]
            except (AttributeError, TypeError):
                # Not a python-docx style paragraph list; try the fallback layout
                pass

        texts: List[str] = []
        for p in getattr(doc, "_paragraphs", []):
            if hasattr(p, "runs"):
                texts.append(" ".join(getattr(r, "text", "") for r in p.runs))
            else:
                texts.append(str(p))
        return texts

    @staticmethod
    def _sanitize_filename(filename: str) -> str:
        """Remove characters that are invalid in file names and replace spaces with underscores."""
        sanitized = re.sub(r'[<>:"/\\|?*]', "", str(filename))
        return sanitized.replace(" ", "_")