Spaces:

Yermia
/

Notulen_Otomatis

Sleeping

App Files Files Community

Notulen_Otomatis / src /document_generator.py

Yermia

Upload 13 files

fda93d9 verified 18 days ago

raw

history blame contribute delete

31.7 kB

	"""
	Document Generator Module
	=========================
	Exports meeting minutes to formatted .docx using python-docx.
	"""

	from __future__ import annotations

	import re
	import warnings
	from dataclasses import dataclass, field
	from datetime import datetime
	from pathlib import Path
	from typing import Dict, List, Optional

	try:
	    from docx import Document
	    from docx.enum.table import WD_TABLE_ALIGNMENT
	    from docx.enum.text import WD_ALIGN_PARAGRAPH
	    from docx.oxml import OxmlElement
	    from docx.oxml.ns import qn
	    from docx.shared import Cm, Pt, RGBColor

	    DOCX_AVAILABLE = True
	except ImportError:
	    # python-docx is not installed: provide minimal stand-ins that accept the
	    # same calls the generator makes (used in tests and bare environments).
	    from types import SimpleNamespace

	    DOCX_AVAILABLE = False

	    class _FallbackNoOp:
	        """Accepts and ignores XML-level calls (font tables, cell shading)."""

	        def set(self, *args, **kwargs):
	            pass

	        def append(self, *args, **kwargs):
	            pass

	    class _FallbackStyle:
	        """Style object exposing ``font`` and ``_element.rPr.rFonts``."""

	        def __init__(self):
	            self.font = SimpleNamespace(name=None, size=None)
	            self._element = SimpleNamespace(rPr=SimpleNamespace(rFonts=_FallbackNoOp()))

	    class _FallbackStyles:
	        """Mapping-like style container; unknown style names are created on demand."""

	        def __init__(self):
	            self._styles = {"Normal": _FallbackStyle()}

	        def __getitem__(self, key):
	            return self._styles.setdefault(key, _FallbackStyle())

	    class _FallbackCell:
	        """Table cell holding one paragraph, with a python-docx-like ``text`` property."""

	        def __init__(self):
	            self.paragraphs = [Document.Paragraph()]
	            # Minimal ``_tc`` so ``cell._tc.get_or_add_tcPr().append(...)`` works.
	            self._tc = SimpleNamespace(get_or_add_tcPr=_FallbackNoOp)

	        @property
	        def text(self):
	            # Runs are contiguous text, so concatenate without a separator
	            # (this matches how python-docx reports paragraph text).
	            return "".join(run.text for run in self.paragraphs[0].runs)

	        @text.setter
	        def text(self, value):
	            self.paragraphs[0].runs = [Document.Run(value)]

	    class Document:
	        """In-memory stand-in for ``docx.Document`` covering only what this module uses."""

	        class Run:
	            """Text run carrying the formatting attributes the generator assigns."""

	            def __init__(self, text=""):
	                self.text = str(text)
	                self.bold = False
	                self.italic = False
	                # Fresh font/color objects per run so formatting never leaks between runs.
	                self.font = SimpleNamespace(size=None, color=SimpleNamespace(rgb=None))

	        class Paragraph:
	            """Paragraph made of runs; an initial run is created when text is given."""

	            def __init__(self, text=""):
	                self.runs = []
	                self.paragraph_format = SimpleNamespace(space_after=None)
	                self.alignment = None
	                if text:
	                    self.add_run(text)

	            def add_run(self, text=""):
	                run = Document.Run(text)
	                self.runs.append(run)
	                return run

	        def __init__(self):
	            self._paragraphs = []
	            # No sections: page-setup loops in the generator become no-ops.
	            self.sections = []
	            self.styles = _FallbackStyles()

	        def add_paragraph(self, text="", **kwargs):
	            # ``style`` and other keyword arguments are accepted for compatibility and ignored.
	            para = self.Paragraph(text)
	            self._paragraphs.append(para)
	            return para

	        def add_heading(self, text, level=None, **kwargs):
	            # Headings are stored as ordinary paragraphs; ``level`` is ignored.
	            para = self.Paragraph(text)
	            self._paragraphs.append(para)
	            return para

	        def add_table(self, rows, cols):
	            # Tables are returned to the caller but not recorded in ``_paragraphs``.
	            return SimpleNamespace(
	                rows=[
	                    SimpleNamespace(cells=[_FallbackCell() for _ in range(cols)])
	                    for _ in range(rows)
	                ],
	                style=None,
	                alignment=None,
	            )

	        def save(self, path):
	            # Save a plain-text rendering so tests can verify that a file exists.
	            lines = []
	            for p in self._paragraphs:
	                if hasattr(p, "runs"):
	                    lines.append("".join(getattr(r, "text", "") for r in p.runs))
	                else:
	                    lines.append(str(p))
	            with open(path, "w", encoding="utf-8") as f:
	                f.write("\n".join(lines))

	    class Pt:
	        """Point measurement placeholder; only stores the value."""

	        def __init__(self, value):
	            self.value = value

	    class Cm:
	        """Centimetre measurement placeholder; only stores the value."""

	        def __init__(self, value):
	            self.value = value

	    class RGBColor:
	        """Color placeholder; the components are accepted and discarded."""

	        def __init__(self, r, g, b):
	            pass

	    class WD_ALIGN_PARAGRAPH:
	        CENTER = 1

	    class WD_TABLE_ALIGNMENT:
	        LEFT = 1

	    class OxmlElement:
	        """XML element placeholder; construction and ``set`` are no-ops."""

	        def __init__(self, *args, **kwargs):
	            pass

	        def set(self, *args, **kwargs):
	            pass

	    def qn(x):
	        """Return the qualified-name string unchanged."""
	        return x


	from src.summarizer import MeetingSummary
	from src.transcriber import TranscriptSegment


	@dataclass
	class MeetingMetadata:
	    """Meeting information shown in the document header and info table."""

	    title: str
	    date: str
	    time: str = ""
	    location: str = ""
	    duration: str = ""
	    participants: Optional[List[str]] = None
	    organizer: str = ""
	    agenda: str = ""

	    @classmethod
	    def create_default(cls, audio_duration_sec: float = 0) -> "MeetingMetadata":
	        """Create metadata with a default title and the current date and time.

	        Args:
	            audio_duration_sec: Audio length in seconds. Values <= 0 leave
	                ``duration`` empty.

	        Returns:
	            A ``MeetingMetadata`` whose ``duration`` is formatted as
	            "H jam M menit S detik" (the hours part is omitted when zero).
	        """
	        duration_str = ""
	        if audio_duration_sec > 0:
	            hours = int(audio_duration_sec // 3600)
	            minutes = int((audio_duration_sec % 3600) // 60)
	            seconds = int(audio_duration_sec % 60)

	            if hours > 0:
	                duration_str = f"{hours} jam {minutes} menit {seconds} detik"
	            else:
	                duration_str = f"{minutes} menit {seconds} detik"

	        # Take a single clock snapshot so date and time cannot disagree when
	        # the call happens to straddle midnight.
	        now = datetime.now()
	        return cls(
	            title="Notulensi Rapat",
	            date=now.strftime("%d %B %Y"),
	            time=now.strftime("%H:%M"),
	            duration=duration_str,
	        )


	@dataclass
	class DocumentConfig:
	    """Tunable options for the generated document: fonts, layout, content, sections."""

	    # Typography (sizes are passed to Pt() by the generator)
	    title_font_size: int = 18
	    heading1_font_size: int = 14
	    heading2_font_size: int = 12
	    body_font_size: int = 11
	    font_family: str = "Calibri"

	    # Page geometry in centimetres (defaults are A4)
	    page_width: float = 21.0
	    page_height: float = 29.7
	    margin_top: float = 2.5
	    margin_bottom: float = 2.5
	    margin_left: float = 3.0
	    margin_right: float = 2.5

	    # Content switches
	    include_timestamps: bool = True
	    include_speaker_colors: bool = True
	    include_table_of_contents: bool = False
	    include_page_numbers: bool = True

	    # Per-section on/off flags; every section is enabled by default and each
	    # instance receives its own dict.
	    sections: Dict[str, bool] = field(
	        default_factory=lambda: dict.fromkeys(
	            (
	                "header",
	                "meeting_info",
	                "summary",
	                "decisions",
	                "action_items",
	                "transcript",
	                "footer",
	            ),
	            True,
	        )
	    )


	class DocumentGenerator:
	    """
	    Generates formatted .docx meeting minutes.

	    Structure:
	        - Title
	        - Meeting Information
	        - Executive Summary
	        - Key Points
	        - Decisions
	        - Action Items
	        - Full Transcript
	        - Footer

	    Attributes:
	        config: DocumentConfig object
	        output_dir: Output directory path

	    Example:
	        >>> generator = DocumentGenerator()
	        >>> doc_path = generator.generate(metadata, summary, transcript)
	        >>> print(f"Document saved: {doc_path}")
	    """

	    # Speaker colors for visual distinction (cycled when there are more speakers)
	    SPEAKER_COLORS = [
	        RGBColor(0, 102, 204),  # Blue
	        RGBColor(204, 51, 0),  # Red
	        RGBColor(0, 153, 51),  # Green
	        RGBColor(153, 51, 153),  # Purple
	        RGBColor(204, 102, 0),  # Orange
	        RGBColor(0, 153, 153),  # Teal
	        RGBColor(102, 102, 0),  # Olive
	        RGBColor(153, 0, 76),  # Maroon
	    ]

	    # A complete bracketed placeholder fragment, e.g. "[Transkripsi placeholder ...]".
	    _PLACEHOLDER_FRAGMENT_RE = re.compile(r"\[\s*Transkripsi placeholder[^\]]*\]", re.I)

	    # Control characters that are not allowed in XML 1.0 documents.
	    _XML_ILLEGAL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

	    # Notice shown in place of missing or placeholder transcript text.
	    _NO_TRANSCRIPT_NOTICE = "[transkripsi tidak tersedia]"

	    def __init__(self, config: Optional[DocumentConfig] = None, output_dir: str = "./data/output"):
	        """
	        Initialize DocumentGenerator.

	        The output directory is created (including parents) if it does not exist.

	        Args:
	            config: DocumentConfig object (defaults to ``DocumentConfig()``)
	            output_dir: Directory for output files
	        """
	        self.config = config or DocumentConfig()
	        self.output_dir = Path(output_dir)
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	        self._speaker_color_map: Dict[str, RGBColor] = {}

	    def generate(
	        self,
	        metadata: MeetingMetadata,
	        summary: MeetingSummary,
	        transcript: List[TranscriptSegment],
	        output_filename: Optional[str] = None,
	    ) -> str:
	        """
	        Generate complete meeting minutes document.

	        Args:
	            metadata: Meeting information
	            summary: Generated summary
	            transcript: Transcribed segments with speakers
	            output_filename: Output file name (auto-generated if None). A name
	                that does not end in ".docx" has its suffix replaced with ".docx".

	        Returns:
	            Path to generated document
	        """
	        doc = Document()

	        self._setup_document(doc)
	        self._setup_styles(doc)
	        self._build_speaker_color_map(transcript)

	        # Sections in document order; each one can be disabled in config.sections.
	        builders = [
	            ("header", lambda: self._add_title(doc, metadata)),
	            ("meeting_info", lambda: self._add_meeting_info(doc, metadata)),
	            ("summary", lambda: self._add_summary_section(doc, summary)),
	            ("decisions", lambda: self._add_decisions_section(doc, summary.decisions)),
	            ("action_items", lambda: self._add_action_items_section(doc, summary.action_items)),
	            ("transcript", lambda: self._add_transcript_section(doc, transcript)),
	            ("footer", lambda: self._add_footer(doc)),
	        ]
	        for key, build in builders:
	            if self.config.sections.get(key, True):
	                build()

	        # Generate filename if not provided. The extension is always .docx:
	        # even without python-docx a minimal .docx package is written below.
	        if output_filename is None:
	            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	            safe_title = self._sanitize_filename(metadata.title)[:30]
	            output_filename = f"notulensi_{safe_title}_{timestamp}.docx"

	        # Ensure .docx extension
	        if not output_filename.endswith(".docx"):
	            output_filename = Path(output_filename).with_suffix(".docx").name

	        output_path = self.output_dir / output_filename

	        if DOCX_AVAILABLE:
	            doc.save(str(output_path))
	        else:
	            # If python-docx is not available, build a minimal valid .docx package so Word can open it.
	            warnings.warn(
	                "python-docx is not available in the current environment; generating a minimal .docx package instead.",
	                stacklevel=2,
	            )
	            paragraphs = self._extract_paragraph_texts(doc)
	            self._save_minimal_docx(str(output_path), paragraphs)

	        return str(output_path)

	    def _setup_document(self, doc: Document):
	        """Apply the configured page size and margins to every document section."""
	        for section in doc.sections:
	            # Page size comes from the config (A4 by default) instead of
	            # whatever the document template happens to use.
	            section.page_width = Cm(self.config.page_width)
	            section.page_height = Cm(self.config.page_height)
	            section.top_margin = Cm(self.config.margin_top)
	            section.bottom_margin = Cm(self.config.margin_bottom)
	            section.left_margin = Cm(self.config.margin_left)
	            section.right_margin = Cm(self.config.margin_right)

	    def _setup_styles(self, doc: Document):
	        """Set the body font family and size on the "Normal" style."""
	        style = doc.styles["Normal"]
	        style.font.name = self.config.font_family
	        style.font.size = Pt(self.config.body_font_size)

	        # Set font for East Asian text
	        style._element.rPr.rFonts.set(qn("w:eastAsia"), self.config.font_family)

	    def _build_speaker_color_map(self, transcript: List[TranscriptSegment]):
	        """Build a consistent speaker -> color mapping for this transcript.

	        Speakers are sorted so the assignment is deterministic, and the map is
	        rebuilt from scratch so entries from an earlier call do not linger.
	        """
	        speakers = sorted({seg.speaker_id for seg in (transcript or [])})
	        palette = self.SPEAKER_COLORS
	        self._speaker_color_map = {
	            speaker: palette[i % len(palette)] for i, speaker in enumerate(speakers)
	        }

	    def _add_heading(self, doc: Document, text: str, level: int):
	        """Add a heading and size its first run from the config for that level."""
	        heading = doc.add_heading(text, level=level)
	        size = self.config.heading1_font_size if level == 1 else self.config.heading2_font_size
	        heading.runs[0].font.size = Pt(size)
	        return heading

	    @staticmethod
	    def _add_notice(doc: Document, text: str, gray: bool = True):
	        """Add an italic (optionally gray) paragraph used for "nothing available" notes."""
	        para = doc.add_paragraph()
	        run = para.add_run(text)
	        run.italic = True
	        if gray:
	            run.font.color.rgb = RGBColor(128, 128, 128)
	        return para

	    def _add_title(self, doc: Document, metadata: MeetingMetadata):
	        """Add the centered title, optional subtitle and the "generated by" note."""
	        # Main title
	        title_para = doc.add_paragraph()
	        title_run = title_para.add_run("NOTULENSI RAPAT")
	        title_run.bold = True
	        title_run.font.size = Pt(self.config.title_font_size)
	        title_run.font.color.rgb = RGBColor(0, 51, 102)
	        title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        # Subtitle with meeting title (skipped for the default title, which
	        # would only repeat the main heading)
	        if metadata.title and metadata.title != "Notulensi Rapat":
	            subtitle_para = doc.add_paragraph()
	            subtitle_run = subtitle_para.add_run(metadata.title)
	            subtitle_run.bold = True
	            subtitle_run.font.size = Pt(self.config.heading1_font_size)
	            subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        # Generated by note
	        note_para = doc.add_paragraph()
	        note_run = note_para.add_run("Generated by AI Meeting Transcriber (SpeechBrain + BERT)")
	        note_run.italic = True
	        note_run.font.size = Pt(9)
	        note_run.font.color.rgb = RGBColor(128, 128, 128)
	        note_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        # Spacer
	        doc.add_paragraph()

	    def _add_meeting_info(self, doc: Document, metadata: MeetingMetadata):
	        """Add the meeting information table plus optional participants and agenda."""
	        self._add_heading(doc, "Informasi Rapat", level=1)

	        candidates = [
	            ("Tanggal", metadata.date),
	            ("Waktu", metadata.time),
	            ("Lokasi/Platform", metadata.location),
	            ("Durasi", metadata.duration),
	            ("Penyelenggara", metadata.organizer),
	        ]
	        # Only rows that carry real information are shown.
	        info_items = [(label, value) for label, value in candidates if value and value != "-"]

	        if info_items:
	            table = doc.add_table(rows=len(info_items), cols=2)
	            table.style = "Table Grid"
	            table.alignment = WD_TABLE_ALIGNMENT.LEFT

	            for row, (label, value) in zip(table.rows, info_items):
	                # Label cell
	                cell_label = row.cells[0]
	                cell_label.text = label
	                cell_label.paragraphs[0].runs[0].bold = True
	                cell_label.width = Cm(4)

	                # Value cell
	                row.cells[1].text = str(value)

	        # Add participants if available
	        if metadata.participants:
	            doc.add_paragraph()
	            para = doc.add_paragraph()
	            para.add_run("Peserta Rapat: ").bold = True
	            para.add_run(", ".join(metadata.participants))

	        # Add agenda if available
	        if metadata.agenda:
	            doc.add_paragraph()
	            para = doc.add_paragraph()
	            para.add_run("Agenda: ").bold = True
	            para.add_run(metadata.agenda)

	        # Spacer
	        doc.add_paragraph()

	    def _add_summary_section(self, doc: Document, summary: MeetingSummary):
	        """Add the executive summary: overview, key points and topics.

	        Placeholder texts (see ``_is_placeholder_text``) are filtered out and
	        replaced by an italic gray notice where nothing usable remains.
	        """
	        self._add_heading(doc, "Ringkasan Eksekutif", level=1)

	        # Overview
	        if summary.overview and not self._is_placeholder_text(summary.overview):
	            overview_para = doc.add_paragraph()
	            overview_para.add_run(summary.overview)
	            overview_para.paragraph_format.space_after = Pt(12)
	        else:
	            self._add_notice(
	                doc,
	                "Ringkasan tidak tersedia. (Model ringkasan tidak dimuat atau data tidak mencukupi.)",
	            )

	        # Key points (filter placeholders)
	        filtered_points = [
	            p for p in (summary.key_points or []) if not self._is_placeholder_text(p)
	        ]
	        if filtered_points:
	            self._add_heading(doc, "Poin-Poin Penting", level=2)
	            for point in filtered_points:
	                doc.add_paragraph(point, style="List Bullet")
	        else:
	            self._add_notice(doc, "Tidak ada poin penting yang dihasilkan secara otomatis.")

	        # Topics discussed (filter placeholders)
	        topics_filtered = [t for t in (summary.topics or []) if not self._is_placeholder_text(t)]
	        if topics_filtered:
	            doc.add_paragraph()
	            para = doc.add_paragraph()
	            para.add_run("Topik yang dibahas: ").bold = True
	            para.add_run(", ".join(topics_filtered))

	        # Spacer
	        doc.add_paragraph()

	    def _add_decisions_section(self, doc: Document, decisions: List[str]):
	        """Add the decisions as a numbered list, or a notice when there are none."""
	        self._add_heading(doc, "Keputusan Rapat", level=1)

	        if decisions:
	            for i, decision in enumerate(decisions, 1):
	                para = doc.add_paragraph()
	                para.add_run(f"{i}. ").bold = True
	                para.add_run(decision)
	        else:
	            self._add_notice(doc, "Tidak ada keputusan yang teridentifikasi secara otomatis.")

	        # Spacer
	        doc.add_paragraph()

	    def _add_action_items_section(self, doc: Document, action_items: List[Dict[str, str]]):
	        """Add the action items table, or a notice when there are none.

	        Each item is a dict read through the keys "owner", "task" and "due";
	        missing, empty or None values are shown as "-".
	        """
	        self._add_heading(doc, "Action Items / Tindak Lanjut", level=1)

	        if action_items:
	            table = doc.add_table(rows=len(action_items) + 1, cols=4)
	            table.style = "Table Grid"
	            table.alignment = WD_TABLE_ALIGNMENT.LEFT

	            # Header row
	            headers = ["No.", "Penanggung Jawab", "Tugas", "Deadline"]
	            for cell, header_text in zip(table.rows[0].cells, headers):
	                cell.text = header_text

	                # Style header
	                for paragraph in cell.paragraphs:
	                    for run in paragraph.runs:
	                        run.bold = True

	                # Set header background color
	                shading = OxmlElement("w:shd")
	                shading.set(qn("w:fill"), "D9E2F3")
	                cell._tc.get_or_add_tcPr().append(shading)

	            # Data rows. Cell text must be a string, so coerce every value and
	            # fall back to "-" for missing/None/empty entries.
	            for i, item in enumerate(action_items, 1):
	                cells = table.rows[i].cells
	                cells[0].text = str(i)
	                cells[1].text = str(item.get("owner") or "-")
	                cells[2].text = str(item.get("task") or "-")
	                cells[3].text = str(item.get("due") or "-")

	            # Set column widths
	            for row in table.rows:
	                row.cells[0].width = Cm(1.0)
	                row.cells[1].width = Cm(3.5)
	                row.cells[2].width = Cm(9.0)
	                row.cells[3].width = Cm(2.5)
	        else:
	            self._add_notice(doc, "Tidak ada action item yang teridentifikasi secara otomatis.")

	        # Spacer
	        doc.add_paragraph()

	    def _add_transcript_section(self, doc: Document, transcript: List[TranscriptSegment]):
	        """Add the full transcript, one paragraph per segment.

	        Each paragraph starts with a bold speaker label (with the time range
	        when ``include_timestamps`` is set), followed by the cleaned text and
	        an "[OVERLAP]" marker for overlapping speech.
	        """
	        self._add_heading(doc, "Transkrip Percakapan", level=1)

	        if not transcript:
	            self._add_notice(doc, "Tidak ada transkrip yang tersedia.", gray=False)
	            return

	        for seg in transcript:
	            para = doc.add_paragraph()

	            if self.config.include_timestamps:
	                timestamp = self._format_timestamp(seg.start, seg.end)
	                label = f"{seg.speaker_id} [{timestamp}]: "
	            else:
	                label = f"{seg.speaker_id}: "

	            speaker_run = para.add_run(label)
	            speaker_run.bold = True

	            # Speaker colors are independent of the timestamp option.
	            if self.config.include_speaker_colors:
	                color = self._speaker_color_map.get(seg.speaker_id, RGBColor(0, 0, 0))
	                speaker_run.font.color.rgb = color

	            # Transcript text (sanitize placeholder/fallback strings)
	            para.add_run(self._clean_text_for_doc(seg.text or ""))

	            # Mark overlapping speech
	            if seg.is_overlap:
	                overlap_run = para.add_run(" [OVERLAP]")
	                overlap_run.italic = True
	                overlap_run.font.color.rgb = RGBColor(255, 102, 0)
	                overlap_run.font.size = Pt(9)

	    def _add_footer(self, doc: Document):
	        """Add the closing separator, generation timestamp and accuracy disclaimer."""
	        # Separator line
	        doc.add_paragraph()
	        separator = doc.add_paragraph("─" * 70)
	        separator.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        # Footer text
	        footer_para = doc.add_paragraph()

	        timestamp = datetime.now().strftime("%d %B %Y, %H:%M:%S")
	        footer_text = f"Dokumen ini dihasilkan secara otomatis pada {timestamp}"

	        footer_run = footer_para.add_run(footer_text)
	        footer_run.italic = True
	        footer_run.font.size = Pt(9)
	        footer_run.font.color.rgb = RGBColor(128, 128, 128)
	        footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

	        # Disclaimer
	        disclaimer_para = doc.add_paragraph()
	        disclaimer_text = (
	            "Hasil transkripsi dan ringkasan mungkin mengandung ketidakakuratan. "
	            "Harap verifikasi informasi penting."
	        )

	        disclaimer_run = disclaimer_para.add_run(disclaimer_text)
	        disclaimer_run.italic = True
	        disclaimer_run.font.size = Pt(8)
	        disclaimer_run.font.color.rgb = RGBColor(150, 150, 150)
	        disclaimer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

	    def _is_placeholder_text(self, text: Optional[str]) -> bool:
	        """Detect summarizer/ASR fallback placeholder text.

	        Returns True for empty/None input, for text containing an opening
	        "[Transkripsi placeholder" marker, and for text shorter than 120
	        characters that mentions "placeholder" (case-insensitive).
	        """
	        if not text:
	            return True
	        t = str(text).strip()
	        # common placeholder patterns from summarizer / transcriber fallbacks
	        if re.search(r"\[\s*Transkripsi placeholder", t, re.I):
	            return True
	        if re.search(r"placeholder", t, re.I) and len(t) < 120:
	            return True
	        return False

	    def _clean_text_for_doc(self, text: Optional[str]) -> str:
	        """Clean text for the document: replace raw placeholders with a user-friendly notice.

	        Complete bracketed placeholder fragments are removed first, so real
	        speech surrounding an embedded fragment is kept. If what remains is
	        empty or still looks like a placeholder, the notice is returned.
	        """
	        if not text:
	            return self._NO_TRANSCRIPT_NOTICE
	        cleaned = self._PLACEHOLDER_FRAGMENT_RE.sub("", str(text)).strip()
	        if self._is_placeholder_text(cleaned):
	            return self._NO_TRANSCRIPT_NOTICE
	        return cleaned

	    @staticmethod
	    def _format_timestamp(start: float, end: float) -> str:
	        """Format a time range as "MM:SS–MM:SS" (or "HH:MM:SS" from one hour up).

	        Negative values are clamped to zero and fractions of a second are dropped.
	        """

	        def sec_to_str(sec: float) -> str:
	            total = int(max(0.0, float(sec)))
	            h, rem = divmod(total, 3600)
	            m, s = divmod(rem, 60)
	            if h > 0:
	                return f"{h:02d}:{m:02d}:{s:02d}"
	            return f"{m:02d}:{s:02d}"

	        return f"{sec_to_str(start)}–{sec_to_str(end)}"

	    def _save_minimal_docx(self, path: str, paragraphs: List[str]):
	        """Create a minimal valid .docx (zip package) containing plain paragraphs.

	        This is a lightweight fallback when python-docx is not installed, to ensure
	        the generated file can be opened in Word. Formatting is not preserved.

	        Args:
	            path: Destination file path.
	            paragraphs: Paragraph texts; blank entries become empty paragraphs.
	        """
	        import zipfile
	        from xml.sax.saxutils import escape

	        def _escape_xml(s: str) -> str:
	            # Drop control characters XML 1.0 forbids (they would make the
	            # package unreadable), then escape the five special characters.
	            s = self._XML_ILLEGAL_CHARS_RE.sub("", s)
	            return escape(s, {'"': "&quot;", "'": "&apos;"})

	        content_types = (
	            '<?xml version="1.0" encoding="UTF-8"?>\n'
	            '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">\n'
	            ' <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>\n'
	            ' <Default Extension="xml" ContentType="application/xml"/>\n'
	            ' <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>\n'
	            "</Types>"
	        )

	        rels = (
	            '<?xml version="1.0" encoding="UTF-8"?>\n'
	            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">\n'
	            ' <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>\n'
	            "</Relationships>"
	        )

	        doc_xml_header = (
	            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
	            '<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" '
	            'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" '
	            'xmlns:o="urn:schemas-microsoft-com:office:office" '
	            'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" '
	            'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" '
	            'xmlns:v="urn:schemas-microsoft-com:vml" '
	            'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" '
	            'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" '
	            'xmlns:w10="urn:schemas-microsoft-com:office:word" '
	            'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" '
	            'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" '
	            'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" '
	            'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" '
	            'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" '
	            'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">\n'
	            " <w:body>\n"
	        )

	        doc_xml_footer = (
	            " <w:sectPr>\n"
	            ' <w:pgSz w:w="11900" w:h="16840"/>\n'
	            ' <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0"/>\n'
	            " </w:sectPr>\n"
	            " </w:body>\n"
	            "</w:document>"
	        )

	        # Build paragraphs as simple <w:p><w:r><w:t>text</w:t></w:r></w:p>
	        paras_xml = []
	        for p in paragraphs:
	            t = _escape_xml(p.strip())
	            if not t:
	                # preserve blank line
	                paras_xml.append(" <w:p/>\n")
	            else:
	                paras_xml.append(f' <w:p><w:r><w:t xml:space="preserve">{t}</w:t></w:r></w:p>\n')

	        doc_xml = doc_xml_header + "".join(paras_xml) + doc_xml_footer

	        with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as z:
	            z.writestr("[Content_Types].xml", content_types)
	            z.writestr("_rels/.rels", rels)
	            z.writestr("word/document.xml", doc_xml)

	    def _extract_paragraph_texts(self, doc: Document) -> List[str]:
	        """Get paragraph texts from a python-docx Document or the fallback Document.

	        Table contents are not included by either branch.
	        """
	        # python-docx Document exposes ``paragraphs`` with a ``text`` attribute
	        paragraphs = getattr(doc, "paragraphs", None)
	        if paragraphs is not None:
	            return [p.text for p in paragraphs]

	        # fallback minimal Document implementation
	        paras: List[str] = []
	        for p in getattr(doc, "_paragraphs", []):
	            if hasattr(p, "runs"):
	                # Runs are contiguous pieces of one paragraph: join them
	                # without a separator, as python-docx does for ``p.text``.
	                paras.append("".join(getattr(r, "text", "") for r in p.runs))
	            else:
	                paras.append(str(p))
	        return paras

	    @staticmethod
	    def _sanitize_filename(filename: str) -> str:
	        """Remove characters that are invalid in file names and replace spaces with underscores."""
	        # Remove invalid characters
	        sanitized = re.sub(r'[<>:"/\\\|?*]', "", filename)
	        # Replace spaces with underscores
	        return sanitized.replace(" ", "_")