""" DocMind — Structured Summary Generates a section-by-section summary using extracted headings. """ import logging from typing import Dict, List from pipeline.chunker import ChunkMetadata, extract_section_titles from pipeline.llm import generate_summary logger = logging.getLogger(__name__) SECTION_SUMMARY_PROMPT = ( "Summarize the following section content in exactly 2 sentences. " "Be factual, specific, and concise. Do not add information not in the text." ) def generate_structured_summary(chunks: List[ChunkMetadata]) -> str: """ Generate a structured summary organized by document sections. For each detected section heading, collects the top-3 most relevant chunks and generates a 2-sentence summary. Args: chunks: All chunks from the document. Returns: Markdown-formatted structured summary. """ if not chunks: return "No document content available for summarization." # Group chunks by section section_chunks: Dict[str, List[ChunkMetadata]] = {} for chunk in chunks: title = chunk.section_title.strip() or "General Content" if title not in section_chunks: section_chunks[title] = [] section_chunks[title].append(chunk) # Maintain order of first appearance ordered_sections = [] seen = set() for chunk in chunks: title = chunk.section_title.strip() or "General Content" if title not in seen: seen.add(title) ordered_sections.append(title) # Generate summary for each section summary_parts: List[str] = [] summary_parts.append("# Document Summary\n") for section_title in ordered_sections: section_chunk_list = section_chunks[section_title] # Take top-3 chunks from this section selected = section_chunk_list[:3] combined_text = "\n\n".join(c.text for c in selected) logger.info("Summarizing section '%s' (%d chunks)", section_title, len(selected)) try: section_summary = generate_summary(combined_text, SECTION_SUMMARY_PROMPT) except Exception as e: logger.error("Failed to summarize section '%s': %s", section_title, e) section_summary = "*Summary unavailable for this section.*" summary_parts.append(f"## {section_title}\n") summary_parts.append(f"{section_summary}\n") return "\n".join(summary_parts)