"""
DocMind — Structured Summary

Generates a section-by-section Markdown summary of a document, grouping
chunks by their extracted section headings.
"""
|
|
| import logging |
| from typing import Dict, List |
|
|
| from pipeline.chunker import ChunkMetadata, extract_section_titles |
| from pipeline.llm import generate_summary |
|
|
logger = logging.getLogger(__name__)


# Instruction passed to the summarizer together with each section's combined
# chunk text; asks for a strictly two-sentence, source-faithful summary.
SECTION_SUMMARY_PROMPT = (
    "Summarize the following section content in exactly 2 sentences. "
    "Be factual, specific, and concise. Do not add information not in the text."
)
|
|
|
|
def generate_structured_summary(chunks: List["ChunkMetadata"]) -> str:
    """
    Generate a structured summary organized by document sections.

    Chunks are grouped by their stripped ``section_title``; chunks whose
    title is missing or blank are grouped under "General Content". Sections
    are emitted in the order in which their first chunk appears in
    ``chunks``. For each section, the first three chunks (in input order,
    no relevance ranking is applied) are joined with blank lines and passed
    to ``generate_summary`` together with ``SECTION_SUMMARY_PROMPT``.

    Args:
        chunks: All chunks from the document.

    Returns:
        Markdown-formatted structured summary: a "# Document Summary"
        heading followed by a "## <section title>" heading and summary text
        for every section. If summarizing a section raises, a placeholder
        line is emitted for that section instead of aborting the whole
        summary. When ``chunks`` is empty, a fixed "no content" notice is
        returned instead.
    """
    if not chunks:
        return "No document content available for summarization."

    # Group chunks by section in a single pass. Dicts preserve insertion
    # order, so iterating the mapping afterwards yields the sections in the
    # order they were first encountered.
    section_chunks: Dict[str, List["ChunkMetadata"]] = {}
    for chunk in chunks:
        # Guard against a missing (None) title as well as a blank one.
        title = (chunk.section_title or "").strip() or "General Content"
        section_chunks.setdefault(title, []).append(chunk)

    summary_parts: List[str] = ["# Document Summary\n"]

    for section_title, section_chunk_list in section_chunks.items():
        # Only the first three chunks of a section are sent to the summarizer.
        selected = section_chunk_list[:3]
        combined_text = "\n\n".join(c.text for c in selected)

        logger.info("Summarizing section '%s' (%d chunks)", section_title, len(selected))

        try:
            section_summary = generate_summary(combined_text, SECTION_SUMMARY_PROMPT)
        except Exception as e:
            # Best-effort: one failing section must not abort the rest of
            # the summary, so log the failure and fall back to a placeholder.
            logger.error("Failed to summarize section '%s': %s", section_title, e)
            section_summary = "*Summary unavailable for this section.*"

        summary_parts.append(f"## {section_title}\n")
        summary_parts.append(f"{section_summary}\n")

    return "\n".join(summary_parts)
|
|