docmind / summarizer /structured_summary.py
AI Engineer
Initial commit for DocMind
6cca5b1
Raw
History Blame Contribute Delete
2.43 kB
"""
DocMind — Structured Summary
Generates a section-by-section summary using extracted headings.
"""
import logging
from typing import Dict, List
from pipeline.chunker import ChunkMetadata, extract_section_titles
from pipeline.llm import generate_summary
# Module-level logger, named after the importing module path.
logger = logging.getLogger(__name__)

# Instruction passed to generate_summary() alongside each section's text.
# Built from its three sentences joined by single spaces.
SECTION_SUMMARY_PROMPT = " ".join(
    (
        "Summarize the following section content in exactly 2 sentences.",
        "Be factual, specific, and concise.",
        "Do not add information not in the text.",
    )
)
def generate_structured_summary(
    chunks: List[ChunkMetadata],
    *,
    max_chunks_per_section: int = 3,
) -> str:
    """
    Generate a structured summary organized by document sections.

    Chunks are grouped by their ``section_title``; a missing, empty, or
    whitespace-only title is filed under "General Content". Sections are
    emitted in order of first appearance. For each section, the first
    ``max_chunks_per_section`` chunks (in the order given, no relevance
    ranking is applied) are joined with blank lines and passed to
    ``generate_summary`` together with ``SECTION_SUMMARY_PROMPT``.

    Args:
        chunks: All chunks from the document.
        max_chunks_per_section: Maximum number of leading chunks per section
            to feed into the summarizer. Defaults to 3.

    Returns:
        Markdown-formatted structured summary: a "# Document Summary" title
        followed by one "## <section>" heading and summary per section. If
        ``chunks`` is empty, a fixed "no content" message is returned
        instead. If summarizing a section raises, that section's body is a
        placeholder line and processing continues with the next section.

    Raises:
        ValueError: If ``max_chunks_per_section`` is less than 1.
    """
    if max_chunks_per_section < 1:
        raise ValueError("max_chunks_per_section must be at least 1")

    if not chunks:
        return "No document content available for summarization."

    # Group chunks by section in a single pass. Dicts preserve insertion
    # order, so iterating the mapping later yields sections in order of
    # first appearance without a separate bookkeeping pass.
    section_chunks: Dict[str, List[ChunkMetadata]] = {}
    for chunk in chunks:
        # `or ""` guards against a None title, which has no .strip().
        title = (chunk.section_title or "").strip() or "General Content"
        section_chunks.setdefault(title, []).append(chunk)

    # Generate summary for each section
    summary_parts: List[str] = ["# Document Summary\n"]
    for section_title, section_chunk_list in section_chunks.items():
        # Only the leading chunks of the section are summarized.
        selected = section_chunk_list[:max_chunks_per_section]
        combined_text = "\n\n".join(c.text for c in selected)
        logger.info("Summarizing section '%s' (%d chunks)", section_title, len(selected))
        try:
            section_summary = generate_summary(combined_text, SECTION_SUMMARY_PROMPT)
        except Exception as e:
            # Best-effort: one failing section must not abort the whole
            # document. logger.exception keeps the traceback for debugging.
            logger.exception("Failed to summarize section '%s': %s", section_title, e)
            section_summary = "*Summary unavailable for this section.*"
        summary_parts.append(f"## {section_title}\n")
        summary_parts.append(f"{section_summary}\n")

    return "\n".join(summary_parts)