File size: 2,432 Bytes
6cca5b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
DocMind — Structured Summary

Generates a section-by-section summary using extracted headings.
"""

import logging
from typing import Dict, List

from pipeline.chunker import ChunkMetadata, extract_section_titles
from pipeline.llm import generate_summary

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Instruction text handed to the LLM alongside each section's content.
# Built from one entry per sentence so individual rules are easy to edit.
SECTION_SUMMARY_PROMPT = " ".join(
    [
        "Summarize the following section content in exactly 2 sentences.",
        "Be factual, specific, and concise.",
        "Do not add information not in the text.",
    ]
)


def generate_structured_summary(
    chunks: List[ChunkMetadata],
    max_chunks_per_section: int = 3,
) -> str:
    """
    Generate a structured summary organized by document sections.

    Chunks are grouped by their (whitespace-stripped) ``section_title``;
    chunks with an empty or missing title are grouped under
    "General Content". Sections are emitted in order of first appearance.
    For each section, the first ``max_chunks_per_section`` chunks (in
    document order -- no relevance ranking is performed here) are joined
    and passed to ``generate_summary`` with ``SECTION_SUMMARY_PROMPT``.

    Args:
        chunks: All chunks from the document.
        max_chunks_per_section: Maximum number of leading chunks per
            section to feed into the summarizer. Defaults to 3.

    Returns:
        Markdown-formatted structured summary: a "# Document Summary"
        heading followed by one "## <section title>" block per section.
        If ``chunks`` is empty, a fixed "no content" message is returned.
        If summarizing a section fails, a placeholder line is emitted for
        that section and processing continues.

    Raises:
        ValueError: If ``max_chunks_per_section`` is less than 1.
    """
    if max_chunks_per_section < 1:
        raise ValueError("max_chunks_per_section must be at least 1")

    if not chunks:
        return "No document content available for summarization."

    # Group chunks by section in a single pass. Dicts preserve insertion
    # order, so iterating the mapping later yields sections in order of
    # first appearance without a separate "seen" bookkeeping pass.
    section_chunks: Dict[str, List[ChunkMetadata]] = {}
    for chunk in chunks:
        # `or ""` guards against a None title, which would otherwise raise
        # AttributeError on .strip().
        title = (chunk.section_title or "").strip() or "General Content"
        section_chunks.setdefault(title, []).append(chunk)

    summary_parts: List[str] = ["# Document Summary\n"]

    for section_title, section_chunk_list in section_chunks.items():
        # Only the leading chunks of each section are summarized.
        selected = section_chunk_list[:max_chunks_per_section]
        combined_text = "\n\n".join(c.text for c in selected)

        logger.info("Summarizing section '%s' (%d chunks)", section_title, len(selected))

        # Deliberately broad: one failing section must not abort the whole
        # document summary, so the failure is logged and a placeholder used.
        try:
            section_summary = generate_summary(combined_text, SECTION_SUMMARY_PROMPT)
        except Exception as e:
            logger.error("Failed to summarize section '%s': %s", section_title, e)
            section_summary = "*Summary unavailable for this section.*"

        summary_parts.append(f"## {section_title}\n")
        summary_parts.append(f"{section_summary}\n")

    return "\n".join(summary_parts)