| """ |
| DocMind — Quick Summary |
| |
| Generates a concise one-paragraph summary (~100 words) from the |
| first 3 and last 2 chunks of a document. |
| """ |
|
|
| import logging |
| from typing import List |
|
|
| from pipeline.chunker import ChunkMetadata |
| from pipeline.llm import generate_summary |
|
|
| logger = logging.getLogger(__name__) |
|
|
# Instruction text passed to generate_summary together with the selected
# chunk text. Kept as separate sentence groups so each directive can be
# read (and diffed) on its own line.
QUICK_SUMMARY_PROMPT = " ".join(
    (
        "Summarize this document in one paragraph of approximately 100 words.",
        "Focus on the main topic, key claims, and conclusion.",
        "Be factual and concise. Do not add any information not present in the text.",
    )
)
|
|
|
|
def generate_quick_summary(
    chunks: List[ChunkMetadata],
    *,
    head_chunks: int = 3,
    tail_chunks: int = 2,
) -> str:
    """
    Generate a quick one-paragraph summary.

    Strategy: Use the leading chunks (introduction) and the trailing chunks
    (conclusion) to capture the document's scope. With the defaults this is
    the first 3 and last 2 chunks. When the document has no more chunks than
    ``head_chunks + tail_chunks``, every chunk is used exactly once.

    Args:
        chunks: All chunks from the document, in order. Each chunk must
            expose ``page_num`` and ``text`` attributes.
        head_chunks: Number of chunks taken from the start of the document.
        tail_chunks: Number of chunks taken from the end of the document.

    Returns:
        The value returned by ``generate_summary`` for the selected text, or
        a fixed placeholder message when there is nothing to summarize
        (no chunks, or both counts are zero).

    Raises:
        ValueError: If ``head_chunks`` or ``tail_chunks`` is negative.
    """
    no_content_message = "No document content available for summarization."

    if head_chunks < 0 or tail_chunks < 0:
        raise ValueError("head_chunks and tail_chunks must be non-negative")

    if not chunks:
        return no_content_message

    total = len(chunks)
    if total <= head_chunks + tail_chunks:
        # The head and tail windows would touch or overlap, so take the
        # whole document rather than duplicating chunks.
        selected = chunks
    else:
        # Slice the tail from an explicit start index: chunks[-0:] would
        # return the entire list when tail_chunks is 0.
        selected = chunks[:head_chunks] + chunks[total - tail_chunks:]

    if not selected:
        # Both counts are zero: nothing was selected, so skip the
        # generate_summary call instead of sending it empty text.
        return no_content_message

    # Prefix every chunk with its page number so the page origin of each
    # passage is visible in the combined text.
    combined_text = "\n\n".join(
        f"[Page {c.page_num}] {c.text}" for c in selected
    )

    logger.info("Generating quick summary from %d chunks", len(selected))
    return generate_summary(combined_text, QUICK_SUMMARY_PROMPT)
|
|