File size: 3,047 Bytes
bae14fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from agents.model import llm
from typing import List


MAX_CONTEXT_CHARS = 100000


def summarize_pdf(pdf_path: str) -> str:
    """
    Summarize a PDF while keeping token/API usage low.

    Picks a strategy based on total document length:
    1. Documents at or under MAX_CONTEXT_CHARS are summarized with a single
       "stuff" call.
    2. Anything larger falls back to iterative refinement over a few large
       chunks, which needs far fewer API calls than map-reduce.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The final summary text.
    """
    pages = PyPDFLoader(pdf_path).load()
    combined = "\n\n".join(page.page_content for page in pages)

    # Dispatch purely on character count — a cheap proxy for token count.
    if len(combined) > MAX_CONTEXT_CHARS:
        return _refine_summarize(combined)
    return _stuff_summarize(combined)


def _stuff_summarize(text: str) -> str:
    """Produce a summary of the whole document with a single API call."""
    # The full document is "stuffed" into one prompt; caller guarantees it
    # fits within MAX_CONTEXT_CHARS.
    template = (
        "You are an expert summarizer. Read the following document and provide "
        "a comprehensive summary covering all key topics, concepts, and important details.\n\n"
        "Format your summary with:\n"
        "- A brief overview (2-3 sentences)\n"
        "- Main topics/sections with key points\n"
        "- Important definitions or concepts\n\n"
        "Document:\n{text}"
    )
    chain = ChatPromptTemplate.from_template(template) | llm
    return chain.invoke({"text": text}).content


def _refine_summarize(text: str, chunk_size: int = 50000) -> str:
    """
    Summarize a large document via incremental refinement.

    The text is split into a small number of large chunks; the first chunk
    seeds an initial summary, and every subsequent chunk is folded into that
    summary one call at a time. Large chunks keep the total number of API
    calls well below a map-reduce approach.

    Args:
        text: Full document text (assumed non-empty by the caller).
        chunk_size: Target characters per chunk.

    Returns:
        The refined summary string.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=500,  # small overlap so context isn't lost at seams
    ).split_text(text)

    # Seed the running summary from the first chunk.
    seed_chain = ChatPromptTemplate.from_template(
        "You are an expert summarizer. Summarize the following content, "
        "capturing all key topics, concepts, and important details:\n\n{text}"
    ) | llm
    running_summary = seed_chain.invoke({"text": pieces[0]}).content

    # Fold each remaining chunk into the summary (no-op for a single chunk).
    update_chain = ChatPromptTemplate.from_template(
        "You have an existing summary of a document:\n\n"
        "EXISTING SUMMARY:\n{summary}\n\n"
        "Now incorporate the following additional content into the summary. "
        "Expand and refine the summary to include new information while keeping it coherent:\n\n"
        "NEW CONTENT:\n{new_content}\n\n"
        "Provide the updated comprehensive summary:"
    ) | llm

    for piece in pieces[1:]:
        running_summary = update_chain.invoke(
            {"summary": running_summary, "new_content": piece}
        ).content

    return running_summary