Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.documents import Document | |
| from agents.model import llm | |
| from typing import List | |
# Character budget for summarizing in a single "stuff" call; documents larger
# than this fall back to iterative refinement (see summarize_pdf).
MAX_CONTEXT_CHARS = 100000
def summarize_pdf(pdf_path: str) -> str:
    """
    Token-efficient PDF summarizer.

    Strategy:
        1. If document is small enough, summarize in ONE call (stuff method)
        2. If larger, use iterative refinement with large chunks (fewer API calls)

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Final summary string

    Raises:
        ValueError: If the PDF yields no extractable text (e.g. image-only scan).
    """
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    full_text = "\n\n".join(doc.page_content for doc in docs)

    # Guard: an empty or image-only PDF would otherwise fall through to
    # _stuff_summarize and waste an API call summarizing an empty string.
    if not full_text.strip():
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    # Whole document fits the single-call budget: one request is both cheaper
    # and more coherent than refinement.
    if len(full_text) <= MAX_CONTEXT_CHARS:
        return _stuff_summarize(full_text)
    return _refine_summarize(full_text)
def _stuff_summarize(text: str) -> str:
    """Summarize the whole document text with a single LLM request."""
    template = (
        "You are an expert summarizer. Read the following document and provide "
        "a comprehensive summary covering all key topics, concepts, and important details.\n\n"
        "Format your summary with:\n"
        "- A brief overview (2-3 sentences)\n"
        "- Main topics/sections with key points\n"
        "- Important definitions or concepts\n\n"
        "Document:\n{text}"
    )
    # Build prompt -> model pipeline and run it in one pass.
    chain = ChatPromptTemplate.from_template(template) | llm
    return chain.invoke({"text": text}).content
def _refine_summarize(text: str, chunk_size: int = 50000) -> str:
    """
    Iterative refinement for large documents.

    Uses fewer, larger chunks and refines the summary incrementally,
    which costs far fewer API calls than a map-reduce pass.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=500,
    ).split_text(text)

    # Seed the running summary from the first chunk.
    seed_template = (
        "You are an expert summarizer. Summarize the following content, "
        "capturing all key topics, concepts, and important details:\n\n{text}"
    )
    seed_chain = ChatPromptTemplate.from_template(seed_template) | llm
    running_summary = seed_chain.invoke({"text": pieces[0]}).content

    # Single chunk: nothing left to fold in.
    if len(pieces) == 1:
        return running_summary

    refine_template = (
        "You have an existing summary of a document:\n\n"
        "EXISTING SUMMARY:\n{summary}\n\n"
        "Now incorporate the following additional content into the summary. "
        "Expand and refine the summary to include new information while keeping it coherent:\n\n"
        "NEW CONTENT:\n{new_content}\n\n"
        "Provide the updated comprehensive summary:"
    )
    refine_chain = ChatPromptTemplate.from_template(refine_template) | llm

    # Fold each remaining chunk into the running summary — one call per chunk.
    for piece in pieces[1:]:
        running_summary = refine_chain.invoke(
            {"summary": running_summary, "new_content": piece}
        ).content

    return running_summary