File size: 2,655 Bytes
cd6f412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import logging
from typing import Dict, Optional
from langchain_core.prompts import PromptTemplate

from scripts.data_processing.document_loader import load_document

from insucompass.config import settings
from insucompass.prompts.prompt_loader import load_prompt

# Set up root logging at import time; the level comes from project settings.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=settings.LOG_LEVEL,
)
# Module-scoped logger used by everything in this file.
logger = logging.getLogger(__name__)

# Prompt loaded once at import under the name 'document_summarizer'; it is
# later handed to PromptTemplate.from_template, so presumably template text.
SUMMARY_PROMPT_TEMPLATE = load_prompt('document_summarizer')

class DocumentSummarizerAgent:
    """
    A service to generate and cache summaries for full source documents.

    Summaries are produced by piping a prompt template into the supplied LLM
    and are kept in an in-memory dict keyed by the stringified source id, so
    the cache lives only as long as this instance.
    """

    # Default cap on how many characters of a document are sent to the model.
    # Characters are used as a rough proxy for tokens to stay inside the
    # model's context window.
    max_chars: int = 50000

    def __init__(self, llm, max_chars: Optional[int] = None):
        """
        Build the summarization chain and an empty summary cache.

        Args:
            llm: The model object the prompt is piped into (``prompt | llm``).
            max_chars: Optional override for the per-document character cap.
                When omitted, the class default is used.

        Raises:
            ValueError: If ``max_chars`` is given but is not positive.
        """
        if max_chars is not None:
            if max_chars <= 0:
                raise ValueError("max_chars must be a positive integer")
            self.max_chars = max_chars
        self.llm = llm
        self.prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
        self.chain = self.prompt | self.llm
        self._summary_cache: Dict[str, str] = {}
        logger.info("DocumentSummarizer service initialized.")

    def get_summary(self, source_id: int, local_path: str) -> Optional[str]:
        """
        Retrieves or generates a summary for a given source document.
        Uses an in-memory cache to avoid re-summarizing the same document.

        Args:
            source_id: The unique ID of the source document.
            local_path: The local file path to the full document content.

        Returns:
            The summary string, or None if the document cannot be loaded,
            the chain raises, or the model returns a blank summary. Failed
            attempts are never cached, so a later call will retry.
        """
        cache_key = str(source_id)
        cached = self._summary_cache.get(cache_key)
        if cached is not None:
            logger.debug(f"Returning cached summary for source_id: {source_id}")
            return cached

        logger.info(f"Generating new summary for source_id: {source_id} from path: {local_path}")

        # Loading is guarded as well: the documented contract is "None on
        # failure", so a loader exception must not escape to the caller.
        try:
            full_content = load_document(local_path)
        except Exception as e:
            logger.error(
                f"Could not load document content from {local_path} to generate summary: {e}",
                exc_info=True,
            )
            return None

        if not full_content:
            logger.error(f"Could not load document content from {local_path} to generate summary.")
            return None

        try:
            # Truncate content to fit the model context window if necessary.
            truncated_content = full_content[:self.max_chars]

            response = self.chain.invoke({"document_content": truncated_content})
            # Chat models return a message object carrying `.content`;
            # anything else is stringified.
            summary = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            # exc_info keeps the traceback, which the message alone would lose.
            logger.error(f"Failed to generate summary for source_id {source_id}: {e}", exc_info=True)
            return None

        # A blank reply is treated as a failure and not cached; otherwise one
        # empty response would be served from the cache forever.
        if isinstance(summary, str) and not summary.strip():
            logger.error(f"Failed to generate summary for source_id {source_id}: empty summary returned")
            return None

        self._summary_cache[cache_key] = summary
        logger.info(f"Successfully generated and cached summary for source_id: {source_id}")
        return summary