import logging
from typing import Dict, Optional

from langchain_core.prompts import PromptTemplate

from scripts.data_processing.document_loader import load_document
from insucompass.config import settings
from insucompass.prompts.prompt_loader import load_prompt

# Configure logging
logging.basicConfig(
    level=settings.LOG_LEVEL,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

SUMMARY_PROMPT_TEMPLATE = load_prompt('document_summarizer')


class DocumentSummarizerAgent:
    """
    A service to generate and cache summaries for full source documents.

    Summaries are cached in memory per instance, keyed by the stringified
    source ID, so each document is sent to the LLM at most once per instance
    (failed attempts are not cached and will be retried on the next call).
    """

    # Default cap on the number of characters sent to the LLM. Characters are
    # used as a rough proxy for tokens; the original rationale was that around
    # 16k tokens is a safe context size for many models.
    DEFAULT_MAX_CHARS = 50000

    def __init__(self, llm, max_chars: int = DEFAULT_MAX_CHARS):
        """
        Build the summarization chain and an empty summary cache.

        Args:
            llm: The language model (or runnable) the prompt is piped into.
            max_chars: Maximum number of leading characters of a document
                that are passed to the LLM. Must be positive.

        Raises:
            ValueError: If max_chars is not a positive number.
        """
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")

        self.llm = llm
        self.max_chars = max_chars
        self.prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
        self.chain = self.prompt | self.llm
        self._summary_cache: Dict[str, str] = {}
        logger.info("DocumentSummarizer service initialized.")

    def get_summary(self, source_id: int, local_path: str) -> Optional[str]:
        """
        Retrieves or generates a summary for a given source document.
        Uses an in-memory cache to avoid re-summarizing the same document.

        Args:
            source_id: The unique ID of the source document.
            local_path: The local file path to the full document content.

        Returns:
            The summary string, or None if the document could not be loaded
            or summarization fails.
        """
        cache_key = str(source_id)
        if cache_key in self._summary_cache:
            logger.debug(f"Returning cached summary for source_id: {source_id}")
            return self._summary_cache[cache_key]

        logger.info(f"Generating new summary for source_id: {source_id} from path: {local_path}")
        full_content = load_document(local_path)
        if not full_content:
            logger.error(f"Could not load document content from {local_path} to generate summary.")
            return None

        try:
            # Truncate content to fit the model context window if necessary.
            if len(full_content) > self.max_chars:
                # Surface the truncation: the summary will only cover the
                # leading part of the document.
                logger.warning(
                    "Document for source_id %s has %d characters; truncating to %d before summarizing.",
                    source_id,
                    len(full_content),
                    self.max_chars,
                )
            truncated_content = full_content[:self.max_chars]

            response = self.chain.invoke({"document_content": truncated_content})
            # Message-like responses expose the text on `.content`; anything
            # else is stringified as-is.
            summary = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            # Best-effort boundary: log with traceback and report failure as
            # None rather than propagating to the caller.
            logger.exception(f"Failed to generate summary for source_id {source_id}: {e}")
            return None
        else:
            self._summary_cache[cache_key] = summary
            logger.info(f"Successfully generated and cached summary for source_id: {source_id}")
            return summary