Spaces:
Sleeping
Sleeping
| import logging | |
| from typing import Dict, Optional | |
| from langchain_core.prompts import PromptTemplate | |
| from scripts.data_processing.document_loader import load_document | |
| from insucompass.config import settings | |
| from insucompass.prompts.prompt_loader import load_prompt | |
# Set up root logging once at import time; the level comes from project settings.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=settings.LOG_LEVEL,
)
logger = logging.getLogger(__name__)

# Prompt text used to build the summarization PromptTemplate.
SUMMARY_PROMPT_TEMPLATE = load_prompt('document_summarizer')
class DocumentSummarizerAgent:
    """
    A service to generate and cache summaries for full source documents.

    The summary is produced by piping a prompt built from
    SUMMARY_PROMPT_TEMPLATE into the supplied LLM. Results are kept in an
    in-memory dict keyed by ``str(source_id)`` for the lifetime of the
    instance.
    """

    # Default character budget for the document text sent to the model.
    # Characters are used as a rough proxy for tokens; the original author's
    # estimate is that this stays within roughly a 16k-token context window.
    DEFAULT_MAX_CHARS = 50000

    def __init__(self, llm, max_chars: int = DEFAULT_MAX_CHARS):
        """
        Builds the prompt -> LLM chain and an empty summary cache.

        Args:
            llm: The model (or runnable) that the prompt is piped into.
            max_chars: Maximum number of leading characters of a document
                that are passed to the model; longer content is truncated.

        Raises:
            ValueError: If max_chars is not a positive number.
        """
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")
        self.llm = llm
        self.max_chars = max_chars
        self.prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
        self.chain = self.prompt | self.llm
        self._summary_cache: Dict[str, str] = {}
        logger.info("DocumentSummarizer service initialized.")

    def get_summary(self, source_id: int, local_path: str) -> Optional[str]:
        """
        Retrieves or generates a summary for a given source document.

        Uses an in-memory cache to avoid re-summarizing the same document.
        Only successful summaries are cached, so a failed attempt is retried
        on the next call.

        Args:
            source_id: The unique ID of the source document.
            local_path: The local file path to the full document content.

        Returns:
            The summary string, or None if the document could not be loaded
            or summarization fails.
        """
        cache_key = str(source_id)
        if cache_key in self._summary_cache:
            logger.debug("Returning cached summary for source_id: %s", source_id)
            return self._summary_cache[cache_key]

        logger.info(
            "Generating new summary for source_id: %s from path: %s",
            source_id,
            local_path,
        )
        full_content = load_document(local_path)
        if not full_content:
            logger.error(
                "Could not load document content from %s to generate summary.",
                local_path,
            )
            return None

        try:
            # Truncate content to fit the model context window if necessary.
            truncated_content = full_content[: self.max_chars]
            if len(full_content) > len(truncated_content):
                # Make the lossy step visible instead of truncating silently.
                logger.warning(
                    "Document for source_id %s truncated from %d to %d "
                    "characters before summarization.",
                    source_id,
                    len(full_content),
                    len(truncated_content),
                )
            response = self.chain.invoke({"document_content": truncated_content})
            # Chat models return a message object exposing `.content`; fall
            # back to the string form for anything else.
            summary = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            # Best-effort boundary: callers receive None rather than an
            # exception, but the full traceback is kept in the log.
            logger.exception(
                "Failed to generate summary for source_id %s: %s", source_id, e
            )
            return None

        self._summary_cache[cache_key] = summary
        logger.info(
            "Successfully generated and cached summary for source_id: %s",
            source_id,
        )
        return summary