Spaces:

nagur-shareef-shaik
/

InsuCompass-API

Sleeping

App Files Files Community

InsuCompass-API / insucompass /core /agents /document_summarizer.py

nagur-shareef-shaik

Add Application Code

cd6f412 8 months ago

raw

history blame contribute delete

2.66 kB

	import logging
	from typing import Dict, Optional
	from langchain_core.prompts import PromptTemplate

	from scripts.data_processing.document_loader import load_document

	from insucompass.config import settings
	from insucompass.prompts.prompt_loader import load_prompt

	# Configure logging
	logging.basicConfig(level=settings.LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	SUMMARY_PROMPT_TEMPLATE = load_prompt('document_summarizer')

	class DocumentSummarizerAgent:
	"""
	A service to generate and cache summaries for full source documents.
	"""
	def __init__(self, llm):
	self.llm = llm
	self.prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
	self.chain = self.prompt \| self.llm
	self._summary_cache: Dict[str, str] = {}
	logger.info("DocumentSummarizer service initialized.")

	def get_summary(self, source_id: int, local_path: str) -> Optional[str]:
	"""
	Retrieves or generates a summary for a given source document.
	Uses an in-memory cache to avoid re-summarizing the same document.

	Args:
	source_id: The unique ID of the source document.
	local_path: The local file path to the full document content.

	Returns:
	The summary string, or None if summarization fails.
	"""
	cache_key = str(source_id)
	if cache_key in self._summary_cache:
	logger.debug(f"Returning cached summary for source_id: {source_id}")
	return self._summary_cache[cache_key]

	logger.info(f"Generating new summary for source_id: {source_id} from path: {local_path}")

	full_content = load_document(local_path)
	if not full_content:
	logger.error(f"Could not load document content from {local_path} to generate summary.")
	return None

	try:
	# Truncate content to fit model context window if necessary
	# A safe number for many models is around 16k tokens, let's use chars as a proxy
	max_chars = 50000
	truncated_content = full_content[:max_chars]

	response = self.chain.invoke({"document_content": truncated_content})
	summary = response.content if hasattr(response, 'content') else str(response)

	self._summary_cache[cache_key] = summary
	logger.info(f"Successfully generated and cached summary for source_id: {source_id}")
	return summary
	except Exception as e:
	logger.error(f"Failed to generate summary for source_id {source_id}: {e}")
	return None