Cybersecurity-Panel

Sleeping

Cybersecurity-Panel / multi_llm_chatbot_backend /app /utils /chat_summary.py

NeonClary

Include full chat history under 4k tokens, else LLM summary

38a5df6 about 2 months ago

8.5 kB

	from typing import Dict, List, Optional

	import logging
	import re

	from app.llm.llm_client import LLMClient
	from app.config import get_settings

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# System prompt passed to the LLM by generate_conversation_context_summary.
	# It asks for a short (under 300 words), third-person narrative summary that
	# keeps topics, conclusions, important facts and the overall tone.
	CONTEXT_SUMMARY_SYSTEM = (
	"You are a concise summarizer. Condense the following conversation into a short summary "
	"that preserves the key topics discussed, any conclusions reached, important facts shared, "
	"and the overall tone. Keep it under 300 words. Write in third person narrative form."
	)


	def _conversation_role_label(role: str, persona_names: Optional[Dict[str, str]] = None) -> str:
	if role == "user":
	return "User"
	if role == "assistant":
	return "Assistant"
	if persona_names and role in persona_names:
	return persona_names[role]
	return role.replace("_", " ").title()


	async def generate_conversation_context_summary(
	    messages: List[dict],
	    llm: LLMClient,
	    persona_names: Optional[Dict[str, str]] = None,
	    max_tokens: int = 1024,
	) -> str:
	    """Condense a chat history into a short narrative summary via the LLM.

	    Each message with non-blank ``content`` becomes one ``"<Label>: <text>"``
	    transcript line; the label comes from ``_conversation_role_label`` and a
	    missing ``role`` is treated as ``"user"``. The joined transcript is sent
	    as a single user turn together with ``CONTEXT_SUMMARY_SYSTEM``.

	    Returns the stripped summary text. An empty string is returned when there
	    is nothing to summarise, when the LLM yields a falsy result, or when the
	    LLM call raises (the error is logged, not propagated).
	    """
	    # Pair every message with its stripped content so blank ones can be dropped.
	    stripped_pairs = (
	        (entry, (entry.get("content") or "").strip()) for entry in messages
	    )
	    rendered = [
	        f"{_conversation_role_label(entry.get('role', 'user'), persona_names)}: {body}"
	        for entry, body in stripped_pairs
	        if body
	    ]

	    if not rendered:
	        return ""

	    try:
	        raw_summary = await llm.generate(
	            system_prompt=CONTEXT_SUMMARY_SYSTEM,
	            context=[{"role": "user", "content": "\n".join(rendered)}],
	            temperature=0.3,
	            max_tokens=max_tokens,
	        )
	        cleaned = (raw_summary or "").strip()
	    except Exception as exc:
	        # Best effort: callers get an empty summary instead of an exception.
	        logger.error("Conversation context summary failed: %s", exc)
	        return ""

	    return cleaned

	async def generate_summary_from_messages(messages: List[dict], llm: LLMClient, max_tokens: int = 800) -> str:
	    """
	    Produce a bullet-point summary of a chat using the given LLM client.

	    The messages are rendered as ``"<role>:\\n<content>"`` sections separated by
	    blank lines and sent to the LLM as one user turn, together with a system
	    prompt (personalised with the configured application title) that
	    describes the desired bullet/heading layout. The reply is stripped and
	    passed through ``_format_summary_text`` before being returned.

	    Any exception raised along the way is logged and replaced by a fixed
	    failure message, so this coroutine does not raise.
	    """
	    try:
	        app_title = get_settings().app.title
	        chat_log = "\n\n".join(f"{m['role']}:\n{m['content']}" for m in messages)

	        # Formatting instructions, kept as separate fragments for readability.
	        instruction_parts = (
	            f"You are an assistant for {app_title}. Summarize the following chat conversation ",
	            "into a well-formatted summary with clear bullet points. ",
	            "Please format your response as follows:\n",
	            "- Use bullet points (starting with *) for key insights\n",
	            "- Put each bullet point on a separate line\n",
	            "- Include section headings if appropriate (formatted as Section Name:)\n",
	            "- Focus on insights, questions, and actionable advice\n",
	            "- Maximum 10 bullet points\n\n",
	            "Example format:\n",
	            "Key Insights:\n",
	            "* First main point about the conversation\n",
	            "* Second important insight\n",
	            "* Third key takeaway\n\n",
	            "Recommendations:\n",
	            "* First actionable recommendation\n",
	            "* Second suggestion",
	        )

	        raw_summary = await llm.generate(
	            system_prompt="".join(instruction_parts),
	            context=[{"role": "user", "content": f"Chat Log:\n{chat_log}"}],
	            temperature=0.4,
	            max_tokens=max_tokens,
	        )

	        # Normalise bullet/heading layout of whatever the model produced.
	        return _format_summary_text(raw_summary.strip())

	    except Exception as e:
	        logger.error(f"Error generating summary: {str(e)}")
	        return "Summary generation failed. Please try again later."


	def _format_summary_text(summary_text: str) -> str:
	"""
	Post-process the summary text to ensure proper bullet point formatting.
	"""
	# Fix common formatting issues

	# Add line breaks before bullet points that don't have them
	summary_text = re.sub(r'(?<!\n)([*•] )', r'\n\1', summary_text)

	# Add line breaks before numbered lists that don't have them
	summary_text = re.sub(r'(?<!\n)(\d+\.\s+)', r'\n\1', summary_text)

	# Add line breaks after periods followed by capital letters (likely new sentences)
	summary_text = re.sub(r'(?<=[.!?])(?=\s[•]\s)', '\n', summary_text)

	# Clean up multiple consecutive newlines
	summary_text = re.sub(r'\n{3,}', '\n\n', summary_text)

	# Ensure bullet points are properly spaced
	summary_text = re.sub(r'\n([*•] )', r'\n\n\1', summary_text)

	# Fix section headings that might be run together
	summary_text = re.sub(r'([.!?])\s(\\[^]+\\)', r'\1\n\n\2', summary_text)

	return summary_text.strip()


	def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
	    """
	    Parse summary text into structured blocks for better formatting.

	    The text is first normalised with ``_format_summary_text`` and then read
	    line by line (blank lines are ignored). Each line becomes, in priority
	    order:

	    * a heading  - ``{"type": "heading", "text": ...}`` for a markdown-bold
	      line such as ``**Title**``, ``**Title:**`` or ``**Title**:``;
	    * a list item - consecutive items of the same style are grouped into one
	      ``{"type": "list", "style": "bullet" | "numbered", "items": [...]}``;
	    * a paragraph - ``{"type": "paragraph", "text": ...}``.

	    Args:
	        summary_text: Summary text as produced by the LLM.

	    Returns:
	        The blocks in document order.
	    """
	    # Headings are markdown-bold lines; the original pattern matched literal
	    # backslashes and therefore never recognised a heading.
	    heading_re = re.compile(r'^\*\*(.+?)\*\*:?$')
	    bullet_re = re.compile(r'^[*•-]\s+(.+)')
	    numbered_re = re.compile(r'^\d+\.\s+(.+)')

	    blocks: List[Dict] = []
	    open_list = None  # list block still accepting items, if any

	    for raw_line in _format_summary_text(summary_text).splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue

	        heading_match = heading_re.match(line)
	        if heading_match:
	            blocks.append({"type": "heading", "text": heading_match.group(1).strip()})
	            open_list = None
	            continue

	        # Bullets are tested before numbered items, as in the original order.
	        item_match = bullet_re.match(line)
	        item_style = "bullet"
	        if item_match is None:
	            item_match = numbered_re.match(line)
	            item_style = "numbered"

	        if item_match:
	            # Start a new list when none is open or the style switches.
	            if open_list is None or open_list["style"] != item_style:
	                open_list = {"type": "list", "style": item_style, "items": []}
	                blocks.append(open_list)
	            open_list["items"].append(item_match.group(1).strip())
	            continue

	        # Default: treat as paragraph (which also ends any open list).
	        blocks.append({"type": "paragraph", "text": line})
	        open_list = None

	    # Debug output to help troubleshoot
	    logger.info("[DEBUG] Parsed %d blocks from summary", len(blocks))
	    for i, block in enumerate(blocks):
	        if block["type"] == "list":
	            logger.info(
	                "Block %d: %s (%s) with %d items",
	                i, block["type"], block["style"], len(block["items"]),
	            )
	        else:
	            logger.info("Block %d: %s", i, block["type"])

	    return blocks


	def format_summary_for_text_export(summary_text: str) -> str:
	    """
	    Format summary text specifically for TXT and DOCX exports with proper line breaks.

	    The text is normalised with ``_format_summary_text``, blank lines are
	    dropped, and spacing is rebuilt so that:

	    * a markdown-bold heading (``**Title**``, ``**Title:**``) is surrounded by
	      one blank line (none above it when it is the first line),
	    * consecutive bullet lines (``*``, ``•`` or ``-``) stay together, with one
	      blank line before the first bullet of a group,
	    * every other line is a paragraph preceded by one blank line.

	    At most one blank line is emitted between any two lines and the result
	    has no trailing blank line.

	    Args:
	        summary_text: Summary text as produced by the LLM.

	    Returns:
	        The re-spaced text, lines joined with ``"\\n"``.
	    """
	    # Headings are markdown-bold lines; the original pattern matched literal
	    # backslashes and therefore never recognised a heading.
	    heading_re = re.compile(r'^\*\*(.+?)\*\*:?$')
	    bullet_re = re.compile(r'^[*•-]\s+')

	    out_lines: List[str] = []

	    def add_gap() -> None:
	        # Insert one separator line, never at the top and never twice in a row
	        # (a heading already leaves a blank line after itself).
	        if out_lines and out_lines[-1] != '':
	            out_lines.append('')

	    for raw_line in _format_summary_text(summary_text).split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        if heading_re.match(line):
	            add_gap()
	            out_lines.append(line)
	            out_lines.append('')  # Space after heading
	        elif bullet_re.match(line):
	            # Only the first bullet of a group gets a separator above it.
	            if not (out_lines and bullet_re.match(out_lines[-1])):
	                add_gap()
	            out_lines.append(line)
	        else:
	            # Regular paragraph
	            add_gap()
	            out_lines.append(line)

	    # A heading on the final line would otherwise leave a dangling blank line.
	    while out_lines and out_lines[-1] == '':
	        out_lines.pop()

	    return '\n'.join(out_lines)