Spaces:

Pygmales
/

hsg_rag_eea

Sleeping

hsg_rag_eea / src /rag /response_formatter.py

Pygmales

updated project state

268baab about 2 months ago

5.98 kB

	"""
	Response formatter for handling long responses and table formatting.
	Ensures responses are mobile-friendly and appropriately sized.
	"""
	import re
	from config import MAX_RESPONSE_WORDS_LEAD, MAX_RESPONSE_WORDS_SUBAGENT
	from src.utils.logging import get_logger

	logger = get_logger("response_formatter")


	CONTINUATION_PROMPT = {
	'en': "Would you like me to continue with more details?",
	'de': "Möchten Sie, dass ich mit weiteren Details fortfahre?"
	}


	class ResponseFormatter:
	"""Formats agent responses for optimal display"""

	@staticmethod
	def count_words(text: str) -> int:
	"""Count words in text"""
	words = text.split()
	return len(words)

	@staticmethod
	def remove_tables(text: str) -> str:
	"""
	Convert markdown tables to bullet point lists.
	Tables don't display well on mobile devices.

	Args:
	text: Response text potentially containing tables

	Returns:
	Text with tables converted to bullet points
	"""
	# Pattern to match markdown tables
	table_pattern = r'\\|[^\n]+\\|\n\\|[-:\s\|]+\\|\n(\\|[^\n]+\\|\n)+'

	def table_to_bullets(match):
	table_text = match.group(0)
	lines = [line.strip() for line in table_text.split('\n') if line.strip()]

	if len(lines) < 3: # Not a valid table
	return table_text

	# Extract headers (first line)
	headers = [cell.strip() for cell in lines[0].split('\|') if cell.strip()]

	# Skip separator line (second line)
	# Process data rows
	bullet_points = []
	for line in lines[2:]:
	cells = [cell.strip() for cell in line.split('\|') if cell.strip()]
	if cells and len(cells) == len(headers):
	# Create bullet point from row
	row_text = ", ".join([
	f"{headers[i]}: {cells[i]}"
	for i in range(len(cells))
	if cells[i]
	])
	bullet_points.append(f"• {row_text}")

	return "\n".join(bullet_points)

	# Replace tables with bullet points
	formatted = re.sub(table_pattern, table_to_bullets, text)

	if formatted != text:
	logger.info("Converted table to bullet points for mobile-friendly display")

	return formatted

	@staticmethod
	def chunk_response(
	text: str,
	max_words: int = MAX_RESPONSE_WORDS_LEAD,
	language: str = 'en'
	) -> tuple[str, str \| None]:
	"""
	Split long response into current response and continuation.

	Args:
	text: Full response text
	max_words: Maximum words for current response
	language: Language code ('en' or 'de') for continuation prompt

	Returns:
	Tuple of (current_response, continuation_or_none)
	"""
	word_count = ResponseFormatter.count_words(text)

	if word_count <= max_words:
	return text, None

	# Need to chunk — preserve line structure (markdown formatting)
	logger.info(f"Response has {word_count} words, chunking to {max_words} words")

	lines = text.split('\n')
	current_lines = []
	current_word_count = 0

	for line in lines:
	line_words = len(line.split()) if line.strip() else 0
	if current_word_count + line_words > max_words and current_lines:
	break
	current_lines.append(line)
	current_word_count += line_words

	current = '\n'.join(current_lines)
	continuation = '\n'.join(lines[len(current_lines):])

	# Add continuation prompt in the correct language
	continuation_msg = CONTINUATION_PROMPT.get(language, CONTINUATION_PROMPT['en'])
	current += f"\n\n{continuation_msg}"

	return current, continuation

	@staticmethod
	def format_response(
	text: str,
	agent_type: str = 'lead',
	enable_chunking: bool = True,
	language: str = 'en'
	) -> str:
	"""
	Format response: remove tables and handle length.

	Args:
	text: Raw response text
	agent_type: 'lead' or 'subagent' (determines max length)
	enable_chunking: Whether to chunk long responses
	language: Language code ('en' or 'de') for any generated text

	Returns:
	Formatted response text
	"""
	# Remove tables
	formatted = ResponseFormatter.remove_tables(text)

	# Determine max words
	max_words = (
	MAX_RESPONSE_WORDS_LEAD
	if agent_type == 'lead'
	else MAX_RESPONSE_WORDS_SUBAGENT
	)

	# Handle chunking if enabled
	if enable_chunking:
	formatted, _continuation = ResponseFormatter.chunk_response(
	formatted,
	max_words,
	language
	)

	return formatted

	@staticmethod
	def clean_response(text: str) -> str:
	"""
	Clean up response text (remove extra whitespace, etc.)

	Args:
	text: Response text

	Returns:
	Cleaned text
	"""
	# Remove multiple consecutive newlines
	cleaned = re.sub(r'\n{3,}', '\n\n', text)

	# Remove trailing whitespace
	cleaned = cleaned.strip()

	return cleaned

	@staticmethod
	def format_name_of_university(formatted_response, language):
	if language == "en":
	pattern = r"Universität St\.Gallen"
	replace = "University of St.Gallen"
	formatted_response = re.sub(pattern, replace, formatted_response)

	return formatted_response