import logging
import re
from typing import Optional, Dict, Any, List

logger = logging.getLogger(__name__)


class OutputFormatter:
    """
    Formats model responses for better presentation and usability.

    ``format_response`` is the main entry point: it normalises whitespace and
    then dispatches to a specialization-specific post-processor looked up in
    ``self.post_processors`` (falling back to the ``"default"`` entry).
    """

    # Existing fenced code blocks; an unterminated fence runs to end of text.
    _FENCE_RE = re.compile(r'(```.*?(?:```|\Z))', re.DOTALL)

    # Heuristics for unfenced code: a keyword-led line followed by indented
    # lines, or a single-line SQL statement terminated by ';'.
    _CODE_PATTERNS = (
        re.compile(
            r'((?:^|\n)(?:def |class |import |function |public |private |var |let '
            r'|const |if |for |while ).+(?:\n[ \t]+.+)+)'
        ),
        re.compile(
            r'((?:^|\n)(?:SELECT |INSERT |UPDATE |DELETE |CREATE |ALTER |DROP )'
            r'.+(?:;)(?:\n|$))'
        ),
    )

    # A block that *starts* with a SQL statement keyword is SQL, regardless of
    # what identifiers it contains.
    _SQL_START_RE = re.compile(
        r'\A\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP) '
    )

    # Checked in order, most specific markers first, so that loose markers
    # (e.g. "class ", "int ") cannot shadow a more specific language.
    _LANGUAGE_MARKERS = (
        ("c++", re.compile(r'#include|int main|std::|printf|scanf')),
        ("python", re.compile(
            r'\bdef |import |if __name__ ==|(?<![.\w])print\('
            r'|class \w+(?:\([^)]*\))?\s*:\s*(?:#.*)?$',
            re.MULTILINE,
        )),
        ("javascript", re.compile(r'function |var |const |let |=> |document\.')),
        ("java", re.compile(
            r'public |private |class .+ \{|void |String |\bint |boolean'
        )),
        ("sql", re.compile(
            r'SELECT |INSERT |UPDATE |DELETE |CREATE TABLE|ALTER TABLE'
        )),
    )

    # Spans that are already delimited as math and must not be re-wrapped.
    _MATH_SPAN_RE = re.compile(r'(\$\$.+?\$\$|\$[^$\n]+\$)', re.DOTALL)
    _EQUATION_ENV_RE = re.compile(
        r'\\begin\{equation\}(.+?)\\end\{equation\}', re.DOTALL
    )
    # Bare LaTeX fragments that should be wrapped in inline math delimiters.
    _INLINE_MATH_RE = re.compile(
        r'\\frac\{.+?\}\{.+?\}'
        r'|\\(?:sum|int|lim)_(?:\{[^{}]*\}|\w)?(?:\^(?:\{[^{}]*\}|\w))?'
        r'|\b[a-zA-Z]_[0-9]+\b'
        r'|\b[a-zA-Z]\\in(?![a-zA-Z])'
    )

    _TECHNICAL_TERMS = (
        "neural network", "machine learning", "deep learning", "algorithm",
        "regression", "classification", "clustering", "backpropagation",
        "gradient descent", "optimization", "hyperparameter",
    )
    # Whole-word, case-insensitive; skips terms already adjacent to '*' or '_'
    # so that re-running the formatter does not stack emphasis markers.
    _TECHNICAL_TERM_RE = re.compile(
        r'(?<![*_])\b('
        + '|'.join(re.escape(term) for term in _TECHNICAL_TERMS)
        + r')\b(?![*_])',
        re.IGNORECASE,
    )

    def __init__(self):
        """
        Initialize the OutputFormatter.

        Builds the mapping from specialization name to post-processor.
        """
        self.post_processors = {
            "programming_software_dev": self._format_code,
            "mbpp": self._format_code,
            "machine_learning_ai_data_science": self._format_technical_content,
            "mathematics": self._format_equations,
            "default": self._default_formatter,
        }
        logger.info("OutputFormatter initialized")

    def format_response(self, response: str, specialization: Optional[str] = None) -> str:
        """
        Format the model response based on specialization.

        Args:
            response: The raw response from the model
            specialization: The specialization area (optional); unknown or
                missing values use the "default" post-processor

        Returns:
            Formatted response, or "" when ``response`` is empty/None
        """
        if not response:
            return ""

        # Apply basic formatting to all responses
        formatted_response = self._clean_whitespace(response)

        # Apply specialization-specific formatting
        processor = self.post_processors.get(
            specialization, self.post_processors["default"]
        )
        return processor(formatted_response)

    def _clean_whitespace(self, text: str) -> str:
        """
        Clean up excessive whitespace.

        Collapses 3+ consecutive newlines to a blank line and runs of spaces
        *inside* a line to a single space, then strips the ends. Leading
        indentation is deliberately left alone: collapsing it would turn
        nested code such as ``"    if x:\\n        return 1"`` into invalid code
        before ``_format_code`` ever sees it.
        """
        # Replace multiple newlines with double newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Collapse space runs that follow a non-space character (i.e. not
        # indentation at the start of a line)
        text = re.sub(r'(?<=\S) {2,}', ' ', text)
        return text.strip()

    def _format_code(self, text: str) -> str:
        """
        Wrap unmarked code in markdown fenced code blocks.

        Text that is already inside a ``` fence is left untouched so that
        fenced code is never wrapped a second time. Fences are re-detected
        before each heuristic so a block wrapped by one pattern is protected
        from the next.
        """
        for pattern in self._CODE_PATTERNS:
            text = self._sub_outside_fences(pattern, text)
        return text

    def _sub_outside_fences(self, pattern, text: str) -> str:
        """Apply ``pattern`` only to the parts of ``text`` outside ``` fences."""
        # re.split with a capturing group: even indices are outside fences,
        # odd indices are the fenced blocks themselves.
        pieces = self._FENCE_RE.split(text)
        pieces[::2] = [
            pattern.sub(self._wrap_code_match, piece) for piece in pieces[::2]
        ]
        return "".join(pieces)

    def _wrap_code_match(self, match) -> str:
        """Return the matched code wrapped in a language-tagged fence."""
        # The patterns capture the newline that anchors the match; drop it so
        # the fence does not begin or end with an empty line.
        code_block = match.group(1).strip("\n")
        lang = self._detect_language(code_block)
        return f"\n```{lang}\n{code_block}\n```\n"

    def _detect_language(self, code_block: str) -> str:
        """
        Attempt to detect the programming language from a code block.

        Returns one of "python", "javascript", "java", "c++", "sql", or ""
        (generic code block) when nothing matches. Detection is keyword-based
        and therefore only a heuristic.
        """
        if self._SQL_START_RE.match(code_block):
            return "sql"
        for language, marker in self._LANGUAGE_MARKERS:
            if marker.search(code_block):
                return language
        return ""  # Generic code block

    def _format_equations(self, text: str) -> str:
        """
        Format mathematical equations with LaTeX markers if needed.

        ``equation`` environments become ``$$...$$``; bare fragments
        (``\\frac{..}{..}``, ``\\sum_``/``\\int_``/``\\lim_`` with their
        limits, single-letter subscripts like ``x_1``, ``x\\in``) are wrapped
        in ``$...$``. Anything already inside ``$...$`` or ``$$...$$`` is left
        as is, so delimiters are never nested.
        """
        # Convert environments first so their contents count as "already math"
        text = self._EQUATION_ENV_RE.sub(r'$$\1$$', text)

        # Even indices are plain text, odd indices are existing math spans.
        pieces = self._MATH_SPAN_RE.split(text)
        pieces[::2] = [
            self._INLINE_MATH_RE.sub(r'$\g<0>$', piece) for piece in pieces[::2]
        ]
        return "".join(pieces)

    def _format_technical_content(self, text: str) -> str:
        """
        Emphasise known technical terms by wrapping them in ``*...*``.

        Matching is whole-word and case-insensitive; the original casing of
        the matched text is preserved.
        """
        return self._TECHNICAL_TERM_RE.sub(r'*\1*', text)

    def _default_formatter(self, text: str) -> str:
        """
        Default formatter that applies general improvements.

        Inserts a paragraph break between sentences and puts inline numbered
        list items on their own lines.
        """
        # Add paragraph breaks for readability when appropriate. A digit
        # before the period is excluded so list markers ("1. Item") are not
        # split from their text.
        text = re.sub(r'([^\W\d]\.)\s+([A-Z])', r'\1\n\n\2', text)

        # Format lists for readability if they're not already formatted.
        # NOTE(review): the original pattern was lost when this file was
        # extracted (the source is cut off right after "(?"). This is a
        # conservative reconstruction: a numbered marker that follows
        # sentence punctuation on the same line is moved to a new line.
        # Confirm against the original source.
        text = re.sub(r'(?<=[.:;!?])[ \t]+(?=\d{1,2}[.)][ \t]+\S)', '\n', text)

        return text

    # NOTE(review): the signature line of this method was lost in extraction;
    # only "-> str" and the docstring survived. The name and the parameter
    # annotation below are reconstructed - confirm against callers.
    def format_structured_output(self, data: Dict[str, Any]) -> str:
        """
        Format structured data outputs (like JSON) into readable text.

        Args:
            data: Dictionary containing structured data

        Returns:
            Formatted string representation. Non-dict input is returned as
            ``str(data)``. The "response" entry (if any) is run through
            ``format_response``; remaining keys that do not start with "_"
            are listed below a horizontal rule as ``**Title Cased Key**: value``.
        """
        if not isinstance(data, dict):
            return str(data)

        formatted_parts = []

        # Format main response if present
        if "response" in data:
            response = data["response"]
            # Structured payloads may carry non-string responses; coerce them
            # instead of failing inside the regex-based formatters.
            if response is not None and not isinstance(response, str):
                response = str(response)
            formatted_parts.append(self.format_response(response))

        # Add metadata in a clean format if needed
        metadata = {
            key: value
            for key, value in data.items()
            if key != "response" and not str(key).startswith("_")
        }

        if metadata:
            formatted_parts.append("\n\n---\n")
            formatted_parts.extend(
                f"**{str(key).replace('_', ' ').title()}**: {value}"
                for key, value in metadata.items()
            )

        return "\n".join(formatted_parts)