# Wildnerve-tlm01_Hybrid_Model / utils / output_formatter.py
# WildnerveAI's picture
# Upload 20 files
# 0861a59 verified
import logging
import re
from typing import Optional, Dict, Any, List
logger = logging.getLogger(__name__)
class OutputFormatter:
    """
    Formats model responses for better presentation and usability.

    Every response first gets a whitespace clean-up; a post-processor chosen
    by specialization (code fencing, LaTeX delimiters, term highlighting or
    general paragraph/list layout) is then applied on top. All detection is
    regex-based and heuristic.
    """

    # Markdown fenced code block. An unterminated fence (e.g. a truncated
    # response) is treated as running to the end of the text.
    _FENCED_BLOCK = re.compile(r'(```.*?(?:```|\Z))', re.DOTALL)

    # Heuristics for code that is not fenced yet: a keyword-led line followed
    # by indented lines, or a single-line SQL statement ending in ';'.
    _CODE_PATTERNS = (
        re.compile(
            r'((?:^|\n)(?:def |class |import |function |public |private |var |let |const |if |for |while ).+(?:\n[ \t]+.+)+)'
        ),
        re.compile(
            r'((?:^|\n)(?:SELECT |INSERT |UPDATE |DELETE |CREATE |ALTER |DROP ).+(?:;)(?:\n|$))'
        ),
    )

    # (language, signature) pairs checked in order; the first hit wins.
    # Distinctive markers come first so that generic ones cannot shadow them
    # (Python's "class " used to pre-empt Java's "class X {", and Java's
    # "int " used to pre-empt C/C++'s "int main").
    _LANGUAGE_SIGNATURES = (
        ("c++", re.compile(r'#include|std::|\bint main\b')),
        ("python", re.compile(r'\bdef |if __name__ ==')),
        ("java", re.compile(r'\bpublic |\bprivate |\bclass .+ \{')),
        # Generic markers, in their original priority order.
        ("python", re.compile(r'class |import |print\(')),
        ("javascript", re.compile(r'function |var |const |let |=> |document\.')),
        ("java", re.compile(r'void |String |int |boolean')),
        ("c++", re.compile(r'printf|scanf')),
        ("sql", re.compile(r'SELECT |INSERT |UPDATE |DELETE |CREATE TABLE|ALTER TABLE')),
    )

    # LaTeX equation environment, rewritten to $$ ... $$.
    _EQUATION_ENV = re.compile(r'\\begin\{equation\}(.+?)\\end\{equation\}', re.DOTALL)

    # Text that is already in math mode: $$display$$ or single-line $inline$.
    _MATH_SPAN = re.compile(r'(\$\$.*?\$\$|\$[^$\n]+?\$)', re.DOTALL)

    # A braced LaTeX argument, allowing one level of nested braces.
    _BRACED_ARG = r'\{(?:[^{}]|\{[^{}]*\})*\}'

    # Unmarked math fragments, matched in ONE pass so that a fragment wrapped
    # by one alternative is never re-wrapped by another.
    _UNMARKED_MATH = re.compile(
        # \frac{..}{..}
        r'\\frac' + _BRACED_ARG + _BRACED_ARG
        # \sum_ / \int_ / \lim_ together with their subscript and superscript
        + r'|\\(?:sum|int|lim)_(?:\{[^{}]*\}|[A-Za-z0-9])?(?:\^(?:\{[^{}]*\}|[A-Za-z0-9]))?'
        # x_1 as a standalone token (not the tail of an identifier like layer_1)
        + r'|(?<![A-Za-z0-9_\\{^])[a-zA-Z]_[0-9]+(?![A-Za-z0-9_])'
        # x\in, but not the start of a longer command such as x\infty
        + r'|(?<![A-Za-z0-9_\\{^])[a-zA-Z]\\in(?![A-Za-z])'
    )

    _TECHNICAL_TERMS = (
        "neural network", "machine learning", "deep learning", "algorithm",
        "regression", "classification", "clustering", "backpropagation",
        "gradient descent", "optimization", "hyperparameter",
    )

    # Whole-word term that is not directly followed by '*' or '_', which
    # keeps repeated formatting from stacking emphasis markers.
    _TECHNICAL_TERM_RE = re.compile(
        r'\b(' + '|'.join(map(re.escape, _TECHNICAL_TERMS)) + r')\b(?![*_])'
    )

    # Sentence end (letter + '.') followed by a capitalised word. A digit
    # before the '.' is excluded so list markers like "1." are not split.
    _SENTENCE_BREAK = re.compile(r'([^\W\d_]\.)\s+([A-Z])')

    # A numbered-list marker that appears mid-line after other content.
    _INLINE_LIST_MARKER = re.compile(r'(?<=\S)[ \t]+(\d+\.)\s+')

    def __init__(self):
        """
        Initialize the OutputFormatter.

        Builds the specialization -> post-processor dispatch table used by
        format_response.
        """
        self.post_processors = {
            "programming_software_dev": self._format_code,
            "mbpp": self._format_code,
            "machine_learning_ai_data_science": self._format_technical_content,
            "mathematics": self._format_equations,
            "default": self._default_formatter
        }
        logger.info("OutputFormatter initialized")

    def format_response(self, response: str, specialization: Optional[str] = None) -> str:
        """
        Format the model response based on specialization.

        Args:
            response: The raw response from the model
            specialization: The specialization area (optional); unknown or
                missing values fall back to the default formatter

        Returns:
            Formatted response ("" for an empty or missing response)
        """
        if not response:
            return ""

        # Apply basic formatting to all responses
        formatted_response = self._clean_whitespace(response)

        # Apply specialization-specific formatting
        processor = self.post_processors.get(specialization, self.post_processors["default"])
        return processor(formatted_response)

    @staticmethod
    def _sub_outside(protected, pattern, repl, text: str) -> str:
        """
        Apply pattern.sub(repl, ...) only to the parts of text that lie
        outside the spans matched by protected.

        protected must have exactly one capturing group around the whole
        match, so that split() alternates outside / protected / outside ...
        """
        pieces = protected.split(text)
        pieces[::2] = [pattern.sub(repl, piece) for piece in pieces[::2]]
        return "".join(pieces)

    def _clean_whitespace(self, text: str) -> str:
        """
        Clean up excessive whitespace.

        Runs of three or more newlines become a blank line and runs of spaces
        inside a line become one space. Leading indentation is left alone
        because it is significant in code that may be part of the response.
        """
        # Replace multiple newlines with double newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Collapse repeated spaces, but only after non-whitespace so that
        # line indentation survives
        text = re.sub(r'(?<=\S) {2,}', ' ', text)
        return text.strip()

    def _fence_code(self, match) -> str:
        """
        Wrap one matched code snippet in a markdown fence tagged with the
        detected language.
        """
        # The patterns capture the newline that anchors the match; drop it so
        # the fence does not start or end with an empty line.
        code_block = match.group(1).strip('\n')
        lang = self._detect_language(code_block)
        return f"\n```{lang}\n{code_block}\n```\n"

    def _format_code(self, text: str) -> str:
        """
        Format code blocks with proper syntax highlighting markers.

        Unmarked snippets that look like code are wrapped in markdown fences.
        Text that is already fenced (including fences added by an earlier
        pattern) is skipped, so fences are never nested.
        """
        for pattern in self._CODE_PATTERNS:
            text = self._sub_outside(self._FENCED_BLOCK, pattern, self._fence_code, text)
        return text

    def _detect_language(self, code_block: str) -> str:
        """
        Attempt to detect the programming language from a code block.

        Returns:
            One of "python", "javascript", "java", "c++", "sql", or "" when
            no signature matches (generic code block).
        """
        for language, signature in self._LANGUAGE_SIGNATURES:
            if signature.search(code_block):
                return language
        return ""  # Generic code block

    def _format_equations(self, text: str) -> str:
        """
        Format mathematical equations with LaTeX markers if needed.

        equation environments become $$...$$ blocks; unmarked fragments
        (\\frac, \\sum_/\\int_/\\lim_, x_1, x\\in) outside existing math
        spans are wrapped in $...$.
        """
        # Convert environments first so their bodies count as math below
        # and are not wrapped a second time.
        text = self._EQUATION_ENV.sub(lambda m: f"$${m.group(1)}$$", text)
        return self._sub_outside(
            self._MATH_SPAN,
            self._UNMARKED_MATH,
            lambda m: f"${m.group(0)}$",
            text,
        )

    def _format_technical_content(self, text: str) -> str:
        """
        Format technical content with proper highlighting of terms and concepts.

        Each known technical term (exact case, whole words only) is wrapped
        in '*' emphasis markers.
        """
        return self._TECHNICAL_TERM_RE.sub(r'*\1*', text)

    def _default_formatter(self, text: str) -> str:
        """
        Default formatter that applies general improvements.

        Inserts a paragraph break after a sentence that is followed by a
        capitalised word, and moves numbered-list markers that appear
        mid-line onto their own line. List markers already at the start of a
        line are left untouched.
        """
        # Add paragraph breaks for readability when appropriate
        text = self._SENTENCE_BREAK.sub(r'\1\n\n\2', text)
        # Format lists for readability if they're not already formatted
        text = self._INLINE_LIST_MARKER.sub(r'\n\1 ', text)
        return text

    def format_structured_output(self, data: Dict[str, Any]) -> str:
        """
        Format structured data outputs (like JSON) into readable text.

        Args:
            data: Dictionary containing structured data. The "response"
                entry is formatted with the default formatter; every other
                entry whose key does not start with "_" is listed as
                metadata below it.

        Returns:
            Formatted string representation (str(data) for non-dict input)
        """
        if not isinstance(data, dict):
            return str(data)

        formatted_parts = []

        # Format main response if present; non-string payloads are rendered
        # via str() instead of failing inside the regex helpers.
        response = data.get("response")
        if response is not None and not isinstance(response, str):
            response = str(response)
        if response:
            formatted_parts.append(self.format_response(response))

        # Add metadata in a clean format if needed
        metadata = {
            key: value
            for key, value in data.items()
            if key != "response" and not str(key).startswith("_")
        }
        if metadata:
            # The rule separates response from metadata; without a response
            # above it, it would just be a dangling line.
            if formatted_parts:
                formatted_parts.append("\n\n---\n")
            formatted_parts.extend(
                f"**{str(key).replace('_', ' ').title()}**: {value}"
                for key, value in metadata.items()
            )

        return "\n".join(formatted_parts)