import re
import html
import uuid
import logging
from typing import List, Tuple, Optional
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
# Characters that could open or close an HTML tag.
_ANGLE_BRACKETS_RE = re.compile(r'[<>]')
# Inline event-handler attributes such as onclick="..." or onload='...'.
# The leading \b keeps ordinary attributes that merely contain "on"
# (font="...", content="...") from being mangled.
_EVENT_HANDLER_RE = re.compile(
    r'''\bon\w+\s*=\s*(?:"[^"]*"|'[^']*')''',
    re.IGNORECASE,
)
# Upper bound on the length of the sanitized result.
_MAX_INPUT_LENGTH = 2000


def sanitize_input(text: str) -> str:
    """Sanitize user input to reduce the risk of markup injection.

    The steps are applied in this order:

    1. Remove every ``<`` and ``>`` character.
    2. Remove inline event-handler attributes (``on<name>="..."`` or
       ``on<name>='...'``, case-insensitive). Only attribute names that
       *start* with ``on`` are removed.
    3. Truncate the result to 2000 characters.

    Args:
        text: User input text.

    Returns:
        The sanitized text, or an empty string when ``text`` is not a
        string (the failure is logged).
    """
    try:
        sanitized = _ANGLE_BRACKETS_RE.sub('', text)
        sanitized = _EVENT_HANDLER_RE.sub('', sanitized)
    except TypeError as e:
        # re raises TypeError for None, bytes and other non-str input.
        logger.error("Error sanitizing input: %s", e)
        return ""
    return sanitized[:_MAX_INPUT_LENGTH]
def strip_html(text: str) -> str:
    """Remove HTML tags from text while preserving content structure.

    Block-level tags are converted to plain-text layout before the
    remaining tags are dropped:

    * ``<br>`` / ``<br/>`` become a single newline.
    * ``<p>`` and ``</p>`` become a blank line.
    * ``<ul>`` / ``<ol>`` (opening and closing) become a newline.
    * ``<li>`` starts a new line prefixed with ``"- "``.

    Tag matching is case-insensitive and tolerates attributes.

    Args:
        text: HTML content to be stripped.

    Returns:
        Plain text with the tags removed, runs of blank lines collapsed
        to one, and leading/trailing whitespace stripped. Returns an
        empty string for empty or ``None`` input.
    """
    if not text:
        return ""

    # Replace line break tags with actual line breaks.
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # (?:\s[^>]*)? allows attributes but keeps <pre>, <param>, ... intact.
    text = re.sub(r'</?p(?:\s[^>]*)?>', '\n\n', text, flags=re.IGNORECASE)

    # Replace list tags with appropriate formatting.
    text = re.sub(r'</?(?:ul|ol)(?:\s[^>]*)?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<li(?:\s[^>]*)?>', '\n- ', text, flags=re.IGNORECASE)

    # Remove remaining HTML tags.
    clean_text = re.sub(r"<[^>]+>", "", text)

    # Collapse any run of blank (or whitespace-only) lines into one.
    clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
    return clean_text.strip()
def inject_interactive_elements(html_str: str) -> str:
    """Wrap fenced code blocks in interactive HTML markup.

    Each Markdown-style fence (three backticks, an optional language
    tag, a newline, the code, three backticks) is replaced by:

    * a "Copy" button bound to the block through a short random id,
    * a ``<pre><code class="language-...">`` element holding the
      HTML-escaped code, so a syntax highlighter can pick it up.

    When at least one fence was replaced, the highlight.js stylesheet
    and script are prepended and a small script providing the copy
    handler is appended.

    Args:
        html_str: HTML content with potential code blocks.

    Returns:
        The content with interactive elements added. The input is
        returned unchanged when it is empty, contains no complete
        fence, or processing fails (the failure is logged).
    """
    if not html_str or '```' not in html_str:
        return html_str

    def add_copy_button(match):
        # A fence without a language tag is labelled as plain text.
        code_lang = match.group(1) or "text"
        code_content = match.group(2)
        # Short random suffix ties each button to its own <code> element.
        button_id = str(uuid.uuid4())[:8]
        return (
            f'\n<div class="code-block">\n'
            f'<button class="copy-button" data-target="code-{button_id}" '
            f'onclick="copyCode(this.dataset.target)">Copy</button>\n'
            f'<pre><code id="code-{button_id}" class="language-{code_lang}">'
            f'{html.escape(code_content)}</code></pre>\n'
            f'</div>\n'
        )

    try:
        # DOTALL lets a code block span several lines.
        result, replaced = re.subn(
            r'```(\w*)\n(.*?)```', add_copy_button, html_str, flags=re.DOTALL
        )
    except Exception as e:
        # Best effort: never let decoration break rendering of the content.
        logger.error("Error adding interactive elements: %s", e)
        return html_str

    if not replaced:
        # Backticks were present but formed no complete fence.
        return html_str

    # NOTE(review): the CDN host and highlight.js version are assumptions;
    # confirm them against the assets the front end actually ships.
    css_link = (
        '<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/'
        'highlight.js/11.9.0/styles/default.min.css">\n'
    )
    hljs_script = (
        '<script src="https://cdnjs.cloudflare.com/ajax/libs/'
        'highlight.js/11.9.0/highlight.min.js"></script>\n'
    )
    js_script = """
<script>
function copyCode(targetId) {
    var node = document.getElementById(targetId);
    if (node && navigator.clipboard) {
        navigator.clipboard.writeText(node.textContent);
    }
}
if (window.hljs) {
    hljs.highlightAll();
}
</script>
"""
    return css_link + hljs_script + result + js_script
# Language label -> lower-case indicator keywords. Insertion order matters:
# the first label with a matching keyword wins, which is why "JavaScript"
# is listed before "Java" and "Shell" (whose "script" keyword it contains).
_LANGUAGE_KEYWORDS = {
    "Python": ("python", "pandas", "numpy", "matplotlib", "dataframe"),
    "SQL": ("sql", "query", "database", "select", "join"),
    "JavaScript": ("javascript", "js", "react", "dom", "node"),
    "Java": ("java", "spring", "hibernate"),
    "C#": ("c#", "csharp", "dotnet", ".net"),
    "Power BI": ("dax", "powerbi", "power bi", "pbix"),
    "Data Visualization": ("visualization", "chart", "plot", "graph"),
    "HTML": ("html", "markup", "webpage"),
    "CSS": ("css", "stylesheet"),
    "Shell": ("bash", "shell", "command", "script"),
}


def detect_language_from_context(question: str, topic: str) -> str:
    """Detect the programming language based on question and topic context.

    The topic is scanned first and the question second; within each, the
    labels are tried in declaration order and the first one with a keyword
    occurring in the text wins. Matching is a case-insensitive *substring*
    test, so short keywords can match inside longer words (``"js"`` is
    found in ``"json"``).

    Args:
        question: User's question text. ``None`` or empty is skipped.
        topic: Main topic of the query. ``None`` or empty is skipped.

    Returns:
        The lower-cased label of the first match (for example ``"python"``
        or ``"power bi"``), or ``"text"`` when nothing matches.
    """
    # A hit in the topic takes precedence over anything in the question.
    for context in (topic, question):
        haystack = (context or "").lower()
        if not haystack:
            continue
        for lang, keywords in _LANGUAGE_KEYWORDS.items():
            if any(keyword in haystack for keyword in keywords):
                return lang.lower()
    return "text"
def truncate_text(text: str, max_length: int = 500, min_length: int = 200) -> str:
    """Shorten text to roughly ``max_length`` characters, ending with "...".

    Text that already fits is returned untouched. Otherwise the cut is made
    at the last space found between ``min_length`` and ``max_length``; when
    no such space exists the text is cut at ``max_length`` exactly. The
    three-character ellipsis is appended after the cut, so the result can
    be up to three characters longer than ``max_length``.

    Args:
        text: Text to truncate.
        max_length: Upper bound for the kept part of the text.
        min_length: Earliest position at which a space may be used as the
            cut point.

    Returns:
        The original text, the shortened text followed by "...", or an
        empty string when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    if len(text) > max_length:
        # Prefer a word boundary inside the [min_length, max_length) window.
        cut = text.rfind(' ', min_length, max_length)
        if cut <= 0:
            # No usable space: fall back to a hard cut.
            cut = max_length
        return text[:cut] + "..."

    return text