import re
import html
import uuid
import logging
from typing import List, Tuple, Optional

# Configure logging
logger = logging.getLogger(__name__)

# Inline event-handler attributes such as onclick="..." or onload='...'.
# The leading \b keeps ordinary attributes that merely contain "on"
# (for example contact="...") from being partially deleted.
_EVENT_HANDLER_RE = re.compile(
    r"""\bon\w+\s*=\s*(?:"[^"]*"|'[^']*')""",
    re.IGNORECASE,
)
_ANGLE_BRACKETS = str.maketrans("", "", "<>")

# Patterns used by strip_html, compiled once at import time.
_LINE_BREAK_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
_BLOCK_CLOSE_RE = re.compile(r"</(?:p|div)\s*>", re.IGNORECASE)
_LIST_OPEN_RE = re.compile(r"<(?:ul|ol)\b[^>]*>", re.IGNORECASE)
_LIST_ITEM_RE = re.compile(r"<li\b[^>]*>", re.IGNORECASE)
_ANY_TAG_RE = re.compile(r"<[^>]+>")
_BLANK_RUN_RE = re.compile(r"\n\s*\n")

# A fenced code block: three backticks, an optional language tag, a newline,
# the code, and three closing backticks. The language tag accepts characters
# such as '+', '#', '.' and '-' so that "c++" or "c#" fences are recognised.
_CODE_FENCE_RE = re.compile(r"```([\w+#.-]*)[ \t]*\r?\n(.*?)```", re.DOTALL)

# NOTE(review): the stylesheet and highlighter tags are blank in this copy of
# the file, so only a newline is emitted for each. Put the project's real
# asset tags here to turn on syntax highlighting.
_HIGHLIGHT_CSS_TAG = "\n"
_HIGHLIGHT_JS_TAG = "\n"

# One delegated click handler serves every copy button on the page.
_COPY_SCRIPT = """
<script>
document.addEventListener('click', function (event) {
    var button = event.target.closest ? event.target.closest('.copy-button') : null;
    if (!button) { return; }
    var code = document.getElementById(button.getAttribute('data-target'));
    if (code && navigator.clipboard) {
        navigator.clipboard.writeText(code.textContent);
    }
});
</script>
"""

# Keyword lists are scanned in insertion order and matched as substrings, so
# the order matters: "JavaScript" must stay ahead of "Java", otherwise the
# text "javascript" would resolve to java.
_LANGUAGE_KEYWORDS = {
    "Python": ["python", "pandas", "numpy", "matplotlib", "dataframe"],
    "SQL": ["sql", "query", "database", "select", "join"],
    "JavaScript": ["javascript", "js", "react", "dom", "node"],
    "Java": ["java", "spring", "hibernate"],
    "C#": ["c#", "csharp", "dotnet", ".net"],
    "Power BI": ["dax", "powerbi", "power bi", "pbix"],
    "Data Visualization": ["visualization", "chart", "plot", "graph"],
    "HTML": ["html", "markup", "webpage"],
    "CSS": ["css", "stylesheet"],
    "Shell": ["bash", "shell", "command", "script"],
}


def sanitize_input(text: str, max_length: int = 2000) -> str:
    """Sanitize user input to reduce the risk of markup injection.

    Angle brackets are removed first, then inline event-handler attributes
    (``onclick="..."``, ``onload='...'``), and finally the result is cut to
    ``max_length`` characters.

    Args:
        text: User input text
        max_length: Maximum number of characters kept in the result

    Returns:
        Sanitized text, or an empty string when ``text`` is not a string
    """
    if not isinstance(text, str):
        # Best effort: bad input is logged and treated as empty, not raised.
        logger.error(
            "Error sanitizing input: expected str, got %s", type(text).__name__
        )
        return ""

    without_brackets = text.translate(_ANGLE_BRACKETS)
    without_handlers = _EVENT_HANDLER_RE.sub("", without_brackets)
    return without_handlers[:max_length]


def strip_html(text: str) -> str:
    """Remove HTML tags from text while preserving content structure.

    Line breaks, paragraph/div ends and list markup are turned into newlines
    and ``- `` bullets before the remaining tags are dropped.

    Args:
        text: HTML content to be stripped

    Returns:
        Plain text with tags removed, runs of blank lines collapsed to a
        single blank line, and leading/trailing whitespace trimmed
    """
    if not text:
        return ""

    # Replace line break tags with actual line breaks
    text = _LINE_BREAK_RE.sub("\n", text)
    # End of a paragraph or div becomes a blank line
    text = _BLOCK_CLOSE_RE.sub("\n\n", text)

    # Replace list tags with appropriate formatting
    text = _LIST_OPEN_RE.sub("\n", text)
    text = _LIST_ITEM_RE.sub("\n- ", text)

    # Remove remaining HTML tags
    clean_text = _ANY_TAG_RE.sub("", text)

    # Clean up extra whitespace
    clean_text = _BLANK_RUN_RE.sub("\n\n", clean_text)
    return clean_text.strip()


def _render_code_block(match: "re.Match[str]") -> str:
    """Render one fenced code block as escaped HTML with a copy button."""
    code_lang = match.group(1) or "text"
    code_content = match.group(2)
    # A short random suffix keeps ids distinct when a page holds several blocks.
    element_id = f"code-{uuid.uuid4().hex[:8]}"
    return (
        '<div class="code-block">\n'
        f'  <button type="button" class="copy-button" '
        f'data-target="{element_id}">Copy</button>\n'
        f'  <pre><code id="{element_id}" '
        f'class="language-{html.escape(code_lang)}">'
        f"{html.escape(code_content)}</code></pre>\n"
        "</div>"
    )


def inject_interactive_elements(html_str: str) -> str:
    """Turn fenced code blocks into HTML code blocks with copy buttons.

    Each fenced block is replaced by a ``<pre><code>`` element holding the
    HTML-escaped code, tagged with a ``language-<name>`` class and paired
    with a copy button. When at least one block was replaced, the highlighter
    asset tags are prepended and the copy script is appended.

    Args:
        html_str: HTML content with potential code blocks

    Returns:
        HTML content with interactive elements added. The input is returned
        unchanged when it is empty, contains no complete fenced block, or
        processing fails.
    """
    if not html_str or "```" not in html_str:
        return html_str

    try:
        result, block_count = _CODE_FENCE_RE.subn(_render_code_block, html_str)
    except Exception as e:
        # Best effort: fall back to the untouched input rather than fail.
        logger.error("Error adding interactive elements: %s", e)
        return html_str

    if block_count == 0:
        # Backticks were present but no complete fence; add no assets.
        return html_str

    return _HIGHLIGHT_CSS_TAG + _HIGHLIGHT_JS_TAG + result + _COPY_SCRIPT


def _match_language(haystack: str) -> Optional[str]:
    """Return the first language whose keyword occurs in lower-cased text."""
    for lang, keywords in _LANGUAGE_KEYWORDS.items():
        if any(keyword in haystack for keyword in keywords):
            return lang.lower()
    return None


def detect_language_from_context(question: str, topic: str) -> str:
    """Detect the programming language based on question and topic context.

    The topic is checked first; the question is consulted only when the topic
    yields no match. Matching is a case-insensitive substring search.

    Args:
        question: User's question text
        topic: Main topic of the query

    Returns:
        Lower-cased language name (for example ``"python"`` or ``"power bi"``),
        or ``"text"`` when nothing matches
    """
    for context in (topic, question):
        detected = _match_language((context or "").lower())
        if detected is not None:
            return detected
    return "text"


def truncate_text(text: str, max_length: int = 500, min_length: int = 200) -> str:
    """Truncate text to a maximum length while trying to preserve whole words.

    Args:
        text: Text to truncate
        max_length: Length above which the text is truncated
        min_length: Earliest position at which a word break is accepted

    Returns:
        The text unchanged when it fits, otherwise a prefix followed by
        ``"..."``. The ellipsis is appended after the cut, so the result can
        be up to three characters longer than ``max_length``.
    """
    if not text:
        return ""
    if len(text) <= max_length:
        return text

    # Prefer cutting at the last space between min_length and max_length.
    cut = text.rfind(" ", min_length, max_length)
    if cut <= 0:
        # No usable break point: fall back to a hard cut.
        cut = max_length
    return text[:cut] + "..."