File size: 7,306 Bytes
00bd2b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import re
import html
import uuid
import logging
from typing import List, Tuple, Optional
# Configure logging
logger = logging.getLogger(__name__)
def sanitize_input(text: str, max_length: int = 2000) -> str:
    """Sanitize user input to reduce the risk of markup/script injection.

    Angle brackets are removed (so no HTML tag survives), inline event-handler
    attributes such as ``onclick="..."`` or ``onload='...'`` are stripped, and
    the result is capped at ``max_length`` characters.

    Args:
        text: User input text.
        max_length: Maximum number of characters to keep (default 2000).

    Returns:
        The sanitized text, or an empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Best-effort contract: bad input yields "" instead of an exception.
        logger.error(
            "Error sanitizing input: expected str, got %s", type(text).__name__
        )
        return ""

    # Drop angle brackets so that no HTML tag can survive.
    sanitized = re.sub(r'[<>]', '', text)

    # Strip inline event handlers. The look-behind keeps the pattern from
    # firing in the middle of another attribute name (e.g. the "ont" inside
    # font="..." or the "only" inside data-only="..."); both quote styles and
    # optional whitespace around "=" are accepted.
    sanitized = re.sub(
        r'''(?<![\w-])on\w+\s*=\s*(?:"[^"]*"|'[^']*')''',
        '',
        sanitized,
        flags=re.IGNORECASE,
    )

    # Cap the length; a negative limit is treated as zero rather than being
    # interpreted as a slice offset from the end.
    return sanitized[:max(max_length, 0)]
def strip_html(text: str) -> str:
    """Remove HTML tags from text while preserving content structure.

    Line-break, paragraph, div and list tags are turned into newlines or
    ``- `` bullets before every remaining tag is dropped. Tag matching is
    case-insensitive and tolerates attributes and inner whitespace, so
    ``<br />``, ``<BR>`` and ``<li class="x">`` are handled like their plain
    lowercase forms. HTML entities are left untouched.

    Args:
        text: HTML content to be stripped.

    Returns:
        Plain text with HTML tags removed and surrounding whitespace trimmed;
        an empty string when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    # Structural tags -> whitespace. "\b" stops "<br" from matching longer
    # tag names and "<li" from matching "<link>".
    text = re.sub(r'<br\b[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</(?:p|div)\s*>', '\n\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?(?:ul|ol)\b[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<li\b[^>]*>', '\n- ', text, flags=re.IGNORECASE)

    # Remove every remaining tag.
    clean_text = re.sub(r'<[^>]+>', '', text)

    # Collapse any run of blank/whitespace-only lines into one blank line.
    clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
    return clean_text.strip()
# Assets added once around the content when at least one code block is rendered.
_HLJS_CSS_LINK = '<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github.min.css">\n'
_HLJS_SCRIPT_TAG = '<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>\n'
_COPY_CODE_SCRIPT = """
<script>
function copyCode(elementId) {
    const button = document.getElementById('copy-btn-' + elementId);
    const codeBlock = button.nextElementSibling.querySelector('code');
    const text = codeBlock.textContent;
    navigator.clipboard.writeText(text).then(() => {
        const originalText = button.textContent;
        button.textContent = 'Copied!';
        setTimeout(() => {
            button.textContent = originalText;
        }, 2000);
    }).catch(err => {
        console.error('Failed to copy: ', err);
        button.textContent = 'Failed';
        setTimeout(() => {
            button.textContent = 'Copy';
        }, 2000);
    });
}
// Initialize syntax highlighting
document.addEventListener('DOMContentLoaded', (event) => {
    document.querySelectorAll('pre code').forEach((el) => {
        hljs.highlightElement(el);
    });
});
</script>
"""

# Fenced block: ```<optional language>\n<code>``` (code may span lines).
_FENCED_CODE_RE = re.compile(r'```(\w*)\n(.*?)```', re.DOTALL)


def _render_code_block(match: "re.Match[str]") -> str:
    """Render one fenced code block as <pre><code> with a Copy button."""
    code_lang = match.group(1) or "text"
    code_content = match.group(2)
    # Short random id tying the button to the copyCode() call in its onclick.
    button_id = uuid.uuid4().hex[:8]
    # <pre>, <code> and the escaped code stay on one line so that no template
    # whitespace is rendered as part of the code.
    return (
        '\n<div style="position: relative; margin: 10px 0;">\n'
        f'<button id="copy-btn-{button_id}" onclick="copyCode(\'{button_id}\')"\n'
        'style="position: absolute; top: 5px; right: 5px; z-index: 10;\n'
        'background: #f0f0f0; border: 1px solid #ccc; border-radius: 4px;\n'
        'padding: 4px 8px; cursor: pointer; font-size: 12px;">\n'
        'Copy\n'
        '</button>\n'
        '<pre style="padding: 20px 10px 10px 10px; border-radius: 8px;\n'
        'background: #f8f8f8; overflow-x: auto; position: relative;">'
        f'<code class="language-{code_lang}">{html.escape(code_content)}</code>'
        '</pre>\n'
        '</div>\n'
    )


def inject_interactive_elements(html_str: str) -> str:
    """Turn fenced code blocks in ``html_str`` into interactive HTML.

    Every block of the form ```` ```lang\\ncode``` ```` is replaced by a
    ``<pre><code class="language-lang">`` element (``lang`` defaults to
    ``text``) with an HTML-escaped body and a "Copy" button. When at least
    one block was replaced, the highlight.js stylesheet and script are
    prepended and the ``copyCode`` / highlighting script is appended.

    Args:
        html_str: Content that may contain fenced code blocks.

    Returns:
        The content with interactive elements added, or ``html_str``
        unchanged when it is empty, contains no complete fenced block, or
        processing fails.
    """
    if not html_str or '```' not in html_str:
        return html_str

    try:
        result, block_count = _FENCED_CODE_RE.subn(_render_code_block, html_str)
    except Exception as e:
        # Best effort: never let rendering problems lose the original content.
        logger.error("Error adding interactive elements: %s", e)
        return html_str

    if block_count == 0:
        # A stray "```" that is not a complete fenced block: adding the
        # stylesheet and scripts would only bloat the output.
        return html_str

    return _HLJS_CSS_LINK + _HLJS_SCRIPT_TAG + result + _COPY_CODE_SCRIPT
def detect_language_from_context(question: str, topic: str) -> str:
    """Detect the programming language based on question and topic context.

    The topic is searched first, then the question. Languages are tried in
    the order of the keyword table and the first one with any keyword found
    in the text wins.

    Matching is a case-insensitive *substring* test, so a keyword also hits
    inside longer words (e.g. "dom" inside "random").
    NOTE(review): confirm whether whole-word matching would be preferable.

    Args:
        question: User's question text (``None`` or empty is allowed).
        topic: Main topic of the query (``None`` or empty is allowed).

    Returns:
        The lower-cased name of the detected language/category (for example
        ``"python"`` or ``"power bi"``), or ``"text"`` when nothing matches.
    """
    # Language mapping with common indicators (all keywords are lower-case).
    mapping = {
        "Python": ["python", "pandas", "numpy", "matplotlib", "dataframe"],
        "SQL": ["sql", "query", "database", "select", "join"],
        "JavaScript": ["javascript", "js", "react", "dom", "node"],
        "Java": ["java", "spring", "hibernate"],
        "C#": ["c#", "csharp", "dotnet", ".net"],
        "Power BI": ["dax", "powerbi", "power bi", "pbix"],
        "Data Visualization": ["visualization", "chart", "plot", "graph"],
        "HTML": ["html", "markup", "webpage"],
        "CSS": ["css", "stylesheet"],
        "Shell": ["bash", "shell", "command", "script"],
    }

    def first_match(text):
        """Return the first language whose keywords occur in ``text``."""
        if not text:
            return None
        haystack = text.lower()  # lower-case once, not once per keyword
        for lang, keywords in mapping.items():
            if any(keyword in haystack for keyword in keywords):
                return lang.lower()
        return None

    # The topic takes precedence; the question is only a fallback.
    return first_match(topic) or first_match(question) or "text"
def truncate_text(text: str, max_length: int = 500, min_length: int = 200) -> str:
    """Truncate text to a maximum length while trying to preserve whole words.

    Text no longer than ``max_length`` is returned unchanged. Otherwise the
    text is cut at the last space located in ``[min_length, max_length)``,
    or at ``max_length`` exactly when no such space exists, and ``"..."`` is
    appended. The ellipsis is not counted against ``max_length``, so a
    truncated result can be up to ``max_length + 3`` characters long.

    Args:
        text: Text to truncate.
        max_length: Maximum number of characters kept from ``text``.
        min_length: Earliest position at which a word break is accepted.

    Returns:
        The original or truncated text; an empty string when ``text`` is
        empty/``None`` or ``max_length`` is not positive.
    """
    if not text or max_length <= 0:
        # A non-positive limit keeps nothing. Without this guard a negative
        # value would be read as a slice offset from the end of the text.
        return ""
    if len(text) <= max_length:
        return text

    # Clamp the search window so that a negative or oversized min_length is
    # not interpreted by rfind() as an offset from the end.
    search_start = max(0, min(min_length, max_length))

    # Try to find a natural break point.
    space_index = text.rfind(' ', search_start, max_length)
    if space_index > 0:
        return text[:space_index] + "..."

    # Fallback to simple truncation.
    return text[:max_length] + "..."