chatbot1 / src /utils.py
Jack-ki1's picture
Upload 16 files
00bd2b1 verified
import re
import html
import uuid
import logging
from typing import List, Tuple, Optional
# Configure logging
logger = logging.getLogger(__name__)
def sanitize_input(text: str) -> str:
"""Sanitize user input to prevent potential injection attacks.
Args:
text: User input text
Returns:
Sanitized text with safe characters only
"""
try:
# Remove any potentially harmful characters while preserving basic formatting
sanitized = re.sub(r'[<>]', '', text)
# Remove any JavaScript event handlers
sanitized = re.sub(r'on\w+="[^"]*"', '', sanitized, flags=re.IGNORECASE)
# Limit length with increased capacity
return sanitized[:2000]
except Exception as e:
logger.error(f"Error sanitizing input: {e}")
return ""
def strip_html(text: str) -> str:
"""Remove HTML tags from text while preserving content structure.
Args:
text: HTML content to be stripped
Returns:
Plain text with HTML tags removed but content structure preserved
"""
if not text:
return ""
# Replace line break tags with actual line breaks
text = text.replace('<br>', '\n')
text = text.replace('<br/>', '\n')
text = text.replace('</p>', '\n\n')
text = text.replace('</div>', '\n\n')
# Replace list tags with appropriate formatting
text = re.sub(r'</?ul>', '\n', text)
text = re.sub(r'</?ol>', '\n', text)
text = re.sub(r'<li>', '\n- ', text)
# Remove remaining HTML tags
clean_text = re.sub(r"<[^>]+>", "", text)
# Clean up extra whitespace
clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
return clean_text.strip()
def inject_interactive_elements(html_str: str) -> str:
"""
Add interactive elements to HTML content like:
- Copy buttons for code blocks
- Expandable sections for long content
- Syntax highlighting
Args:
html_str: HTML content with potential code blocks
Returns:
HTML content with interactive elements added
"""
if not html_str or '```' not in html_str:
return html_str
import re
# Add copy buttons to code blocks
def add_copy_button(match):
code_content = match.group(2)
code_lang = match.group(1) if match.group(1) else "text"
button_id = str(uuid.uuid4())[:8]
return f'''
<div style="position: relative; margin: 10px 0;">
<button id="copy-btn-{button_id}" onclick="copyCode('{button_id}')"
style="position: absolute; top: 5px; right: 5px; z-index: 10;
background: #f0f0f0; border: 1px solid #ccc; border-radius: 4px;
padding: 4px 8px; cursor: pointer; font-size: 12px;">
Copy
</button>
<pre style="padding: 20px 10px 10px 10px; border-radius: 8px;
background: #f8f8f8; overflow-x: auto; position: relative;">
<code class="language-{code_lang}">{html.escape(code_content)}</code>
</pre>
</div>
'''
# Process code blocks with language specification
try:
result = re.sub(r'```(\w*)\n(.*?)```', add_copy_button, html_str, flags=re.DOTALL)
# Add JavaScript for copy functionality
js_script = """
<script>
function copyCode(elementId) {
const button = document.getElementById('copy-btn-' + elementId);
const codeBlock = button.nextElementSibling.querySelector('code');
const text = codeBlock.textContent;
navigator.clipboard.writeText(text).then(() => {
const originalText = button.textContent;
button.textContent = 'Copied!';
setTimeout(() => {
button.textContent = originalText;
}, 2000);
}).catch(err => {
console.error('Failed to copy: ', err);
button.textContent = 'Failed';
setTimeout(() => {
button.textContent = 'Copy';
}, 2000);
});
}
// Initialize syntax highlighting
document.addEventListener('DOMContentLoaded', (event) => {
document.querySelectorAll('pre code').forEach((el) => {
hljs.highlightElement(el);
});
});
</script>
"""
# Add syntax highlighting CSS if needed
css_link = '<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github.min.css">\n'
hljs_script = '<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>\n'
# Add the script and CSS if we have code blocks
result = css_link + hljs_script + result + js_script
return result
except Exception as e:
logger.error(f"Error adding interactive elements: {e}")
return html_str
def detect_language_from_context(question: str, topic: str) -> str:
"""Detect the programming language based on question and topic context.
Args:
question: User's question text
topic: Main topic of the query
Returns:
Detected programming language code
"""
# Language mapping with common indicators
mapping = {
"Python": ["python", "pandas", "numpy", "matplotlib", "dataframe"],
"SQL": ["sql", "query", "database", "select", "join"],
"JavaScript": ["javascript", "js", "react", "dom", "node"],
"Java": ["java", "spring", "hibernate"],
"C#": ["c#", "csharp", "dotnet", ".net"],
"Power BI": ["dax", "powerbi", "power bi", "pbix"],
"Data Visualization": ["visualization", "chart", "plot", "graph"],
"HTML": ["html", "markup", "webpage"],
"CSS": ["css", "stylesheet"],
"Shell": ["bash", "shell", "command", "script"]
}
# Check topic first with exact matches
for lang, keywords in mapping.items():
for keyword in keywords:
if keyword.lower() in topic.lower():
return lang.lower()
# Check question for additional clues
question_lower = question.lower()
for lang, keywords in mapping.items():
for keyword in keywords:
if keyword.lower() in question_lower:
return lang.lower()
return "text"
def truncate_text(text: str, max_length: int = 500, min_length: int = 200) -> str:
"""Truncate text to a maximum length while trying to preserve meaningful content.
Args:
text: Text to truncate
max_length: Maximum length for the truncated text
min_length: Minimum length before adding ellipsis
Returns:
Truncated text with ellipsis if needed
"""
if not text:
return ""
if len(text) <= max_length:
return text
# Try to find a natural break point
space_index = text.rfind(' ', min_length, max_length)
if space_index > 0:
return text[:space_index] + "..."
# Fallback to simple truncation
return text[:max_length] + "..."