# 3Stark123's picture
# Create utils/helpers.py
# 3f58cad verified
"""
Helper utility functions
"""
import functools
import hashlib
import logging
import os
import re
import uuid
from datetime import datetime
from typing import Dict, List, Any, Optional
# Module-level logger named after this module; log_performance writes its
# timing messages through it.
logger = logging.getLogger(__name__)
def sanitize_text(text: str) -> str:
    """Return a whitespace-normalized, punctuation-restricted copy of ``text``.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise the
    text is trimmed, every run of whitespace is collapsed to a single space,
    and any character that is not a word character, whitespace, or one of
    ``. , ! ? ; : - ( ) ' "`` is dropped.
    """
    if text:
        # Step 1: trim the ends and squeeze internal whitespace runs.
        collapsed = re.sub(r'\s+', ' ', text.strip())
        # Step 2: keep only word characters, whitespace and basic punctuation.
        return re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\'\"]', '', collapsed)
    return ""
def validate_hex_color(color: str) -> bool:
    """Return True when ``color`` is a ``#RGB`` or ``#RRGGBB`` hex color.

    Args:
        color: Candidate color string, e.g. ``"#fff"`` or ``"#1A2B3C"``.

    Returns:
        True only if the whole string is ``#`` followed by exactly three or
        six hexadecimal digits (either case). Empty, ``None`` and non-string
        values return False.
    """
    if not color or not isinstance(color, str):
        return False
    # fullmatch (rather than match + '$') so a trailing newline such as
    # "#fff\n" is rejected; '$' would match just before that newline.
    return re.fullmatch(r'#(?:[A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})', color) is not None
def generate_unique_id(content: str = "") -> str:
    """Generate a short identifier for ``content``.

    The ID is the first 8 hexadecimal characters of an MD5 digest computed
    over the content, the current local timestamp and a random nonce. MD5 is
    used only as a mixing function here, not for security.

    Args:
        content: Optional text to mix into the identifier.

    Returns:
        An 8-character lowercase hexadecimal string. With 32 bits of output,
        collisions are unlikely but not impossible.
    """
    timestamp = datetime.now().isoformat()
    # The timestamp alone is not enough: two calls with the same content in
    # the same clock tick would hash to the same ID. A random nonce makes
    # back-to-back calls independent of clock resolution.
    nonce = uuid.uuid4().hex
    digest = hashlib.md5(f"{content}{timestamp}{nonce}".encode()).hexdigest()
    return digest[:8]
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Shorten ``text`` to at most ``max_length`` characters.

    Args:
        text: The text to shorten.
        max_length: Maximum length of the returned string, suffix included.
        suffix: Marker appended when the text is cut.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the leading part
        of ``text`` (with surrounding whitespace stripped) followed by
        ``suffix``. When ``max_length`` is too small to hold the suffix, the
        text is hard-cut to ``max_length`` characters with no suffix, so the
        result never exceeds ``max_length``.
    """
    if len(text) <= max_length:
        return text

    keep = max_length - len(suffix)
    if keep < 0:
        # No room for the suffix. A negative slice index would count from the
        # end of the string and return something longer than max_length.
        return text[:max(max_length, 0)]

    return text[:keep].strip() + suffix
def extract_numbers(text: str) -> List[float]:
    """Return every number found in ``text`` as a float, in order.

    A number is an optional leading minus sign, one or more digits, and an
    optional fractional part (a dot followed by digits).
    """
    number_tokens = re.findall(r'-?\d+(?:\.\d+)?', text)
    return list(map(float, number_tokens))
def calculate_reading_time(text: str, wpm: int = 200) -> int:
    """Estimate how many minutes it takes to read ``text``.

    Words are counted by splitting on whitespace.

    Args:
        text: The text to measure.
        wpm: Reading speed in words per minute; expected to be positive.

    Returns:
        The estimate rounded to the nearest minute (halves round up), never
        less than 1.

    Raises:
        ZeroDivisionError: If ``wpm`` is 0.
    """
    word_count = len(text.split())
    # Round half up explicitly. Built-in round() rounds halves to the nearest
    # even integer, so 2.5 minutes became 2 while 1.5 became 2 and 3.5
    # became 4.
    minutes = int(word_count / wpm + 0.5)
    return max(1, minutes)
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string.

    Zero is reported as ``"0 B"``. Any other value is divided by 1024 until
    it drops below 1024 or the largest unit (GB) is reached, then formatted
    with one decimal place, e.g. ``"1.5 KB"``.
    """
    if size_bytes == 0:
        return "0 B"

    units = ("B", "KB", "MB", "GB")
    value = size_bytes
    # Walk every unit except the last; the last one absorbs whatever is left.
    for unit in units[:-1]:
        if value < 1024:
            return f"{value:.1f} {unit}"
        value = value / 1024.0
    return f"{value:.1f} {units[-1]}"
def safe_divide(a: float, b: float, default: float = 0.0) -> float:
    """Divide ``a`` by ``b``, falling back to ``default`` on failure.

    ``default`` is returned when ``b`` equals zero, or when the comparison
    or the division raises ``TypeError`` or ``ZeroDivisionError``.
    """
    try:
        if b != 0:
            return a / b
        return default
    except (TypeError, ZeroDivisionError):
        return default
def merge_dicts(dict1: Dict, dict2: Dict) -> Dict:
    """Return a new dictionary combining ``dict1`` and ``dict2`` recursively.

    The result starts as a shallow copy of ``dict1``. Each entry of ``dict2``
    then overrides it, except that when both sides hold a dictionary under
    the same key those two dictionaries are merged with this same function.
    """
    merged = dict1.copy()
    for key, incoming in dict2.items():
        existing = merged.get(key)
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Both sides are mappings: descend instead of overwriting.
            merged[key] = merge_dicts(existing, incoming)
            continue
        merged[key] = incoming
    return merged
def validate_content_length(text: str, min_length: int = 50, max_length: int = 15000) -> Dict[str, Any]:
    """Check whether the stripped length of ``text`` lies within bounds.

    Returns a dictionary with the verdict (``valid``), the measured length
    after stripping surrounding whitespace (``length``), the two bounds
    (``min_required`` and ``max_allowed``) and a short ``message``.
    """
    stripped_length = len(text.strip())
    within_bounds = min_length <= stripped_length <= max_length

    report: Dict[str, Any] = {}
    report['valid'] = within_bounds
    report['length'] = stripped_length
    report['min_required'] = min_length
    report['max_allowed'] = max_length
    report['message'] = f"Content length: {stripped_length} characters"
    return report
def extract_urls(text: str) -> List[str]:
    """Extract ``http://`` and ``https://`` URLs from ``text``.

    A URL is the scheme followed by the longest run of characters that
    RFC 3986 permits in a URI (unreserved, reserved and ``%``). This keeps
    ``~`` and ``#fragment`` parts intact and stops at ``<``, ``>``, quotes
    and whitespace. Punctuation that belongs to the surrounding sentence
    rather than the URL is then trimmed from the end.

    Args:
        text: Text to scan.

    Returns:
        The URLs in order of appearance; an empty list when none are found.
    """
    url_pattern = r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    urls = []
    for match in re.finditer(url_pattern, text):
        url = match.group(0)
        while url:
            last = url[-1]
            if last in ".,;:!?'":
                # Sentence punctuation directly after the link.
                url = url[:-1]
            elif last == ')' and url.count('(') < url.count(')'):
                # Closing parenthesis of "(see http://...)" rather than part
                # of the URL; balanced pairs such as ".../Foo_(bar)" are kept.
                url = url[:-1]
            else:
                break
        # Skip a bare scheme left over after trimming (e.g. "http://...").
        if url and not url.endswith("://"):
            urls.append(url)
    return urls
def clean_filename(filename: str) -> str:
    """Make ``filename`` safe to use as a file-system name.

    The characters ``< > : " / \\ | ? *`` are replaced with underscores,
    leading and trailing dots and spaces are removed, and the result is
    limited to 100 characters while keeping the extension where possible.

    Args:
        filename: The proposed file name (not a path; separators are
            replaced).

    Returns:
        The cleaned name, or ``'untitled'`` if nothing is left.
    """
    max_length = 100

    # Remove or replace invalid characters
    cleaned = re.sub(r'[<>:"/\\|?*]', '_', filename)
    cleaned = cleaned.strip('. ')

    # Limit length
    if len(cleaned) > max_length:
        stem, ext = os.path.splitext(cleaned)
        if len(ext) >= max_length:
            # The "extension" alone fills the budget; trimming only the stem
            # would still leave an over-long name, so hard-cut instead.
            cleaned = cleaned[:max_length]
        else:
            cleaned = stem[:max_length - len(ext)] + ext

    return cleaned or 'untitled'
def log_performance(func):
    """Decorator that logs how long each call to ``func`` takes.

    A successful call is logged at INFO level with its duration. If the call
    raises, the duration and the exception are logged at ERROR level and the
    exception is re-raised unchanged. The wrapper keeps the wrapped
    function's name, docstring and other metadata.

    Args:
        func: The callable to instrument.

    Returns:
        A wrapper with the same call signature and return value as ``func``.
    """
    # Without functools.wraps the decorated function would be reported as
    # "wrapper" and lose its docstring.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.now()
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            duration = (datetime.now() - start_time).total_seconds()
            logger.error("%s failed after %.2f seconds: %s", func.__name__, duration, e)
            raise
        duration = (datetime.now() - start_time).total_seconds()
        logger.info("%s completed in %.2f seconds", func.__name__, duration)
        return result

    return wrapper
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

    Each chunk is preferably ended just after the last ``.`` in its window,
    or failing that at the last space, provided that boundary lies in the
    second half of the window. The next chunk starts ``overlap`` characters
    before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of chunks; ``[text]`` when the text already fits in one.

    Raises:
        ValueError: If ``chunk_size`` is not positive and the text does not
            fit in a single chunk.
    """
    if len(text) <= chunk_size:
        return [text]
    if chunk_size <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    text_length = len(text)
    boundary_floor = chunk_size // 2
    start = 0

    while start < text_length:
        end = start + chunk_size
        if end >= text_length:
            chunks.append(text[start:])
            break

        # Try to break at a sentence or word boundary
        chunk = text[start:end]

        # Look for sentence boundary
        last_sentence = chunk.rfind('.')
        if last_sentence > boundary_floor:
            chunk = chunk[:last_sentence + 1]
        else:
            # Look for word boundary
            last_space = chunk.rfind(' ')
            if last_space > boundary_floor:
                chunk = chunk[:last_space]

        chunks.append(chunk)
        # Always advance by at least one character: when overlap is as large
        # as the chunk just produced, "len(chunk) - overlap" is zero or
        # negative and the loop would never terminate.
        start += max(1, len(chunk) - overlap)

    return chunks