File size: 5,238 Bytes
3f58cad | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
"""
Helper utility functions
"""
import functools
import hashlib
import logging
import os
import re
from datetime import datetime
from typing import Dict, List, Any, Optional
logger = logging.getLogger(__name__)
def sanitize_text(text: str) -> str:
    """Sanitize text input for processing.

    Deletes every character that is not a word character, whitespace, or one
    of ``. , ! ? ; : - ( ) ' "``, then collapses each run of whitespace to a
    single space and trims both ends.

    Args:
        text: Raw input text. Falsy values (``""``, ``None``) yield ``""``.

    Returns:
        The cleaned text, free of leading, trailing and repeated whitespace.
    """
    if not text:
        return ""
    # Remove special characters that might cause issues
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\'\"]', '', text)
    # Normalise whitespace last: deleting a character can leave two spaces
    # side by side (or a space at either end), which a whitespace pass that
    # runs before the deletion would never see.
    return re.sub(r'\s+', ' ', text).strip()
def validate_hex_color(color: str) -> bool:
    """Validate hex color format.

    Accepts ``#RGB`` and ``#RRGGBB`` with upper- or lower-case hex digits.

    Args:
        color: Candidate color string.

    Returns:
        True only when the whole string is a 3- or 6-digit hex color;
        False for anything else, including empty or non-string values.
    """
    if not isinstance(color, str):
        return False
    # fullmatch rather than match + '$': '$' also matches just before a
    # trailing newline, which would let "#fff\n" through.
    pattern = r'#(?:[A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})'
    return re.fullmatch(pattern, color) is not None
def generate_unique_id(content: str = "") -> str:
    """Generate a short ID for content.

    The ID is the first 8 hex digits of an MD5 digest over the content, the
    current local timestamp and a random nonce. MD5 is used only to spread
    the input over the ID space, not for security.

    Args:
        content: Optional text mixed into the digest.

    Returns:
        An 8-character lowercase hexadecimal string. With only 32 bits of
        output, collisions remain possible across very large numbers of IDs.
    """
    timestamp = datetime.now().isoformat()
    # The timestamp alone repeats when two calls land in the same clock
    # tick; the random nonce keeps identical content from yielding the same
    # ID in that case.
    nonce = os.urandom(8).hex()
    digest = hashlib.md5(f"{content}{timestamp}{nonce}".encode()).hexdigest()
    return digest[:8]
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text to specified length.

    Text that already fits is returned unchanged. Otherwise the text is cut
    so that the cut text plus ``suffix`` is at most ``max_length`` characters;
    whitespace around the cut text is stripped before the suffix is added.

    Args:
        text: Text to shorten.
        max_length: Maximum length of the returned string.
        suffix: Marker appended to truncated text.

    Returns:
        The original or truncated text, never longer than ``max_length``
        (a negative ``max_length`` is treated as 0).
    """
    if len(text) <= max_length:
        return text
    room = max_length - len(suffix)
    if room < 0:
        # The suffix alone would exceed the limit, and a negative slice
        # index would count from the end of the text; hard-cut instead.
        return text[:max(max_length, 0)]
    return text[:room].strip() + suffix
def extract_numbers(text: str) -> List[float]:
    """Return every number found in text as a float, in order of appearance.

    A number is an optional minus sign, one or more digits, and optionally a
    decimal point followed by more digits.
    """
    number_re = re.compile(r'-?\d+(?:\.\d+)?')
    return list(map(float, number_re.findall(text)))
def calculate_reading_time(text: str, wpm: int = 200) -> int:
    """Estimate the reading time of text in whole minutes.

    Words are whitespace-separated tokens; the count is divided by ``wpm``
    (words per minute) and rounded with the built-in ``round``. The result
    is never less than one minute.
    """
    words = text.split()
    minutes = round(len(words) / wpm)
    if minutes < 1:
        return 1
    return minutes
def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format.

    The size is scaled by powers of 1024 and printed with one decimal place
    and a unit from B up to PB.

    Args:
        size_bytes: Size in bytes. Negative values are scaled by magnitude
            and keep their sign.

    Returns:
        A string such as ``"1.5 KB"``; exactly ``"0 B"`` for a zero size.
    """
    if size_bytes == 0:
        return "0 B"
    size_names = ("B", "KB", "MB", "GB", "TB", "PB")
    # Scale the magnitude so negative inputs pick a unit as well; the sign
    # is re-attached when formatting.
    sign = "-" if size_bytes < 0 else ""
    size = float(abs(size_bytes))
    i = 0
    while size >= 1024 and i < len(size_names) - 1:
        size /= 1024.0
        i += 1
    return f"{sign}{size:.1f} {size_names[i]}"
def safe_divide(a: float, b: float, default: float = 0.0) -> float:
    """Divide a by b, returning default when the division is not possible.

    The fallback is used when ``b`` equals zero or when the operands raise
    ``TypeError`` or ``ZeroDivisionError``.
    """
    try:
        if b != 0:
            return a / b
    except (TypeError, ZeroDivisionError):
        pass
    return default
def merge_dicts(dict1: Dict, dict2: Dict) -> Dict:
    """Return a recursive merge of dict2 into a copy of dict1.

    Where both sides hold a dict under the same key the two are merged
    recursively; in every other case the value from dict2 wins. The top
    level of dict1 is copied, so neither argument is modified.
    """
    merged = dict1.copy()
    for name, incoming in dict2.items():
        existing = merged.get(name)
        both_nested = isinstance(existing, dict) and isinstance(incoming, dict)
        merged[name] = merge_dicts(existing, incoming) if both_nested else incoming
    return merged
def validate_content_length(text: str, min_length: int = 50, max_length: int = 15000) -> Dict[str, Any]:
    """Check whether the stripped length of text lies within the given bounds.

    Returns a dict holding the verdict (``valid``), the measured ``length``,
    both limits, and a human-readable ``message`` reporting the length.
    """
    stripped_length = len(text.strip())
    within_bounds = min_length <= stripped_length and stripped_length <= max_length
    return dict(
        valid=within_bounds,
        length=stripped_length,
        min_required=min_length,
        max_allowed=max_length,
        message=f"Content length: {stripped_length} characters",
    )
def extract_urls(text: str) -> List[str]:
    """Return every http/https URL found in text, in order of appearance.

    A match runs until the first character the pattern does not accept
    (whitespace, for example), so punctuation directly after a URL, such as
    a full stop, is kept as part of it.
    """
    # NOTE: inside the character class, ``$-_`` is a range, not three
    # literals. It is what lets characters such as ``/ : ? =`` match, so the
    # pattern must be kept exactly as written.
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return [found.group(0) for found in url_regex.finditer(text)]
def clean_filename(filename: str, max_length: int = 100) -> str:
    """Clean filename for safe file system usage.

    Replaces each of ``< > : " / \\ | ? *`` with an underscore, strips leading
    and trailing dots and spaces, and shortens over-long names while keeping
    the extension where possible.

    Args:
        filename: Proposed file name (not a path; separators are replaced).
        max_length: Maximum length of the returned name.

    Returns:
        The cleaned name, or ``'untitled'`` when nothing usable remains.
    """
    # Remove or replace invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    filename = filename.strip('. ')
    # Limit length
    if len(filename) > max_length:
        name, ext = os.path.splitext(filename)
        if len(ext) < max_length:
            # Trim the stem only, so the extension survives intact.
            filename = name[:max_length - len(ext)] + ext
        else:
            # The extension alone fills the budget; slicing the stem with a
            # negative index would return an over-long name, so hard-cut the
            # whole name and re-strip the characters removed above.
            filename = filename[:max(max_length, 0)].rstrip('. ')
    return filename or 'untitled'
def log_performance(func):
    """Decorator to log function performance.

    The wrapped function's wall-clock duration is logged at INFO level when
    it returns and at ERROR level, together with the exception, when it
    raises. The exception is always re-raised unchanged.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and return value as ``func``.
    """
    # wraps() copies __name__, __doc__ and friends onto the wrapper so that
    # decorated functions stay recognisable to introspection and tooling.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.now()
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            duration = (datetime.now() - start_time).total_seconds()
            logger.error("%s failed after %.2f seconds: %s", func.__name__, duration, e)
            raise
        # Logged outside the try block so that a failure inside the logging
        # call is not misreported as a failure of the wrapped function.
        duration = (datetime.now() - start_time).total_seconds()
        logger.info("%s completed in %.2f seconds", func.__name__, duration)
        return result
    return wrapper
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks.

    Each chunk holds at most ``chunk_size`` characters. A chunk is shortened
    to end at its last full stop, or failing that its last space, but only
    when that boundary lies past the chunk's midpoint. The next chunk starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of chunks; ``[text]`` when the text fits in one chunk.

    Raises:
        ValueError: If the text needs splitting and ``chunk_size`` is not
            positive.
    """
    if len(text) <= chunk_size:
        return [text]
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    chunks = []
    start = 0
    midpoint = chunk_size // 2
    while start < len(text):
        end = start + chunk_size
        if end >= len(text):
            chunks.append(text[start:])
            break
        # Try to break at a sentence or word boundary
        chunk = text[start:end]
        # Look for sentence boundary
        last_sentence = chunk.rfind('.')
        if last_sentence > midpoint:
            chunk = chunk[:last_sentence + 1]
        else:
            # Look for word boundary
            last_space = chunk.rfind(' ')
            if last_space > midpoint:
                chunk = chunk[:last_space]
        chunks.append(chunk)
        # Always advance by at least one character: an overlap as large as
        # the chunk would otherwise leave `start` in place (or move it
        # backwards) and the loop would never finish.
        start += max(1, len(chunk) - overlap)
    return chunks