""" Helper utility functions """ import re import logging from typing import List, Dict, Any from datetime import datetime # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def clean_text(text: str) -> str: """ Clean and normalize text by removing extra whitespace, special characters, etc. Args: text: Raw text to clean Returns: Cleaned text string """ # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove special characters but keep basic punctuation text = re.sub(r'[^\w\s.,!?;:\-\'\"()]', '', text) # Strip leading/trailing whitespace text = text.strip() return text def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]: """ Split text into overlapping chunks for processing. Args: text: Text to chunk chunk_size: Size of each chunk in characters overlap: Overlap between chunks Returns: List of text chunks """ chunks = [] start = 0 text_length = len(text) while start < text_length: end = start + chunk_size chunk = text[start:end] chunks.append(chunk) start = end - overlap return chunks def summarize_text(text: str, max_length: int = 500) -> str: """ Create a simple extractive summary by taking the first sentences. Args: text: Text to summarize max_length: Maximum length of summary Returns: Summarized text """ sentences = re.split(r'[.!?]+', text) summary = "" for sentence in sentences: sentence = sentence.strip() if not sentence: continue if len(summary) + len(sentence) + 2 <= max_length: # +2 for ". " summary += sentence + ". " else: break # If no sentences fit, return truncated text if not summary and text: summary = text[:max_length].rsplit(' ', 1)[0] + "..." return summary.strip() def extract_keywords(text: str, top_n: int = 10) -> List[str]: """ Extract top keywords from text using simple frequency analysis. Args: text: Text to analyze top_n: Number of top keywords to return Returns: List of keywords """ # Simple word frequency approach words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()) # Remove common stop words stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'were', 'will', 'would', 'could', 'should', 'about', 'their', 'there'} words = [w for w in words if w not in stop_words] # Count frequency word_freq: Dict[str, int] = {} for word in words: word_freq[word] = word_freq.get(word, 0) + 1 # Sort by frequency and return top N sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) return [word for word, freq in sorted_words[:top_n]] def validate_url(url: str) -> bool: """ Validate if a string is a proper URL. Args: url: URL string to validate Returns: True if valid URL, False otherwise """ url_pattern = re.compile( r'^https?://' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain... r'localhost|' # localhost... r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) return url_pattern.match(url) is not None def format_timestamp() -> str: """ Get current timestamp in ISO format. Returns: ISO formatted timestamp string """ return datetime.now().isoformat() def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float: """ Safely divide two numbers, returning default if denominator is zero. Args: numerator: Numerator value denominator: Denominator value default: Default value if division by zero Returns: Division result or default """ try: return numerator / denominator if denominator != 0 else default except (TypeError, ZeroDivisionError): return default def parse_json_safe(json_str: str) -> Dict[str, Any]: """ Safely parse JSON string with error handling. Args: json_str: JSON string to parse Returns: Parsed dictionary or empty dict on error """ import json try: return json.loads(json_str) except json.JSONDecodeError as e: logger.error(f"JSON parse error: {e}") return {}