# utils.py
import re
import string
from typing import Optional

# Patterns shared by both preprocessing modes, compiled once at import time.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# "|" is a literal inside a character class, so the TLD class is plain [A-Za-z].
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def preprocess(text: str, model_type: str = "naive_bayes") -> str:
    """
    Clean raw text before it is handed to a model.

    Two modes are supported. The "bert" mode (matched case-insensitively)
    keeps case and punctuation and only removes URLs, email addresses,
    repeated punctuation and redundant whitespace. Every other value of
    ``model_type`` selects the more aggressive Naive Bayes mode, which
    lower-cases the text, replaces percentages and dollar amounts with the
    placeholder tokens ``PERCENTAGE`` / ``DOLLAR_AMOUNT`` and strips most
    punctuation.

    Args:
        text (str): Input text to preprocess
        model_type (str): Type of model ("naive_bayes" or "bert"); a
            non-string value falls back to the Naive Bayes mode

    Returns:
        str: Preprocessed text, or "" when ``text`` is empty or not a string
    """
    if not text or not isinstance(text, str):
        return ""

    text = text.strip()

    use_bert = isinstance(model_type, str) and model_type.lower() == "bert"

    if use_bert:
        # BERT-specific preprocessing (less aggressive): the tokenizer copes
        # with punctuation and case, so only obvious noise is removed.
        text = _URL_PATTERN.sub('', text)
        text = _EMAIL_PATTERN.sub('', text)

        # Collapse runs of repeated punctuation: 3+ dots become an ellipsis,
        # 2+ "!" or "?" become a single mark.
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        # Whitespace is normalized last so the gaps left behind by removed
        # URLs / emails do not survive as double spaces.
        return _WHITESPACE_PATTERN.sub(' ', text).strip()

    # Naive Bayes preprocessing (more aggressive cleaning)
    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _EMAIL_PATTERN.sub('', text)

    # Replace numeric values with placeholder tokens BEFORE punctuation is
    # stripped, so thousands separators ("$1,200,000") are still intact.
    # No trailing \b after "%": "%" is a non-word character, so a boundary
    # there would only match when a letter or digit follows directly.
    text = re.sub(r'\b\d+(?:\.\d+)?%', 'PERCENTAGE', text)
    # The text is already lower-cased, hence the lower-case k/m/b suffixes.
    text = re.sub(r'\$\d+(?:,\d{3})*(?:\.\d+)?[kmb]?\b', 'DOLLAR_AMOUNT', text)

    # Drop remaining symbols but keep "$", "%", "." and "-"
    text = re.sub(r'[^\w\s$%.-]', ' ', text)

    return _WHITESPACE_PATTERN.sub(' ', text).strip()

# Lower-case financial abbreviation -> spelled-out phrase.
_FINANCIAL_TERMS = {
    'q1': 'first quarter',
    'q2': 'second quarter',
    'q3': 'third quarter',
    'q4': 'fourth quarter',
    'yoy': 'year over year',
    'qoq': 'quarter over quarter',
    'ipo': 'initial public offering',
    'ceo': 'chief executive officer',
    'cfo': 'chief financial officer',
    'fed': 'federal reserve',
    'gdp': 'gross domestic product',
    'etf': 'exchange traded fund',
}

# One alternation anchored on word boundaries, so an abbreviation is only
# expanded when it stands alone ("fed") and never inside a longer word
# ("federal").
_FINANCIAL_TERM_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in _FINANCIAL_TERMS) + r')\b'
)


def clean_financial_text(text: str) -> str:
    """
    Lower-case financial text and expand common abbreviations

    The text is lower-cased and every abbreviation listed in
    ``_FINANCIAL_TERMS`` that appears as a whole word is replaced by its
    spelled-out form. All replacements happen in a single pass, so text
    produced by one expansion is never expanded again.

    Args:
        text (str): Financial text to clean

    Returns:
        str: Lower-cased text with abbreviations expanded, or "" when
            ``text`` is empty or not a string
    """
    if not text or not isinstance(text, str):
        return ""

    return _FINANCIAL_TERM_PATTERN.sub(
        lambda match: _FINANCIAL_TERMS[match.group(0)],
        text.lower(),
    )

def extract_financial_entities(text: str) -> dict:
    """
    Extract financial entities from text

    Each entity type is found with a regular expression applied to the
    original (not lower-cased) text.

    Args:
        text (str): Input text

    Returns:
        dict: Lists of matched substrings under the keys 'percentages',
            'dollar_amounts', 'stock_symbols', 'quarters' and 'years'.
            Every list is empty when ``text`` is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return {
            'percentages': [],
            'dollar_amounts': [],
            'stock_symbols': [],
            'quarters': [],
            'years': [],
        }

    return {
        # No trailing \b after "%": "%" is a non-word character, so a
        # boundary there would require a letter/digit right after it and
        # miss the usual "5% " or "5%." forms.
        'percentages': re.findall(r'\b\d+(?:\.\d+)?%', text),
        # Optional thousands separators, decimals and a K/M/B suffix in
        # either case.
        'dollar_amounts': re.findall(
            r'\$\d+(?:,\d{3})*(?:\.\d+)?[KMBkmb]?\b', text
        ),
        # Potential stock symbols: any standalone run of 2-5 uppercase
        # letters, so other acronyms are matched as well.
        'stock_symbols': re.findall(r'\b[A-Z]{2,5}\b', text),
        # "Q1".."Q4" or "1Q".."4Q", case-insensitive
        'quarters': re.findall(r'\bQ[1-4]\b|\b[1-4]Q\b', text, re.IGNORECASE),
        # Four-digit years starting with "20"
        'years': re.findall(r'\b20\d{2}\b', text),
    }

def get_text_stats(text: str) -> dict:
    """
    Compute simple size metrics for a piece of text

    Words are whitespace-separated tokens; sentences are the non-blank
    fragments left after splitting on runs of ".", "!" or "?".

    Args:
        text (str): Input text

    Returns:
        dict: 'word_count', 'char_count', 'sentence_count' and
            'avg_word_length' (all 0 for empty input)
    """
    metric_names = ('word_count', 'char_count', 'sentence_count', 'avg_word_length')
    if not text:
        return dict.fromkeys(metric_names, 0)

    tokens = text.split()
    token_total = len(tokens)

    # Count only fragments that still contain something besides whitespace
    sentence_total = 0
    for fragment in re.split(r'[.!?]+', text):
        if fragment.strip():
            sentence_total += 1

    if tokens:
        mean_length = sum(map(len, tokens)) / token_total
    else:
        mean_length = 0

    return dict(zip(metric_names, (token_total, len(text), sentence_total, mean_length)))

def validate_input(text: str, min_length: int = 5, max_length: int = 1000) -> tuple[bool, str]:
    """
    Validate user input

    Checks run in this order and the first failure is reported: empty or
    whitespace-only text, too short, too long, no alphanumeric character.

    Args:
        text (str): Input text to validate
        min_length (int): Minimum required length, measured after stripping
            surrounding whitespace
        max_length (int): Maximum allowed length, measured on the raw text

    Returns:
        tuple: (is_valid, error_message); error_message is "" when valid
    """
    if not text or not text.strip():
        return False, "Text cannot be empty"

    stripped = text.strip()

    if len(stripped) < min_length:
        return False, f"Text must be at least {min_length} characters long"

    # The upper bound applies to the raw input, surrounding whitespace included.
    if len(text) > max_length:
        return False, f"Text cannot exceed {max_length} characters"

    # Require at least one letter or digit. Checking every character means
    # input made of punctuation mixed with spaces ("!!! ???") is rejected too.
    if not any(char.isalnum() for char in stripped):
        return False, "Text must contain alphanumeric characters"

    return True, ""