|
|
|
|
|
import re
|
|
|
import string
|
|
|
from typing import Optional
|
|
|
|
|
|
# Patterns shared by both preprocessing branches. The URL pattern is the
# original one; the email pattern drops the stray "|" that the original
# character class "[A-Z|a-z]" accidentally allowed in the top-level domain.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def preprocess(text: str, model_type: str = "naive_bayes") -> str:
    """
    Preprocess text with model-specific rules.

    For ``"bert"`` (case-insensitive) the text keeps its case and most
    punctuation: URLs and email addresses are removed, runs of ``.``, ``!``
    and ``?`` are shortened, and whitespace is collapsed.

    For any other ``model_type`` the text is lowercased, URLs and email
    addresses are removed, every character other than word characters,
    whitespace, ``$``, ``%``, ``.`` and ``-`` becomes a space, percentages
    are replaced with ``PERCENTAGE``, dollar amounts with ``DOLLAR_AMOUNT``,
    and whitespace is collapsed.

    Args:
        text (str): Input text to preprocess
        model_type (str): Type of model ("naive_bayes" or "bert")

    Returns:
        str: Preprocessed text, or "" when ``text`` is empty or not a string
    """
    if not text or not isinstance(text, str):
        return ""

    text = text.strip()

    if model_type.lower() == "bert":
        text = _URL_PATTERN.sub('', text)
        text = _EMAIL_PATTERN.sub('', text)

        # Shorten repeated punctuation.
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        # Collapse whitespace last so the gaps left by removed URLs and
        # emails do not survive as double spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # Lowercase first: the URL pattern only recognises a lowercase scheme.
    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _EMAIL_PATTERN.sub('', text)

    # Keep only word characters, whitespace and the symbols needed to
    # recognise amounts ($ % . -).
    text = re.sub(r'[^\w\s$%.-]', ' ', text)

    # No trailing \b here: "%" is a non-word character, so "\b" after it
    # would only match when a word character follows and "5% " would be
    # missed.
    text = re.sub(r'\b\d+(?:\.\d+)?%', 'PERCENTAGE', text)

    # The text is already lowercase, so the K/M/B magnitude suffix has to be
    # matched case-insensitively.
    text = re.sub(
        r'\$\d+\.?\d*[KMB]?\b', 'DOLLAR_AMOUNT', text, flags=re.IGNORECASE
    )

    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
|
|
|
|
|
# Lowercase abbreviation -> expansion table used by ``clean_financial_text``.
_FINANCIAL_ABBREVIATIONS = {
    'q1': 'first quarter',
    'q2': 'second quarter',
    'q3': 'third quarter',
    'q4': 'fourth quarter',
    'yoy': 'year over year',
    'qoq': 'quarter over quarter',
    'ipo': 'initial public offering',
    'ceo': 'chief executive officer',
    'cfo': 'chief financial officer',
    'fed': 'federal reserve',
    'gdp': 'gross domestic product',
    'etf': 'exchange traded fund',
}

# Matches any abbreviation only as a whole word.
_FINANCIAL_ABBREVIATION_PATTERN = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, _FINANCIAL_ABBREVIATIONS)) + r')\b'
)


def clean_financial_text(text: str) -> str:
    """
    Lowercase financial text and expand common abbreviations.

    Abbreviations are expanded only when they appear as whole words, so
    "fed" inside "federal" is left alone. All expansions happen in a single
    pass, so text produced by one expansion is never expanded again.

    Args:
        text (str): Financial text to clean

    Returns:
        str: Lowercased text with abbreviations expanded, or "" when
            ``text`` is empty
    """
    if not text:
        return ""

    return _FINANCIAL_ABBREVIATION_PATTERN.sub(
        lambda match: _FINANCIAL_ABBREVIATIONS[match.group(0)],
        text.lower(),
    )
|
|
|
|
|
|
def extract_financial_entities(text: str) -> dict:
    """
    Extract financial entities from text using regular expressions.

    Args:
        text (str): Input text

    Returns:
        dict: Dictionary with the keys 'percentages', 'dollar_amounts',
            'stock_symbols', 'quarters' and 'years', each mapping to the
            list of matching substrings in order of appearance. Every list
            is empty when ``text`` is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return {
            'percentages': [],
            'dollar_amounts': [],
            'stock_symbols': [],
            'quarters': [],
            'years': [],
        }

    return {
        # No trailing \b: "%" is a non-word character, so "\b" after it only
        # matches when a word character follows and "5.5% " would be missed.
        'percentages': re.findall(r'\b\d+(?:\.\d+)?%', text),
        # Optional K/M/B magnitude suffix, in either case.
        'dollar_amounts': re.findall(r'\$\d+\.?\d*[KMBkmb]?\b', text),
        # Heuristic: any standalone run of 2-5 capital letters.
        'stock_symbols': re.findall(r'\b[A-Z]{2,5}\b', text),
        # "Q1".."Q4" or "1Q".."4Q", case-insensitive.
        'quarters': re.findall(r'\bQ[1-4]\b|\b[1-4]Q\b', text, re.IGNORECASE),
        # Four-digit years starting with "20".
        'years': re.findall(r'\b20\d{2}\b', text),
    }
|
|
|
|
|
|
def get_text_stats(text: str) -> dict:
    """
    Compute simple size statistics for a piece of text.

    Words are whitespace-separated tokens. Sentences are the non-blank
    pieces left after splitting on runs of '.', '!' and '?'.

    Args:
        text (str): Input text

    Returns:
        dict: 'word_count', 'char_count', 'sentence_count' and
            'avg_word_length'; all zero when ``text`` is empty
    """
    if not text:
        return dict.fromkeys(
            ('word_count', 'char_count', 'sentence_count', 'avg_word_length'),
            0,
        )

    tokens = text.split()
    token_total = len(tokens)

    # Count only the fragments that contain something besides whitespace.
    sentence_total = 0
    for fragment in re.split(r'[.!?]+', text):
        if fragment.strip():
            sentence_total += 1

    # Whitespace-only text has no tokens; report 0 instead of dividing.
    if tokens:
        mean_length = sum(map(len, tokens)) / token_total
    else:
        mean_length = 0

    return {
        'word_count': token_total,
        'char_count': len(text),
        'sentence_count': sentence_total,
        'avg_word_length': mean_length,
    }
|
|
|
|
|
|
def validate_input(text: str, min_length: int = 5, max_length: int = 1000) -> tuple[bool, str]:
    """
    Validate user input.

    Checks run in this order: non-empty, minimum length (measured after
    stripping surrounding whitespace), maximum length (measured on the raw
    text), and presence of at least one word character.

    Args:
        text (str): Input text to validate
        min_length (int): Minimum required length
        max_length (int): Maximum allowed length

    Returns:
        tuple: (is_valid, error_message); error_message is "" when valid
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return False, "Text cannot be empty"

    if len(stripped) < min_length:
        return False, f"Text must be at least {min_length} characters long"

    if len(text) > max_length:
        return False, f"Text cannot exceed {max_length} characters"

    # Require at least one word character (letter, digit or underscore).
    # Searching for one, rather than matching an all-punctuation string,
    # also rejects punctuation separated by whitespace such as "!!! ???".
    if not re.search(r'\w', stripped):
        return False, "Text must contain alphanumeric characters"

    return True, ""