"""Text processing and analysis module.

Provides :class:`TextProcessor`, which derives statistics, structure hints,
key points, keywords, a coarse sentiment label, sections and numeric data
elements from free text.

``nltk`` and ``textstat`` are used when they are installed; without them the
module still imports and falls back to a built-in stop-word list and an
approximate reading-ease estimate.
"""

import re
from typing import Dict, List, Tuple, Optional
from collections import Counter
import logging

try:
    import nltk
except ImportError:  # optional: only needed for the stop-word corpus
    nltk = None

try:
    import textstat
except ImportError:  # optional: only needed for the reading-ease score
    textstat = None

logger = logging.getLogger(__name__)

# Download required NLTK data (best effort; analysis works without it).
if nltk is not None:
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
    except Exception:
        logger.warning("Could not download NLTK data", exc_info=True)

# Used when the NLTK stop-word corpus cannot be loaded.
_FALLBACK_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did',
})

_POSITIVE_WORDS = frozenset({
    'good', 'great', 'excellent', 'amazing', 'wonderful', 'positive',
    'success', 'achieve', 'benefit', 'advantage',
})
_NEGATIVE_WORDS = frozenset({
    'bad', 'terrible', 'awful', 'negative', 'problem', 'issue', 'challenge',
    'difficult', 'risk', 'disadvantage',
})

_SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')
_DIGIT_RE = re.compile(r'\d+')
# Whole-word matches only, so "monkey" or "remain" do not count as emphasis.
_KEY_POINT_CUE_RE = re.compile(
    r'\b(?:important|key|main|significant)\b', re.IGNORECASE)
_PRIORITY_CUE_RE = re.compile(
    r'\b(?:important|key|main|critical)\b', re.IGNORECASE)
# Numbered ("1."), dashed, starred or bulleted list markers at line start.
_LIST_MARKER_RE = re.compile(r'\d+\.|-|\*|•')


class TextProcessor:
    """Process and analyze text content for infographic generation."""

    def __init__(self):
        """Load English stop words from NLTK, or the built-in fallback."""
        try:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))
        except Exception as exc:
            # NLTK missing or corpus not downloaded: degrade, don't fail.
            logger.warning(
                "NLTK stop words unavailable (%s); using fallback list", exc)
            self.stop_words = set(_FALLBACK_STOP_WORDS)

    def analyze_text(self, text: str) -> Dict:
        """Run every analysis step on ``text``.

        Args:
            text: Raw input text.

        Returns:
            A dict with the keys ``original_text``, ``cleaned_text``,
            ``statistics``, ``structure``, ``key_points``, ``keywords``,
            ``sentiment``, ``sections`` and ``data_elements``. Input that is
            empty or shorter than 10 characters after stripping yields the
            empty structure from :meth:`_empty_analysis`.
        """
        if not text or len(text.strip()) < 10:
            return self._empty_analysis()

        return {
            'original_text': text,
            'cleaned_text': self._clean_text(text),
            'statistics': self._get_text_statistics(text),
            'structure': self._analyze_structure(text),
            'key_points': self._extract_key_points(text),
            'keywords': self._extract_keywords(text),
            'sentiment': self._analyze_sentiment(text),
            'sections': self._identify_sections(text),
            'data_elements': self._extract_data_elements(text),
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop unusual punctuation."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)
        return text.strip()

    def _flesch_reading_ease(self, text: str) -> float:
        """Return the Flesch reading-ease score for ``text``.

        Delegates to ``textstat`` when it is installed. Otherwise applies the
        standard formula with syllables estimated as vowel groups per word,
        so the fallback value is only an approximation.
        """
        if textstat is not None:
            return textstat.flesch_reading_ease(text)

        words = re.findall(r"[A-Za-z']+", text)
        if not words:
            return 0.0
        sentence_count = max(
            1, sum(1 for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()))
        syllables = sum(
            max(1, len(re.findall(r'[aeiouy]+', word.lower())))
            for word in words
        )
        score = (206.835
                 - 1.015 * (len(words) / sentence_count)
                 - 84.6 * (syllables / len(words)))
        return round(score, 2)

    def _get_text_statistics(self, text: str) -> Dict:
        """Return word, character, sentence and paragraph statistics.

        ``reading_level`` is the Flesch reading-ease score clamped to
        0..100; paragraphs are separated by blank lines.
        """
        sentences = [s for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()]
        word_count = len(text.split())
        return {
            'word_count': word_count,
            'char_count': len(text),
            'sentence_count': len(sentences),
            'paragraph_count': len(
                [p for p in text.split('\n\n') if p.strip()]),
            'reading_level': min(
                100, max(0, self._flesch_reading_ease(text))),
            'avg_words_per_sentence': round(
                word_count / max(1, len(sentences)), 1),
        }

    def _analyze_structure(self, text: str) -> Dict:
        """Describe the layout-relevant structure of ``text``.

        Here every non-blank line counts as a paragraph.
        """
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        return {
            'has_title': self._detect_title(text),
            'has_headers': self._detect_headers(paragraphs),
            'has_lists': self._detect_lists(text),
            'has_numbers': self._detect_numbers(text),
            'paragraph_count': len(paragraphs),
            'suggested_layout': self._suggest_layout(paragraphs),
        }

    def _extract_key_points(self, text: str) -> List[str]:
        """Return up to six of the highest-scoring sentences.

        Only the first 15 sentences longer than 20 characters are scored.
        The score is the word count, plus 5 if the sentence contains a digit
        and plus 3 if it contains an emphasis word as a whole word.
        """
        candidates = [s.strip() for s in _SENTENCE_SPLIT_RE.split(text)]
        candidates = [s for s in candidates if len(s) > 20]

        scored = []
        for sentence in candidates[:15]:
            score = len(sentence.split())
            if _DIGIT_RE.search(sentence):
                score += 5
            if _KEY_POINT_CUE_RE.search(sentence):
                score += 3
            scored.append((sentence, score))

        # Stable sort: equally scored sentences keep document order.
        scored.sort(key=lambda item: item[1], reverse=True)
        return [sentence for sentence, _ in scored[:6]]

    def _extract_keywords(self, text: str) -> List[str]:
        """Return the 12 most frequent non-stop words of 3+ letters."""
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        frequencies = Counter(w for w in words if w not in self.stop_words)
        return [word for word, _ in frequencies.most_common(12)]

    def _analyze_sentiment(self, text: str) -> str:
        """Classify ``text`` as 'positive', 'negative' or 'neutral'.

        Compares how many distinct positive and negative cue words occur.
        Tokens are runs of letters, so words followed by punctuation
        ("great!", "problem.") are still recognised.
        """
        tokens = set(re.findall(r'[a-z]+', text.lower()))
        positive_count = len(tokens & _POSITIVE_WORDS)
        negative_count = len(tokens & _NEGATIVE_WORDS)

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _identify_sections(self, text: str) -> List[Dict]:
        """Describe the first eight non-blank lines as sections.

        Each section dict carries ``id`` (1-based), ``content``, ``type``,
        ``word_count`` and ``priority``.
        """
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        return [
            {
                'id': index,
                'content': paragraph,
                'type': self._classify_paragraph_type(paragraph),
                'word_count': len(paragraph.split()),
                'priority': self._calculate_priority(paragraph),
            }
            for index, paragraph in enumerate(paragraphs[:8], start=1)
        ]

    def _extract_data_elements(self, text: str) -> Dict:
        """Extract numbers, percentages, currency amounts and dates.

        Each list is truncated (10 numbers, 5 of everything else).
        ``has_data`` is true when any number, percentage or currency amount
        was found.
        """
        # NOTE(review): the trailing \b makes the optional unit suffix match
        # only when a word character follows it, so "50%" is reported as
        # "50" here (the percent form is still captured in 'percentages').
        numbers = re.findall(r'\b\d+(?:\.\d+)?(?:%|\$|€|£)?\b', text)
        percentages = re.findall(r'\d+(?:\.\d+)?%', text)
        currencies = re.findall(r'[\$€£]\d+(?:\.\d+)?(?:[kmb])?', text)
        # Any standalone 4-digit number is treated as a year.
        dates = re.findall(
            r'\b\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)

        return {
            'numbers': numbers[:10],
            'percentages': percentages[:5],
            'currencies': currencies[:5],
            'dates': dates[:5],
            'has_data': bool(numbers or percentages or currencies),
        }

    def _detect_title(self, text: str) -> bool:
        """Return True if the first non-blank line looks like a title.

        A title is shorter than 100 characters and has fewer than 12 words.
        Leading blank lines are skipped so they are not mistaken for a title.
        """
        first_line = next(
            (line.strip() for line in text.split('\n') if line.strip()), '')
        if not first_line:
            return False
        return len(first_line) < 100 and len(first_line.split()) < 12

    def _detect_headers(self, paragraphs: List[str]) -> bool:
        """Return True if at least two paragraphs have fewer than 8 words."""
        short_count = sum(1 for p in paragraphs if len(p.split()) < 8)
        return short_count >= 2

    def _detect_lists(self, text: str) -> bool:
        """Return True if at least two lines start with a list marker."""
        marked_lines = sum(
            1 for line in text.split('\n')
            if _LIST_MARKER_RE.match(line.strip())
        )
        return marked_lines >= 2

    def _detect_numbers(self, text: str) -> bool:
        """Return True if ``text`` contains at least three numbers."""
        return len(re.findall(r'\b\d+', text)) >= 3

    def _suggest_layout(self, paragraphs: List[str]) -> str:
        """Suggest 'Vertical', 'Grid', 'Flow' or 'Horizontal'.

        The rules are checked in that order: three or fewer paragraphs, a
        paragraph under 10 words among the first three, six or more
        paragraphs, otherwise 'Horizontal'.
        """
        if len(paragraphs) <= 3:
            return "Vertical"
        if any(len(p.split()) < 10 for p in paragraphs[:3]):
            return "Grid"
        if len(paragraphs) >= 6:
            return "Flow"
        return "Horizontal"

    def _classify_paragraph_type(self, paragraph: str) -> str:
        """Classify a paragraph as header, key_point, data or body.

        Under 8 words is a header and under 25 a key point; longer
        paragraphs are data when they contain a digit, otherwise body.
        """
        word_count = len(paragraph.split())
        if word_count < 8:
            return "header"
        if word_count < 25:
            return "key_point"
        if _DIGIT_RE.search(paragraph):
            return "data"
        return "body"

    def _calculate_priority(self, paragraph: str) -> int:
        """Return a display priority for ``paragraph``, capped at 100.

        The priority is the word count, plus 10 if the paragraph contains a
        digit and plus 15 if it contains an emphasis word as a whole word.
        """
        priority = len(paragraph.split())
        if _DIGIT_RE.search(paragraph):
            priority += 10
        if _PRIORITY_CUE_RE.search(paragraph):
            priority += 15
        return min(priority, 100)

    def _empty_analysis(self) -> Dict:
        """Return the analysis structure used for empty or too-short text."""
        return {
            'original_text': '',
            'cleaned_text': '',
            'statistics': {
                'word_count': 0,
                'char_count': 0,
                'sentence_count': 0,
                'paragraph_count': 0,
                'reading_level': 0,
                'avg_words_per_sentence': 0,
            },
            'structure': {
                'has_title': False,
                'has_headers': False,
                'has_lists': False,
                'has_numbers': False,
                'paragraph_count': 0,
                'suggested_layout': 'Vertical',
            },
            'key_points': [],
            'keywords': [],
            'sentiment': 'neutral',
            'sections': [],
            'data_elements': {
                'numbers': [],
                'percentages': [],
                'currencies': [],
                'dates': [],
                'has_data': False,
            },
        }