File size: 9,252 Bytes
430279a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241

"""
Text processing and analysis module
"""
import re
import nltk
import textstat
from typing import Dict, List, Tuple, Optional
from collections import Counter
import logging

logger = logging.getLogger(__name__)

# Best-effort download of the NLTK data packages this module relies on.
# A failure here (no network, read-only data directory, ...) must not prevent
# the module from being imported, so it is logged instead of raised.
# Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit still
# propagate instead of being silently swallowed.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as exc:
    logger.warning("Could not download NLTK data: %s", exc)

class TextProcessor:
    """Process and analyze text content for infographic generation.

    The public entry point is :meth:`analyze_text`, which returns a dictionary
    with statistics, structural hints, key points, keywords, a coarse
    sentiment label, per-paragraph sections and extracted data elements.
    All other methods are private helpers that each compute one of those
    parts.
    """

    # Stop words used when the NLTK stop-word corpus cannot be loaded.
    _FALLBACK_STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
        'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did'
    })

    # Small lexicons for the sentiment heuristic.
    _POSITIVE_WORDS = frozenset({
        'good', 'great', 'excellent', 'amazing', 'wonderful', 'positive',
        'success', 'achieve', 'benefit', 'advantage'
    })
    _NEGATIVE_WORDS = frozenset({
        'bad', 'terrible', 'awful', 'negative', 'problem', 'issue',
        'challenge', 'difficult', 'risk', 'disadvantage'
    })

    # Emphasis cues are matched as whole words so that e.g. "monkey" or
    # "remain" do not count as "key" / "main".
    _KEY_POINT_CUES = re.compile(
        r'\b(?:important|key|main|significant)\b', re.IGNORECASE)
    _PRIORITY_CUES = re.compile(
        r'\b(?:important|key|main|critical)\b', re.IGNORECASE)

    # Sentence terminator runs, and the start-of-line markers of a list item
    # ("1.", "-", "*", "•").
    _SENTENCE_SPLIT = re.compile(r'[.!?]+')
    _LIST_MARKER = re.compile(r'(?:\d+\.|[-*•])')

    def __init__(self):
        """Load English stop words from NLTK, or fall back to a built-in list."""
        try:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))
        except Exception:
            # NLTK missing or its corpus not installed: stay usable with a
            # minimal built-in list rather than failing construction.
            self.stop_words = set(self._FALLBACK_STOP_WORDS)

    def analyze_text(self, text: str) -> Dict:
        """Run every analysis step on *text* and collect the results.

        Args:
            text: Raw input text.

        Returns:
            A dict with the keys ``original_text``, ``cleaned_text``,
            ``statistics``, ``structure``, ``key_points``, ``keywords``,
            ``sentiment``, ``sections`` and ``data_elements``. When *text* is
            empty or has fewer than 10 non-whitespace-trimmed characters, the
            same structure is returned with empty/zero values.
        """
        if not text or len(text.strip()) < 10:
            return self._empty_analysis()

        return {
            'original_text': text,
            'cleaned_text': self._clean_text(text),
            'statistics': self._get_text_statistics(text),
            'structure': self._analyze_structure(text),
            'key_points': self._extract_key_points(text),
            'keywords': self._extract_keywords(text),
            'sentiment': self._analyze_sentiment(text),
            'sections': self._identify_sections(text),
            'data_elements': self._extract_data_elements(text)
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop characters outside a small whitelist.

        Kept characters are word characters, whitespace and ``. , ! ? ; : - ( )``.
        """
        collapsed = re.sub(r'\s+', ' ', text)
        filtered = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', collapsed)
        return filtered.strip()

    def _get_text_statistics(self, text: str) -> Dict:
        """Return word, character, sentence and paragraph counts for *text*.

        Sentences are split on runs of ``.``, ``!`` or ``?``; paragraphs here
        are blocks separated by a blank line. ``reading_level`` is the value of
        ``textstat.flesch_reading_ease`` clamped to the range 0..100.
        """
        sentences = [s for s in self._SENTENCE_SPLIT.split(text) if s.strip()]
        word_count = len(text.split())

        return {
            'word_count': word_count,
            'char_count': len(text),
            'sentence_count': len(sentences),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'reading_level': min(100, max(0, textstat.flesch_reading_ease(text))),
            # max(1, ...) avoids dividing by zero when no sentence was found.
            'avg_words_per_sentence': round(word_count / max(1, len(sentences)), 1)
        }

    def _analyze_structure(self, text: str) -> Dict:
        """Return structural flags and a layout suggestion for *text*.

        Unlike :meth:`_get_text_statistics`, every non-empty line counts as a
        paragraph here.
        """
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

        return {
            'has_title': self._detect_title(text),
            'has_headers': self._detect_headers(paragraphs),
            'has_lists': self._detect_lists(text),
            'has_numbers': self._detect_numbers(text),
            'paragraph_count': len(paragraphs),
            'suggested_layout': self._suggest_layout(paragraphs)
        }

    def _extract_key_points(self, text: str) -> List[str]:
        """Return up to six highest-scoring sentences of *text*.

        Only sentences longer than 20 characters are considered, and only the
        first 15 of those are scored. The score is the word count, plus 5 if
        the sentence contains a digit, plus 3 if it contains one of the
        emphasis words (matched as whole words). Ties keep document order.
        """
        candidates = [
            s.strip() for s in self._SENTENCE_SPLIT.split(text)
            if len(s.strip()) > 20
        ]

        scored = []
        for sentence in candidates[:15]:
            score = len(sentence.split())
            if re.search(r'\d+', sentence):
                score += 5
            if self._KEY_POINT_CUES.search(sentence):
                score += 3
            scored.append((sentence, score))

        # sorted() is stable, so equally scored sentences stay in text order.
        ranked = sorted(scored, key=lambda item: item[1], reverse=True)
        return [sentence for sentence, _ in ranked[:6]]

    def _extract_keywords(self, text: str) -> List[str]:
        """Return up to 12 most frequent non-stop-words of *text*.

        Words are lower-cased runs of at least three ASCII letters.
        """
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        frequencies = Counter(w for w in words if w not in self.stop_words)
        return [word for word, _ in frequencies.most_common(12)]

    def _analyze_sentiment(self, text: str) -> str:
        """Classify *text* as ``'positive'``, ``'negative'`` or ``'neutral'``.

        Counts how many distinct lexicon words of each polarity occur in the
        text and returns the polarity with more hits (``'neutral'`` on a tie).
        """
        # Tokenize on letters rather than whitespace so that punctuation
        # attached to a word ("great!", "bad.") does not hide it.
        tokens = set(re.findall(r'[a-z]+', text.lower()))
        positive_count = len(tokens & self._POSITIVE_WORDS)
        negative_count = len(tokens & self._NEGATIVE_WORDS)

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _identify_sections(self, text: str) -> List[Dict]:
        """Describe the first eight non-empty lines of *text* as sections.

        Each section dict has a 1-based ``id``, the line ``content``, its
        ``type``, ``word_count`` and display ``priority``.
        """
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

        return [
            {
                'id': index,
                'content': paragraph,
                'type': self._classify_paragraph_type(paragraph),
                'word_count': len(paragraph.split()),
                'priority': self._calculate_priority(paragraph)
            }
            for index, paragraph in enumerate(paragraphs[:8], start=1)
        ]

    def _extract_data_elements(self, text: str) -> Dict:
        """Extract numbers, percentages, currency amounts and date-like tokens.

        Returns:
            A dict with the lists ``numbers`` (max 10), ``percentages``,
            ``currencies`` and ``dates`` (max 5 each), and ``has_data`` which
            is True when any number, percentage or currency was found.
        """
        # NOTE: the optional symbol suffix is effectively inert because the
        # trailing \b cannot match between a symbol and whitespace, so "50%"
        # yields "50" here; percentages are collected separately below.
        numbers = re.findall(r'\b\d+(?:\.\d+)?(?:%|\$|€|£)?\b', text)
        percentages = re.findall(r'\d+(?:\.\d+)?%', text)
        # Accept thousands separators ("$1,500") and upper- or lower-case
        # magnitude suffixes ("$5M", "$5m").
        currencies = re.findall(
            r'[\$€£]\d+(?:,\d{3}(?!\d))*(?:\.\d+)?[kKmMbB]?', text)
        # Heuristic: any standalone 4-digit number is treated as a year.
        dates = re.findall(r'\b\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)

        return {
            'numbers': numbers[:10],
            'percentages': percentages[:5],
            'currencies': currencies[:5],
            'dates': dates[:5],
            'has_data': bool(numbers or percentages or currencies)
        }

    def _detect_title(self, text: str) -> bool:
        """Return True when the first non-blank line looks like a title.

        A title is a line shorter than 100 characters with fewer than 12
        words. Text without any non-blank line has no title.
        """
        first_line = next(
            (line.strip() for line in text.split('\n') if line.strip()), '')
        if not first_line:
            return False
        return len(first_line) < 100 and len(first_line.split()) < 12

    def _detect_headers(self, paragraphs: List[str]) -> bool:
        """Return True when at least two paragraphs have fewer than 8 words."""
        short_count = sum(1 for p in paragraphs if len(p.split()) < 8)
        return short_count >= 2

    def _detect_lists(self, text: str) -> bool:
        """Return True when at least two lines start with a list marker.

        Recognized markers are ``<digits>.``, ``-``, ``*`` and ``•``.
        """
        marked_lines = sum(
            1 for line in text.split('\n')
            if self._LIST_MARKER.match(line.strip())
        )
        return marked_lines >= 2

    def _detect_numbers(self, text: str) -> bool:
        """Return True when *text* contains at least three numbers."""
        return len(re.findall(r'\b\d+', text)) >= 3

    def _suggest_layout(self, paragraphs: List[str]) -> str:
        """Suggest a layout name based on paragraph count and length.

        Returns ``"Vertical"`` for at most three paragraphs, ``"Grid"`` when
        one of the first three paragraphs has fewer than 10 words, ``"Flow"``
        for six or more paragraphs, and ``"Horizontal"`` otherwise.
        """
        if len(paragraphs) <= 3:
            return "Vertical"
        if any(len(p.split()) < 10 for p in paragraphs[:3]):
            return "Grid"
        if len(paragraphs) >= 6:
            return "Flow"
        return "Horizontal"

    def _classify_paragraph_type(self, paragraph: str) -> str:
        """Classify *paragraph* as ``header``, ``key_point``, ``data`` or ``body``.

        Fewer than 8 words is a header, fewer than 25 a key point; longer
        paragraphs are ``data`` when they contain a digit, else ``body``.
        """
        word_count = len(paragraph.split())

        if word_count < 8:
            return "header"
        if word_count < 25:
            return "key_point"
        if re.search(r'\d+', paragraph):
            return "data"
        return "body"

    def _calculate_priority(self, paragraph: str) -> int:
        """Return a display priority for *paragraph*, capped at 100.

        The priority is the word count, plus 10 if the paragraph contains a
        digit, plus 15 if it contains an emphasis word (matched as a whole
        word).
        """
        priority = len(paragraph.split())

        if re.search(r'\d+', paragraph):
            priority += 10
        if self._PRIORITY_CUES.search(paragraph):
            priority += 15

        return min(priority, 100)

    def _empty_analysis(self) -> Dict:
        """Return the analysis structure with empty/zero values."""
        return {
            'original_text': '',
            'cleaned_text': '',
            'statistics': {'word_count': 0, 'char_count': 0, 'sentence_count': 0, 'paragraph_count': 0, 'reading_level': 0, 'avg_words_per_sentence': 0},
            'structure': {'has_title': False, 'has_headers': False, 'has_lists': False, 'has_numbers': False, 'paragraph_count': 0, 'suggested_layout': 'Vertical'},
            'key_points': [],
            'keywords': [],
            'sentiment': 'neutral',
            'sections': [],
            'data_elements': {'numbers': [], 'percentages': [], 'currencies': [], 'dates': [], 'has_data': False}
        }