# Infographics_Generator_1 / src/text_processor.py
# (source: Hugging Face upload by 3Stark123, "Create src/text_processor.py", commit 430279a)
"""
Text processing and analysis module
"""
import re
import nltk
import textstat
from typing import Dict, List, Tuple, Optional
from collections import Counter
import logging
# Download required NLTK data (best-effort: TextProcessor degrades
# gracefully to built-in fallbacks when NLTK resources are missing).
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
    # `Exception` keeps the best-effort intent without masking shutdown.
    pass

logger = logging.getLogger(__name__)
class TextProcessor:
    """Process and analyze text content for infographic generation.

    Produces a dictionary of derived features (statistics, structure,
    key points, keywords, sentiment, sections, numeric data elements)
    that a downstream layout engine can use to build an infographic.
    """

    def __init__(self):
        """Load English stop words, falling back to a built-in set.

        NLTK's corpus is preferred; if nltk is not installed
        (ImportError) or its data was never downloaded (LookupError),
        a small hard-coded stop-word set is used instead.
        """
        self.stop_words = set()
        try:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))
        except Exception:
            # Fix: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit during interpreter startup.
            self.stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
                'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did'
            }

    def analyze_text(self, text: str) -> Dict:
        """Run the full analysis pipeline on *text*.

        Returns the empty-analysis skeleton when the input is missing
        or trivially short (< 10 characters after stripping).
        """
        if not text or len(text.strip()) < 10:
            return self._empty_analysis()
        return {
            'original_text': text,
            'cleaned_text': self._clean_text(text),
            'statistics': self._get_text_statistics(text),
            'structure': self._analyze_structure(text),
            'key_points': self._extract_key_points(text),
            'keywords': self._extract_keywords(text),
            'sentiment': self._analyze_sentiment(text),
            'sections': self._identify_sections(text),
            'data_elements': self._extract_data_elements(text)
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop characters outside a safe set."""
        text = re.sub(r'\s+', ' ', text)
        # Keep word characters, whitespace and basic punctuation only.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)
        return text.strip()

    def _get_text_statistics(self, text: str) -> Dict:
        """Return basic size and readability statistics for *text*."""
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        words = text.split()  # hoisted: original split the text twice
        return {
            'word_count': len(words),
            'char_count': len(text),
            'sentence_count': len(sentences),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            # textstat's Flesch score can fall outside [0, 100]; clamp it.
            'reading_level': min(100, max(0, textstat.flesch_reading_ease(text))),
            'avg_words_per_sentence': round(len(words) / max(1, len(sentences)), 1)
        }

    def _analyze_structure(self, text: str) -> Dict:
        """Analyze document structure (title, headers, lists, layout hint)."""
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        return {
            'has_title': self._detect_title(text),
            'has_headers': self._detect_headers(paragraphs),
            'has_lists': self._detect_lists(text),
            'has_numbers': self._detect_numbers(text),
            'paragraph_count': len(paragraphs),
            'suggested_layout': self._suggest_layout(paragraphs)
        }

    def _extract_key_points(self, text: str) -> List[str]:
        """Extract up to 6 high-scoring sentences as key points.

        Scoring favors longer sentences, sentences containing digits
        (+5) and sentences with emphasis words (+3). Only the first 15
        qualifying sentences (> 20 chars) are considered.
        """
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
        scored_sentences = []
        for sentence in sentences[:15]:
            score = len(sentence.split())
            if re.search(r'\d+', sentence):
                score += 5
            if any(word in sentence.lower() for word in ['important', 'key', 'main', 'significant']):
                score += 3
            scored_sentences.append((sentence, score))
        # Stable sort: ties keep original document order.
        scored_sentences.sort(key=lambda x: x[1], reverse=True)
        return [sent[0] for sent in scored_sentences[:6]]

    def _extract_keywords(self, text: str) -> List[str]:
        """Return up to 12 most frequent non-stop-word terms (>= 3 letters)."""
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        words = [word for word in words if word not in self.stop_words]
        word_freq = Counter(words)
        return [word for word, count in word_freq.most_common(12)]

    def _analyze_sentiment(self, text: str) -> str:
        """Crude lexicon-based sentiment: 'positive', 'negative' or 'neutral'.

        Counts distinct matches only (set intersection), so repeated
        words do not weight the result.
        """
        positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful', 'positive', 'success', 'achieve', 'benefit', 'advantage'}
        negative_words = {'bad', 'terrible', 'awful', 'negative', 'problem', 'issue', 'challenge', 'difficult', 'risk', 'disadvantage'}
        words = set(text.lower().split())
        positive_count = len(words & positive_words)
        negative_count = len(words & negative_words)
        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        else:
            return 'neutral'

    def _identify_sections(self, text: str) -> List[Dict]:
        """Split text into up to 8 classified, prioritized sections."""
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        sections = []
        for i, paragraph in enumerate(paragraphs[:8]):
            sections.append({
                'id': i + 1,  # 1-based section ids
                'content': paragraph,
                'type': self._classify_paragraph_type(paragraph),
                'word_count': len(paragraph.split()),
                'priority': self._calculate_priority(paragraph)
            })
        return sections

    def _extract_data_elements(self, text: str) -> Dict:
        """Extract numbers, percentages, currency amounts and dates."""
        numbers = re.findall(r'\b\d+(?:\.\d+)?(?:%|\$|€|£)?\b', text)
        percentages = re.findall(r'\d+(?:\.\d+)?%', text)
        # Optional k/m/b suffix for thousands/millions/billions ($3.5m).
        currencies = re.findall(r'[\$€£]\d+(?:\.\d+)?(?:[kmb])?', text)
        # Bare 4-digit years or d/m/y-style dates.
        dates = re.findall(r'\b\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        return {
            'numbers': numbers[:10],
            'percentages': percentages[:5],
            'currencies': currencies[:5],
            'dates': dates[:5],
            'has_data': bool(numbers or percentages or currencies)
        }

    def _detect_title(self, text: str) -> bool:
        """Heuristic: a non-empty, short first line (<100 chars, <12 words)."""
        first_line = text.split('\n')[0].strip()
        # Fix: an empty first line previously counted as a title.
        return bool(first_line) and len(first_line) < 100 and len(first_line.split()) < 12

    def _detect_headers(self, paragraphs: List[str]) -> bool:
        """Heuristic: at least two short (<8 word) paragraphs look like headers."""
        short_paragraphs = [p for p in paragraphs if len(p.split()) < 8]
        return len(short_paragraphs) >= 2

    def _detect_lists(self, text: str) -> bool:
        """Detect bulleted/numbered lists (>= 2 lines starting with a marker)."""
        list_patterns = [r'^\d+\.', r'^\-', r'^\*', r'^\•']
        list_count = 0
        for line in text.split('\n'):
            line = line.strip()
            if any(re.match(pattern, line) for pattern in list_patterns):
                list_count += 1
        return list_count >= 2

    def _detect_numbers(self, text: str) -> bool:
        """True when the text contains at least three numeric tokens."""
        return len(re.findall(r'\b\d+', text)) >= 3

    def _suggest_layout(self, paragraphs: List[str]) -> str:
        """Suggest a layout name based on paragraph count and brevity."""
        if len(paragraphs) <= 3:
            return "Vertical"
        elif any(len(p.split()) < 10 for p in paragraphs[:3]):
            return "Grid"
        elif len(paragraphs) >= 6:
            return "Flow"
        else:
            return "Horizontal"

    def _classify_paragraph_type(self, paragraph: str) -> str:
        """Classify a paragraph as header / key_point / data / body.

        NOTE: checked in this order on purpose — length wins over digit
        content, so only paragraphs of >= 25 words can be typed 'data'.
        """
        word_count = len(paragraph.split())
        if word_count < 8:
            return "header"
        elif word_count < 25:
            return "key_point"
        elif re.search(r'\d+', paragraph):
            return "data"
        else:
            return "body"

    def _calculate_priority(self, paragraph: str) -> int:
        """Score a paragraph for display priority, capped at 100.

        Base score is the word count; digits add +10 and emphasis
        words add +15.
        """
        priority = len(paragraph.split())
        if re.search(r'\d+', paragraph):
            priority += 10
        if any(word in paragraph.lower() for word in ['important', 'key', 'main', 'critical']):
            priority += 15
        return min(priority, 100)

    def _empty_analysis(self) -> Dict:
        """Return the zeroed analysis skeleton used for empty/short input."""
        return {
            'original_text': '',
            'cleaned_text': '',
            'statistics': {'word_count': 0, 'char_count': 0, 'sentence_count': 0, 'paragraph_count': 0, 'reading_level': 0, 'avg_words_per_sentence': 0},
            'structure': {'has_title': False, 'has_headers': False, 'has_lists': False, 'has_numbers': False, 'paragraph_count': 0, 'suggested_layout': 'Vertical'},
            'key_points': [],
            'keywords': [],
            'sentiment': 'neutral',
            'sections': [],
            'data_elements': {'numbers': [], 'percentages': [], 'currencies': [], 'dates': [], 'has_data': False}
        }