|
|
|
|
|
""" |
|
|
Text processing and analysis module |
|
|
""" |
|
|
import re |
|
|
import nltk |
|
|
import textstat |
|
|
from typing import Dict, List, Tuple, Optional |
|
|
from collections import Counter |
|
|
import logging |
|
|
|
|
|
|
|
|
# Configure the module logger before the NLTK bootstrap so that download
# failures can be reported instead of silently discarded.
logger = logging.getLogger(__name__)

# Best-effort download of the NLTK data this module can use. Network or
# permission problems must not prevent importing the module: TextProcessor
# has built-in fallbacks (e.g. a hard-coded stop-word list).
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
    # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt
    # and SystemExit); keep import-time failures non-fatal but traceable.
    logger.debug("NLTK data download failed; using built-in fallbacks", exc_info=True)
|
|
|
|
|
class TextProcessor:
    """Process and analyze text content for infographic generation.

    Every method here is heuristic: regexes, word counts and small
    hand-picked word lists stand in for full NLP models, so results are
    best-effort hints for layout/summarisation, not ground truth.
    """

    def __init__(self):
        """Build the English stop-word set, preferring the NLTK corpus.

        Falls back to a small built-in list when NLTK is not installed
        (ImportError) or the 'stopwords' data was never downloaded
        (LookupError from ``stopwords.words``).
        """
        self.stop_words = set()
        try:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))
        except (ImportError, LookupError):
            # Narrowed from a bare `except:` so control-flow exceptions
            # (KeyboardInterrupt, SystemExit) are no longer swallowed.
            # Minimal fallback so keyword extraction still filters noise.
            self.stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
                'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did'
            }

    def analyze_text(self, text: str) -> Dict:
        """Run every analysis pass and return one combined dict.

        Returns the ``_empty_analysis()`` skeleton for missing or trivially
        short input (fewer than 10 characters after stripping), so callers
        always receive the full key set.
        """
        if not text or len(text.strip()) < 10:
            return self._empty_analysis()

        return {
            'original_text': text,
            'cleaned_text': self._clean_text(text),
            'statistics': self._get_text_statistics(text),
            'structure': self._analyze_structure(text),
            'key_points': self._extract_key_points(text),
            'keywords': self._extract_keywords(text),
            'sentiment': self._analyze_sentiment(text),
            'sections': self._identify_sections(text),
            'data_elements': self._extract_data_elements(text)
        }

    def _clean_text(self, text: str) -> str:
        """Collapse runs of whitespace and drop characters outside a small
        allow-list (word characters, whitespace, common punctuation)."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)
        return text.strip()

    def _get_text_statistics(self, text: str) -> Dict:
        """Return basic counts plus a clamped Flesch reading-ease score."""
        words = text.split()  # hoisted: feeds two metrics below
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]

        return {
            'word_count': len(words),
            'char_count': len(text),
            'sentence_count': len(sentences),
            # Paragraphs here are blank-line separated (unlike
            # _analyze_structure, which splits on single newlines).
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            # Flesch scores can fall outside [0, 100] on odd input; clamp.
            'reading_level': min(100, max(0, textstat.flesch_reading_ease(text))),
            # max(1, ...) guards division by zero on sentence-less text.
            'avg_words_per_sentence': round(len(words) / max(1, len(sentences)), 1)
        }

    def _analyze_structure(self, text: str) -> Dict:
        """Describe structural features used to pick a layout.

        Paragraphs are single-newline separated here, so this
        paragraph_count can legitimately differ from the one in
        _get_text_statistics (blank-line separated).
        """
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

        return {
            'has_title': self._detect_title(text),
            'has_headers': self._detect_headers(paragraphs),
            'has_lists': self._detect_lists(text),
            'has_numbers': self._detect_numbers(text),
            'paragraph_count': len(paragraphs),
            'suggested_layout': self._suggest_layout(paragraphs)
        }

    def _extract_key_points(self, text: str) -> List[str]:
        """Pick up to 6 high-scoring sentences as candidate key points.

        Heuristic scoring: longer sentences score higher, digits add 5,
        emphasis words ('important', 'key', ...) add 3. Only the first 15
        qualifying sentences are considered to bound work on long texts.
        """
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        scored_sentences = []
        for sentence in sentences[:15]:
            score = len(sentence.split())
            if re.search(r'\d+', sentence):
                score += 5
            if any(word in sentence.lower() for word in ['important', 'key', 'main', 'significant']):
                score += 3
            scored_sentences.append((sentence, score))

        scored_sentences.sort(key=lambda x: x[1], reverse=True)
        return [sent[0] for sent in scored_sentences[:6]]

    def _extract_keywords(self, text: str) -> List[str]:
        """Return up to 12 most frequent non-stop-word tokens (3+ letters)."""
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        words = [word for word in words if word not in self.stop_words]
        return [word for word, _count in Counter(words).most_common(12)]

    def _analyze_sentiment(self, text: str) -> str:
        """Classify overall tone via tiny positive/negative word lists.

        Each distinct word counts at most once (set intersection), so
        repetition does not amplify the signal. Ties return 'neutral'.
        """
        positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful', 'positive', 'success', 'achieve', 'benefit', 'advantage'}
        negative_words = {'bad', 'terrible', 'awful', 'negative', 'problem', 'issue', 'challenge', 'difficult', 'risk', 'disadvantage'}

        words = set(text.lower().split())
        positive_count = len(words & positive_words)
        negative_count = len(words & negative_words)

        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        else:
            return 'neutral'

    def _identify_sections(self, text: str) -> List[Dict]:
        """Split text into newline-delimited sections (first 8 only), each
        tagged with a coarse type and a display priority."""
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        sections = []

        for i, paragraph in enumerate(paragraphs[:8]):
            sections.append({
                'id': i + 1,
                'content': paragraph,
                'type': self._classify_paragraph_type(paragraph),
                'word_count': len(paragraph.split()),
                'priority': self._calculate_priority(paragraph)
            })

        return sections

    def _extract_data_elements(self, text: str) -> Dict:
        """Extract numbers, percentages, currency amounts and dates.

        Fix vs. the original numbers pattern: the trailing word-boundary
        anchor after the optional percent/currency suffix could never match
        (non-word characters on both sides), so '50%' was captured as a
        bare '50'. The suffix is now retained. Currency multipliers also
        accept upper case ('$3M' as well as '$3m').
        """
        numbers = re.findall(r'\b\d+(?:\.\d+)?(?:%|\$|€|£)?', text)
        percentages = re.findall(r'\d+(?:\.\d+)?%', text)
        currencies = re.findall(r'[\$€£]\d+(?:\.\d+)?(?:[kmbKMB])?', text)
        dates = re.findall(r'\b\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)

        return {
            # Caps keep the payload small for downstream rendering.
            'numbers': numbers[:10],
            'percentages': percentages[:5],
            'currencies': currencies[:5],
            'dates': dates[:5],
            'has_data': bool(numbers or percentages or currencies)
        }

    def _detect_title(self, text: str) -> bool:
        """Treat a short first line (<100 chars, <12 words) as a title."""
        first_line = text.split('\n')[0].strip()
        return len(first_line) < 100 and len(first_line.split()) < 12

    def _detect_headers(self, paragraphs: List[str]) -> bool:
        """Assume headers exist when 2+ paragraphs are under 8 words."""
        short_paragraphs = [p for p in paragraphs if len(p.split()) < 8]
        return len(short_paragraphs) >= 2

    def _detect_lists(self, text: str) -> bool:
        """Detect bulleted/numbered lists: 2+ lines starting with a marker
        ('1.', '-', '*' or '•'). One combined pattern replaces the original
        inner loop over four patterns — same matches, single pass."""
        marker = re.compile(r'^(?:\d+\.|[-*•])')
        list_count = 0
        for line in text.split('\n'):
            if marker.match(line.strip()):
                list_count += 1
        return list_count >= 2

    def _detect_numbers(self, text: str) -> bool:
        """True when the text contains at least 3 numeric tokens."""
        return len(re.findall(r'\b\d+', text)) >= 3

    def _suggest_layout(self, paragraphs: List[str]) -> str:
        """Map paragraph count/shape to one of the supported layout names."""
        if len(paragraphs) <= 3:
            return "Vertical"
        elif any(len(p.split()) < 10 for p in paragraphs[:3]):
            # Several short leading paragraphs suggest card-like content.
            return "Grid"
        elif len(paragraphs) >= 6:
            return "Flow"
        else:
            return "Horizontal"

    def _classify_paragraph_type(self, paragraph: str) -> str:
        """Classify a paragraph as header/key_point/data/body.

        Length checks run first, so the digit check only ever applies to
        paragraphs of 25+ words.
        """
        word_count = len(paragraph.split())

        if word_count < 8:
            return "header"
        elif word_count < 25:
            return "key_point"
        elif re.search(r'\d+', paragraph):
            return "data"
        else:
            return "body"

    def _calculate_priority(self, paragraph: str) -> int:
        """Score display priority, capped at 100: base = word count,
        +10 when the paragraph contains digits, +15 for emphasis words."""
        priority = len(paragraph.split())

        if re.search(r'\d+', paragraph):
            priority += 10
        if any(word in paragraph.lower() for word in ['important', 'key', 'main', 'critical']):
            priority += 15

        return min(priority, 100)

    def _empty_analysis(self) -> Dict:
        """Return the analysis skeleton used for empty/too-short input;
        keys mirror analyze_text so callers never need a null check."""
        return {
            'original_text': '',
            'cleaned_text': '',
            'statistics': {'word_count': 0, 'char_count': 0, 'sentence_count': 0, 'paragraph_count': 0, 'reading_level': 0, 'avg_words_per_sentence': 0},
            'structure': {'has_title': False, 'has_headers': False, 'has_lists': False, 'has_numbers': False, 'paragraph_count': 0, 'suggested_layout': 'Vertical'},
            'key_points': [],
            'keywords': [],
            'sentiment': 'neutral',
            'sections': [],
            'data_elements': {'numbers': [], 'percentages': [], 'currencies': [], 'dates': [], 'has_data': False}
        }