"""
Text processing and analysis module
"""
import logging
import re
from collections import Counter
from typing import Dict, List

import nltk
import textstat
logger = logging.getLogger(__name__)

# Download required NLTK data; failures are non-fatal because the class
# falls back to built-in defaults below.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as exc:
    logger.warning("NLTK data download failed: %s", exc)


class TextProcessor:
    """Process and analyze text content for infographic generation"""

    def __init__(self):
        try:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))
        except Exception:
            # Fallback stop words used when the NLTK corpus is unavailable
            self.stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
                'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did'
            }

    def analyze_text(self, text: str) -> Dict:
        """Comprehensive text analysis"""
        if not text or len(text.strip()) < 10:
            return self._empty_analysis()
        analysis = {
            'original_text': text,
            'cleaned_text': self._clean_text(text),
            'statistics': self._get_text_statistics(text),
            'structure': self._analyze_structure(text),
            'key_points': self._extract_key_points(text),
            'keywords': self._extract_keywords(text),
            'sentiment': self._analyze_sentiment(text),
            'sections': self._identify_sections(text),
            'data_elements': self._extract_data_elements(text)
        }
        return analysis

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
        text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)  # drop stray symbols
        return text.strip()

    def _get_text_statistics(self, text: str) -> Dict:
        """Get basic text statistics"""
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        words = text.split()
        return {
            'word_count': len(words),
            'char_count': len(text),
            'sentence_count': len(sentences),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'reading_level': min(100, max(0, textstat.flesch_reading_ease(text))),
            'avg_words_per_sentence': round(len(words) / max(1, len(sentences)), 1)
        }

    def _analyze_structure(self, text: str) -> Dict:
        """Analyze text structure"""
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        return {
            'has_title': self._detect_title(text),
            'has_headers': self._detect_headers(paragraphs),
            'has_lists': self._detect_lists(text),
            'has_numbers': self._detect_numbers(text),
            'paragraph_count': len(paragraphs),
            'suggested_layout': self._suggest_layout(paragraphs)
        }

    def _extract_key_points(self, text: str) -> List[str]:
        """Extract key points from text"""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
        # Heuristic scoring: longer sentences, numeric data, and emphasis
        # words rank higher.
        scored_sentences = []
        for sentence in sentences[:15]:
            score = len(sentence.split())
            if re.search(r'\d+', sentence):
                score += 5
            if any(word in sentence.lower() for word in ['important', 'key', 'main', 'significant']):
                score += 3
            scored_sentences.append((sentence, score))
        scored_sentences.sort(key=lambda x: x[1], reverse=True)
        return [sent[0] for sent in scored_sentences[:6]]

    def _extract_keywords(self, text: str) -> List[str]:
        """Extract important keywords"""
        words = re.findall(r'\b[a-z]{3,}\b', text.lower())
        words = [word for word in words if word not in self.stop_words]
        word_freq = Counter(words)
        return [word for word, count in word_freq.most_common(12)]

    def _analyze_sentiment(self, text: str) -> str:
        """Basic sentiment analysis"""
        positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful',
                          'positive', 'success', 'achieve', 'benefit', 'advantage'}
        negative_words = {'bad', 'terrible', 'awful', 'negative', 'problem',
                          'issue', 'challenge', 'difficult', 'risk', 'disadvantage'}
        # Tokenize on letters so trailing punctuation ("great.") does not
        # prevent a match.
        words = set(re.findall(r'[a-z]+', text.lower()))
        positive_count = len(words & positive_words)
        negative_count = len(words & negative_words)
        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        return 'neutral'
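
    # Illustrative example (hypothetical input): for the sentence
    # "This is a great success despite one problem", two positive matches
    # ('great', 'success') outweigh one negative match ('problem'), so
    # _analyze_sentiment returns 'positive'.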

    def _identify_sections(self, text: str) -> List[Dict]:
        """Identify logical sections in the text"""
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        sections = []
        for i, paragraph in enumerate(paragraphs[:8]):
            section = {
                'id': i + 1,
                'content': paragraph,
                'type': self._classify_paragraph_type(paragraph),
                'word_count': len(paragraph.split()),
                'priority': self._calculate_priority(paragraph)
            }
            sections.append(section)
        return sections

    def _extract_data_elements(self, text: str) -> Dict:
        """Extract numerical and data elements"""
        # No closing \b after the optional symbol: '%' and currency marks are
        # non-word characters, so a trailing word boundary there never matches
        # and the symbol would be silently dropped.
        numbers = re.findall(r'\b\d+(?:\.\d+)?[%$€£]?', text)
        percentages = re.findall(r'\d+(?:\.\d+)?%', text)
        currencies = re.findall(r'[$€£]\d+(?:\.\d+)?[kmb]?', text)
        dates = re.findall(r'\b\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        return {
            'numbers': numbers[:10],
            'percentages': percentages[:5],
            'currencies': currencies[:5],
            'dates': dates[:5],
            'has_data': bool(numbers or percentages or currencies)
        }
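
    # Illustrative example (hypothetical input): "Sales rose 15% to $2.3m in
    # 2023" yields numbers ['15%', '2.3', '2023'], percentages ['15%'],
    # currencies ['$2.3m'], and dates ['2023'].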

    def _detect_title(self, text: str) -> bool:
        """Detect if text has a clear title"""
        first_line = text.split('\n')[0].strip()
        # Guard against a blank first line, which would otherwise pass both
        # length checks.
        return bool(first_line) and len(first_line) < 100 and len(first_line.split()) < 12

    def _detect_headers(self, paragraphs: List[str]) -> bool:
        """Detect if text has headers"""
        short_paragraphs = [p for p in paragraphs if len(p.split()) < 8]
        return len(short_paragraphs) >= 2

    def _detect_lists(self, text: str) -> bool:
        """Detect if text contains lists"""
        # Lines starting with '1.', '-', '*', or '•' count as list items.
        list_pattern = re.compile(r'^(?:\d+\.|[-*•])')
        list_count = sum(
            1 for line in text.split('\n') if list_pattern.match(line.strip())
        )
        return list_count >= 2

    def _detect_numbers(self, text: str) -> bool:
        """Detect if text contains significant numbers"""
        return len(re.findall(r'\b\d+', text)) >= 3

    def _suggest_layout(self, paragraphs: List[str]) -> str:
        """Suggest optimal layout based on content"""
        if len(paragraphs) <= 3:
            return "Vertical"
        elif any(len(p.split()) < 10 for p in paragraphs[:3]):
            return "Grid"
        elif len(paragraphs) >= 6:
            return "Flow"
        return "Horizontal"

    def _classify_paragraph_type(self, paragraph: str) -> str:
        """Classify paragraph type"""
        word_count = len(paragraph.split())
        if word_count < 8:
            return "header"
        elif word_count < 25:
            return "key_point"
        elif re.search(r'\d+', paragraph):
            return "data"
        return "body"

    def _calculate_priority(self, paragraph: str) -> int:
        """Calculate paragraph priority for display"""
        priority = len(paragraph.split())
        if re.search(r'\d+', paragraph):
            priority += 10
        if any(word in paragraph.lower() for word in ['important', 'key', 'main', 'critical']):
            priority += 15
        return min(priority, 100)

    def _empty_analysis(self) -> Dict:
        """Return empty analysis structure"""
        return {
            'original_text': '',
            'cleaned_text': '',
            'statistics': {'word_count': 0, 'char_count': 0, 'sentence_count': 0,
                           'paragraph_count': 0, 'reading_level': 0,
                           'avg_words_per_sentence': 0},
            'structure': {'has_title': False, 'has_headers': False, 'has_lists': False,
                          'has_numbers': False, 'paragraph_count': 0,
                          'suggested_layout': 'Vertical'},
            'key_points': [],
            'keywords': [],
            'sentiment': 'neutral',
            'sections': [],
            'data_elements': {'numbers': [], 'percentages': [], 'currencies': [],
                              'dates': [], 'has_data': False}
        }
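

# Minimal usage sketch; the sample text below is illustrative and not part of
# the original module.
if __name__ == "__main__":
    sample = (
        "Renewable Energy Report\n\n"
        "Global solar capacity grew 22% in 2023, an important milestone, "
        "with $380 billion invested worldwide.\n\n"
        "Key challenges remain: grid storage and permitting delays."
    )
    processor = TextProcessor()
    result = processor.analyze_text(sample)
    print("Keywords:", result['keywords'])
    print("Sentiment:", result['sentiment'])
    print("Suggested layout:", result['structure']['suggested_layout'])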