Spaces:

sandylolpotty
/

CatalystGPT-4

Sleeping

App Files Files Community

sandylolpotty committed on Aug 15

Commit

a357512

verified ·

1 Parent(s): a5cabf8

Create app.py

Browse files

Files changed (1) hide show

app.py +1164 -0

app.py ADDED Viewed

	@@ -0,0 +1,1164 @@

+import gradio as gr
+import os
+import re
+import json
+import tempfile
+import hashlib
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Tuple, Optional, Union
+import logging
+# Configure logging
+# The root logger is set to INFO; this module logs through its own named logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Optional imports for document processing
+# Each *_AVAILABLE flag records whether the matching library could be imported,
+# so later code can test the flag instead of importing again.
+try:
+    from docx import Document
+    DOCX_AVAILABLE = True
+except ImportError:
+    DOCX_AVAILABLE = False
+    logger.warning("python-docx not installed. DOCX processing will be disabled.")
+try:
+    import PyPDF2
+    PDF_AVAILABLE = True
+except ImportError:
+    PDF_AVAILABLE = False
+    logger.warning("PyPDF2 not installed. PDF processing will be disabled.")
+# PyMuPDF is a second, independent PDF backend; its absence is not logged.
+try:
+    import fitz  # PyMuPDF - alternative PDF processor
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+# Optional imports for advanced text processing
+try:
+    import nltk
+    from nltk.tokenize import sent_tokenize, word_tokenize
+    from nltk.corpus import stopwords
+    from nltk.frequency import FreqDist
+    from nltk.sentiment import SentimentIntensityAnalyzer
+    NLTK_AVAILABLE = True
+    # Download required NLTK data
+    required_nltk_data = ['punkt', 'stopwords', 'vader_lexicon']
+    for data_name in required_nltk_data:
+        try:
+            if data_name == 'punkt':
+                nltk.data.find('tokenizers/punkt')
+            elif data_name == 'stopwords':
+                nltk.data.find('corpora/stopwords')
+            elif data_name == 'vader_lexicon':
+                nltk.data.find('vader_lexicon')
+        except LookupError:
+            nltk.download(data_name, quiet=True)
+except ImportError:
+    NLTK_AVAILABLE = False
+    logger.warning("NLTK not installed. Advanced text analysis will be limited.")
+# Optional imports for AI summarization (transformers + torch)
+try:
+    from transformers import pipeline
+    import torch
+    TRANSFORMERS_AVAILABLE = True
+    # DEVICE is later passed as pipeline(device=...): 0 when CUDA is available,
+    # otherwise -1 (presumably meaning CPU to transformers - confirm).
+    DEVICE = 0 if torch.cuda.is_available() else -1
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+    DEVICE = -1
+    logger.warning("transformers not installed. AI summarization will use basic extraction methods.")
+class AdvancedDocumentSummarizer:
+    """CatalystGPT-4 Advanced Document Summarizer with enhanced features"""
+    def __init__(self):
+        self.summarizer = None
+        self.sentiment_analyzer = None
+        self.cache = {}
+        # Initialize AI models
+        if TRANSFORMERS_AVAILABLE:
+            self._initialize_ai_models()
+        # Initialize sentiment analyzer
+        if NLTK_AVAILABLE:
+            try:
+                self.sentiment_analyzer = SentimentIntensityAnalyzer()
+            except Exception as e:
+                logger.warning(f"Failed to initialize sentiment analyzer: {e}")
+    def _initialize_ai_models(self):
+        """Initialize AI models with error handling and fallbacks"""
+        models_to_try = [
+            "facebook/bart-large-cnn",
+            "t5-small",
+            "google/pegasus-xsum"
+        ]
+        for model_name in models_to_try:
+            try:
+                self.summarizer = pipeline(
+                    "summarization",
+                    model=model_name,
+                    device=DEVICE,
+                    torch_dtype=torch.float16 if DEVICE >= 0 else torch.float32
+                )
+                logger.info(f"Successfully loaded {model_name}")
+                break
+            except Exception as e:
+                logger.warning(f"Failed to load {model_name}: {e}")
+                continue
+    def _get_file_hash(self, file_path: str) -> str:
+        """Generate hash for file caching"""
+        try:
+            with open(file_path, 'rb') as f:
+                content = f.read()
+            return hashlib.md5(content).hexdigest()
+        except Exception:
+            return str(datetime.now().timestamp())
+    def extract_text_from_pdf(self, file_path: str) -> str:
+        """Enhanced PDF text extraction with better error handling"""
+        text = ""
+        # Try PyMuPDF first (generally better)
+        if PYMUPDF_AVAILABLE:
+            try:
+                doc = fitz.open(file_path)
+                for page_num, page in enumerate(doc):
+                    page_text = page.get_text()
+                    if page_text.strip():  # Only add non-empty pages
+                        text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
+                doc.close()
+                if text.strip():
+                    return text
+            except Exception as e:
+                logger.error(f"PyMuPDF extraction failed: {e}")
+        # Fallback to PyPDF2
+        if PDF_AVAILABLE:
+            try:
+                with open(file_path, 'rb') as file:
+                    pdf_reader = PyPDF2.PdfReader(file)
+                    for page_num, page in enumerate(pdf_reader.pages):
+                        page_text = page.extract_text()
+                        if page_text.strip():
+                            text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
+                if text.strip():
+                    return text
+            except Exception as e:
+                logger.error(f"PyPDF2 extraction failed: {e}")
+        return "PDF processing libraries not available or extraction failed."
+    def extract_text_from_docx(self, file_path: str) -> str:
+        """Enhanced DOCX extraction with better formatting preservation"""
+        if not DOCX_AVAILABLE:
+            return "python-docx library not available."
+        try:
+            doc = Document(file_path)
+            text_parts = []
+            # Extract paragraphs
+            for paragraph in doc.paragraphs:
+                if paragraph.text.strip():
+                    text_parts.append(paragraph.text)
+            # Extract tables
+            for table_num, table in enumerate(doc.tables):
+                text_parts.append(f"\n--- Table {table_num + 1} ---")
+                for row in table.rows:
+                    row_text = " | ".join(cell.text.strip() for cell in row.cells)
+                    if row_text.strip():
+                        text_parts.append(row_text)
+            return "\n".join(text_parts)
+        except Exception as e:
+            logger.error(f"Error processing DOCX file: {e}")
+            return f"Error processing DOCX file: {str(e)}"
+    def get_enhanced_document_stats(self, text: str) -> Dict:
+        """Get comprehensive document statistics with sentiment analysis"""
+        if not text.strip():
+            return {}
+        # Basic stats
+        word_count = len(text.split())
+        char_count = len(text)
+        char_count_no_spaces = len(text.replace(' ', ''))
+        paragraph_count = len([p for p in text.split('\n\n') if p.strip()])
+        stats = {
+            'word_count': word_count,
+            'character_count': char_count,
+            'character_count_no_spaces': char_count_no_spaces,
+            'paragraph_count': paragraph_count,
+            'estimated_reading_time': max(1, round(word_count / 200)),  # 200 WPM average
+            'estimated_speaking_time': max(1, round(word_count / 150))  # 150 WPM speaking
+        }
+        if NLTK_AVAILABLE:
+            sentences = sent_tokenize(text)
+            stats['sentence_count'] = len(sentences)
+            stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0
+            # Word frequency analysis
+            words = word_tokenize(text.lower())
+            stop_words = set(stopwords.words('english'))
+            filtered_words = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 2]
+            if filtered_words:
+                freq_dist = FreqDist(filtered_words)
+                stats['top_words'] = freq_dist.most_common(15)
+                stats['unique_words'] = len(set(filtered_words))
+                stats['lexical_diversity'] = round(len(set(filtered_words)) / len(filtered_words), 3) if filtered_words else 0
+            # Sentiment analysis
+            if self.sentiment_analyzer:
+                try:
+                    sentiment_scores = self.sentiment_analyzer.polarity_scores(text[:5000])  # Limit for performance
+                    stats['sentiment'] = {
+                        'compound': round(sentiment_scores['compound'], 3),
+                        'positive': round(sentiment_scores['pos'], 3),
+                        'negative': round(sentiment_scores['neg'], 3),
+                        'neutral': round(sentiment_scores['neu'], 3)
+                    }
+                except Exception as e:
+                    logger.error(f"Sentiment analysis failed: {e}")
+        else:
+            # Fallback without NLTK
+            sentences = [s.strip() for s in text.split('.') if s.strip()]
+            stats['sentence_count'] = len(sentences)
+            stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0
+            words = re.findall(r'\b\w+\b', text.lower())
+            word_freq = {}
+            for word in words:
+                if len(word) > 2:
+                    word_freq[word] = word_freq.get(word, 0) + 1
+            stats['top_words'] = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15]
+            stats['unique_words'] = len(set(words))
+        return stats
+    def advanced_extractive_summary(self, text: str, num_sentences: int = 3) -> str:
+        """Enhanced extractive summarization with improved sentence scoring"""
+        if not text.strip():
+            return "No text to summarize."
+        if NLTK_AVAILABLE:
+            sentences = sent_tokenize(text)
+        else:
+            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
+        if len(sentences) <= num_sentences:
+            return text
+        # Enhanced sentence scoring
+        scored_sentences = []
+        total_sentences = len(sentences)
+        # Calculate word frequencies for TF scoring
+        all_words = re.findall(r'\b\w+\b', text.lower())
+        word_freq = {}
+        for word in all_words:
+            if len(word) > 2:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        # Important keywords that boost sentence scores
+        importance_keywords = [
+            'conclusion', 'summary', 'result', 'finding', 'important', 'significant',
+            'key', 'main', 'primary', 'essential', 'crucial', 'objective', 'goal',
+            'recommendation', 'suggest', 'propose', 'indicate', 'show', 'demonstrate'
+        ]
+        for i, sentence in enumerate(sentences):
+            if len(sentence.split()) < 5:  # Skip very short sentences
+                continue
+            score = 0
+            sentence_lower = sentence.lower()
+            sentence_words = sentence.split()
+            # Position scoring (beginning and end are more important)
+            if i < total_sentences * 0.15:  # First 15%
+                score += 3
+            elif i > total_sentences * 0.85:  # Last 15%
+                score += 2
+            elif total_sentences * 0.4 <= i <= total_sentences * 0.6:  # Middle section
+                score += 1
+            # Length scoring (prefer moderate length)
+            word_count = len(sentence_words)
+            if 12 <= word_count <= 25:
+                score += 3
+            elif 8 <= word_count <= 35:
+                score += 2
+            elif 5 <= word_count <= 45:
+                score += 1
+            # Keyword importance scoring
+            keyword_score = sum(2 if keyword in sentence_lower else 0 for keyword in importance_keywords)
+            score += min(keyword_score, 6)  # Cap keyword bonus
+            # TF-based scoring (frequency of important words)
+            tf_score = 0
+            for word in sentence_words:
+                word_lower = word.lower()
+                if word_lower in word_freq and len(word_lower) > 3:
+                    tf_score += min(word_freq[word_lower], 5)  # Cap individual word contribution
+            score += min(tf_score / len(sentence_words), 3)  # Normalize by sentence length
+            # Structural indicators
+            if any(indicator in sentence for indicator in [':', '—', '"', '(']):
+                score += 1
+            # Numerical data (often important)
+            if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
+                score += 1
+            scored_sentences.append((sentence, score, i))
+        # Sort by score and select top sentences
+        scored_sentences.sort(key=lambda x: x[1], reverse=True)
+        selected_sentences = scored_sentences[:num_sentences]
+        # Sort selected sentences by original position to maintain flow
+        selected_sentences.sort(key=lambda x: x[2])
+        return ' '.join([s[0] for s in selected_sentences])
+    def intelligent_chunking(self, text: str, max_chunk_size: int = 1024) -> List[str]:
+        """Intelligently chunk text while preserving semantic boundaries"""
+        if len(text) <= max_chunk_size:
+            return [text]
+        chunks = []
+        # Try to split by double newlines first (paragraphs)
+        paragraphs = text.split('\n\n')
+        current_chunk = ""
+        for paragraph in paragraphs:
+            # If single paragraph is too long, split by sentences
+            if len(paragraph) > max_chunk_size:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = ""
+                # Split long paragraph by sentences
+                if NLTK_AVAILABLE:
+                    sentences = sent_tokenize(paragraph)
+                else:
+                    sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
+                temp_chunk = ""
+                for sentence in sentences:
+                    if len(temp_chunk + sentence) <= max_chunk_size:
+                        temp_chunk += sentence + ". "
+                    else:
+                        if temp_chunk:
+                            chunks.append(temp_chunk.strip())
+                        temp_chunk = sentence + ". "
+                if temp_chunk:
+                    current_chunk = temp_chunk
+            else:
+                # Normal paragraph processing
+                if len(current_chunk + paragraph) <= max_chunk_size:
+                    current_chunk += paragraph + "\n\n"
+                else:
+                    if current_chunk:
+                        chunks.append(current_chunk.strip())
+                    current_chunk = paragraph + "\n\n"
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return [chunk for chunk in chunks if chunk.strip()]
+    def ai_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str:
+        """Enhanced AI-powered summarization with better chunking and error handling"""
+        if not self.summarizer:
+            return self.advanced_extractive_summary(text)
+        try:
+            # Intelligent chunking
+            chunks = self.intelligent_chunking(text, 1000)  # Slightly smaller chunks for better quality
+            if not chunks:
+                return "No meaningful content found for summarization."
+            summaries = []
+            for i, chunk in enumerate(chunks):
+                if len(chunk.strip()) < 50:  # Skip very short chunks
+                    continue
+                try:
+                    # Adjust parameters based on chunk size
+                    chunk_max_length = min(max_length, max(50, len(chunk.split()) // 3))
+                    chunk_min_length = min(min_length, chunk_max_length // 2)
+                    summary = self.summarizer(
+                        chunk,
+                        max_length=chunk_max_length,
+                        min_length=chunk_min_length,
+                        do_sample=False,
+                        truncation=True
+                    )
+                    summaries.append(summary[0]['summary_text'])
+                except Exception as e:
+                    logger.warning(f"Error summarizing chunk {i}: {e}")
+                    # Fallback to extractive summary for this chunk
+                    fallback_summary = self.advanced_extractive_summary(chunk, 2)
+                    if fallback_summary and fallback_summary != "No text to summarize.":
+                        summaries.append(fallback_summary)
+            if not summaries:
+                return self.advanced_extractive_summary(text)
+            # Combine and refine summaries
+            if len(summaries) == 1:
+                return summaries[0]
+            else:
+                combined_summary = ' '.join(summaries)
+                # If combined summary is still too long, summarize again
+                if len(combined_summary.split()) > max_length * 1.5:
+                    try:
+                        final_summary = self.summarizer(
+                            combined_summary,
+                            max_length=max_length,
+                            min_length=min_length,
+                            do_sample=False,
+                            truncation=True
+                        )
+                        return final_summary[0]['summary_text']
+                    except Exception:
+                        return combined_summary[:max_length * 10]  # Rough character limit fallback
+                return combined_summary
+        except Exception as e:
+            logger.error(f"AI summarization failed: {e}")
+            return self.advanced_extractive_summary(text)
+    def generate_enhanced_key_points(self, text: str, num_points: int = 7) -> List[str]:
+        """Generate key points with improved extraction and categorization"""
+        if not text.strip():
+            return []
+        if NLTK_AVAILABLE:
+            sentences = sent_tokenize(text)
+        else:
+            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
+        # Enhanced key point indicators with categories
+        key_indicators = {
+            'conclusions': ['conclusion', 'conclude', 'result', 'outcome', 'finding', 'discovered'],
+            'objectives': ['objective', 'goal', 'purpose', 'aim', 'target', 'mission'],
+            'methods': ['method', 'approach', 'technique', 'procedure', 'process', 'way'],
+            'importance': ['important', 'significant', 'crucial', 'essential', 'key', 'main', 'primary'],
+            'recommendations': ['recommend', 'suggest', 'propose', 'should', 'must', 'need to'],
+            'problems': ['problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'concern'],
+            'benefits': ['benefit', 'advantage', 'improvement', 'enhancement', 'positive', 'gain']
+        }
+        scored_sentences = []
+        for sentence in sentences:
+            if len(sentence.split()) < 6:  # Skip very short sentences
+                continue
+            score = 0
+            sentence_lower = sentence.lower()
+            category = 'general'
+            # Category-based scoring
+            for cat, indicators in key_indicators.items():
+                category_score = sum(2 if indicator in sentence_lower else 0 for indicator in indicators)
+                if category_score > score:
+                    score = category_score
+                    category = cat
+            # Structural scoring
+            if sentence.strip().startswith(('•', '-', '1.', '2.', '3.', '4.', '5.')):
+                score += 4
+            # Punctuation indicators
+            if any(punct in sentence for punct in [':', ';', '—', '"']):
+                score += 1
+            # Length scoring (prefer moderate length for key points)
+            word_count = len(sentence.split())
+            if 8 <= word_count <= 20:
+                score += 3
+            elif 6 <= word_count <= 30:
+                score += 2
+            elif 4 <= word_count <= 40:
+                score += 1
+            # Numerical data bonus
+            if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
+                score += 2
+            # Avoid very generic sentences
+            generic_words = ['the', 'this', 'that', 'there', 'it', 'they']
+            if sentence.split()[0].lower() in generic_words:
+                score -= 1
+            if score > 0:
+                scored_sentences.append((sentence.strip(), score, category))
+        # Sort by score and diversify by category
+        scored_sentences.sort(key=lambda x: x[1], reverse=True)
+        # Select diverse key points
+        selected_points = []
+        used_categories = set()
+        # First pass: get the highest scoring point from each category
+        for sentence, score, category in scored_sentences:
+            if len(selected_points) >= num_points:
+                break
+            if category not in used_categories:
+                selected_points.append(sentence)
+                used_categories.add(category)
+        # Second pass: fill remaining slots with highest scoring sentences
+        for sentence, score, category in scored_sentences:
+            if len(selected_points) >= num_points:
+                break
+            if sentence not in selected_points:
+                selected_points.append(sentence)
+        return selected_points[:num_points]
+    def generate_document_outline(self, text: str) -> List[str]:
+        """Generate a structured outline of the document"""
+        if not text.strip():
+            return []
+        lines = text.split('\n')
+        outline = []
+        # Look for headers, numbered sections, etc.
+        header_patterns = [
+            r'^#{1,6}\s+(.+)$',  # Markdown headers
+            r'^(\d+\.?\s+[A-Z][^.]{10,})$',  # Numbered sections
+            r'^([A-Z][A-Z\s]{5,})$',  # ALL CAPS headers
+            r'^([A-Z][a-z\s]{10,}:)$',  # Title Case with colon
+        ]
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            for pattern in header_patterns:
+                match = re.match(pattern, line)
+                if match:
+                    outline.append(match.group(1).strip())
+                    break
+        return outline[:10]  # Limit to 10 outline items
+    def process_document(self, file_path: str, summary_type: str = "ai",
+                        summary_length: str = "medium") -> Tuple[Optional[Dict], Optional[str]]:
+        """Enhanced document processing with caching and comprehensive analysis"""
+        if not file_path:
+            return None, "No file provided."
+        try:
+            # Check cache
+            file_hash = self._get_file_hash(file_path)
+            cache_key = f"{file_hash}_{summary_type}_{summary_length}"
+            if cache_key in self.cache:
+                logger.info("Returning cached result")
+                return self.cache[cache_key], None
+            # Extract text based on file type
+            file_extension = Path(file_path).suffix.lower()
+            if file_extension == '.pdf':
+                text = self.extract_text_from_pdf(file_path)
+            elif file_extension == '.docx':
+                text = self.extract_text_from_docx(file_path)
+            elif file_extension in ['.txt', '.md', '.rtf']:
+                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                    text = f.read()
+            else:
+                return None, f"Unsupported file type: {file_extension}"
+            if not text.strip() or "not available" in text.lower():
+                return None, "No text could be extracted from the document or extraction failed."
+            # Clean text
+            text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce excessive newlines
+            text = re.sub(r' {2,}', ' ', text)  # Reduce excessive spaces
+            # Get comprehensive statistics
+            stats = self.get_enhanced_document_stats(text)
+            # Generate summary based on type and length
+            length_params = {
+                "short": {"sentences": 2, "max_length": 80, "min_length": 30},
+                "medium": {"sentences": 4, "max_length": 150, "min_length": 50},
+                "long": {"sentences": 6, "max_length": 250, "min_length": 100},
+                "detailed": {"sentences": 8, "max_length": 400, "min_length": 150}
+            }
+            params = length_params.get(summary_length, length_params["medium"])
+            # Generate summary
+            if summary_type == "ai" and self.summarizer:
+                summary = self.ai_summary(text, params["max_length"], params["min_length"])
+            else:
+                summary = self.advanced_extractive_summary(text, params["sentences"])
+            # Generate enhanced features
+            key_points = self.generate_enhanced_key_points(text, 7)
+            outline = self.generate_document_outline(text)
+            # Calculate readability (simple approximation)
+            avg_sentence_length = stats.get('avg_sentence_length', 0)
+            readability_score = max(0, min(100, 100 - (avg_sentence_length * 2)))
+            result = {
+                'original_text': text[:2000] + "..." if len(text) > 2000 else text,  # Truncate for display
+                'full_text_length': len(text),
+                'summary': summary,
+                'key_points': key_points,
+                'outline': outline,
+                'stats': stats,
+                'readability_score': readability_score,
+                'file_name': Path(file_path).name,
+                'file_size': os.path.getsize(file_path),
+                'processing_time': datetime.now().isoformat(),
+                'summary_type': summary_type,
+                'summary_length': summary_length,
+                'model_used': 'AI (BART/T5)' if self.summarizer else 'Extractive'
+            }
+            # Cache result
+            self.cache[cache_key] = result
+            return result, None
+        except Exception as e:
+            logger.error(f"Document processing error: {e}")
+            return None, f"Error processing document: {str(e)}"
+def create_catalyst_interface():
+    """Create the CatalystGPT-4 document summarizer interface"""
+    summarizer = AdvancedDocumentSummarizer()
+    # Enhanced CSS with modern styling
+    css = """
+    .catalyst-header {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 30px;
+        border-radius: 20px;
+        text-align: center;
+        margin-bottom: 25px;
+        box-shadow: 0 10px 30px rgba(0,0,0,0.2);
+    }
+    .summary-container {
+        background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
+        color: white;
+        padding: 25px;
+        border-radius: 15px;
+        margin: 15px 0;
+        box-shadow: 0 8px 25px rgba(0,0,0,0.15);
+    }
+    .stats-container {
+        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+        color: white;
+        padding: 20px;
+        border-radius: 12px;
+        margin: 15px 0;
+        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
+    }
+    .key-points-container {
+        background: linear-gradient(135deg, #4ecdc4 0%, #44a08d 100%);
+        color: white;
+        padding: 20px;
+        border-radius: 12px;
+        margin: 15px 0;
+        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
+    }
+    .outline-container {
+        background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
+        color: white;
+        padding: 20px;
+        border-radius: 12px;
+        margin: 15px 0;
+        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
+    }
+    .error-container {
+        background: linear-gradient(135deg, #ff9a9e 0%, #fecfef 100%);
+        color: #721c24;
+        padding: 20px;
+        border-radius: 12px;
+        margin: 15px 0;
+        border-left: 5px solid #dc3545;
+    }
+    .control-panel {
+        background: linear-gradient(135deg, #f6f9fc 0%, #e9ecef 100%);
+        padding: 25px;
+        border-radius: 15px;
+        margin: 15px 0;
+        border: 1px solid #dee2e6;
+        box-shadow: 0 4px 15px rgba(0,0,0,0.05);
+    }
+    .file-upload-area {
+        border: 3px dashed #007bff;
+        border-radius: 15px;
+        padding: 40px;
+        text-align: center;
+        background: linear-gradient(135deg, #f8f9ff 0%, #e3f2fd 100%);
+        transition: all 0.3s ease;
+        margin: 15px 0;
+    }
+    .file-upload-area:hover {
+        border-color: #0056b3;
+        background: linear-gradient(135deg, #f0f7ff 0%, #e1f5fe 100%);
+        transform: translateY(-2px);
+    }
+    .metric-card {
+        background: white;
+        padding: 15px;
+        border-radius: 10px;
+        margin: 5px;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        text-align: center;
+    }
+    .sentiment-indicator {
+        display: inline-block;
+        padding: 5px 12px;
+        border-radius: 20px;
+        font-weight: bold;
+        font-size: 12px;
+        margin: 2px;
+    }
+    .sentiment-positive { background: #d4edda; color: #155724; }
+    .sentiment-negative { background: #f8d7da; color: #721c24; }
+    .sentiment-neutral { background: #d1ecf1; color: #0c5460; }
+    .progress-bar {
+        background: #e9ecef;
+        border-radius: 10px;
+        overflow: hidden;
+        height: 8px;
+        margin: 5px 0;
+    }
+    .progress-fill {
+        height: 100%;
+        background: linear-gradient(90deg, #28a745, #20c997);
+        transition: width 0.3s ease;
+    }
+    """
+    def format_file_size(size_bytes):
+        """Convert bytes to human readable format"""
+        for unit in ['B', 'KB', 'MB', 'GB']:
+            if size_bytes < 1024.0:
+                return f"{size_bytes:.1f} {unit}"
+            size_bytes /= 1024.0
+        return f"{size_bytes:.1f} TB"
+    def get_sentiment_indicator(sentiment_score):
+        """Get sentiment indicator HTML"""
+        if sentiment_score > 0.1:
+            return '<span class="sentiment-indicator sentiment-positive">😊 Positive</span>'
+        elif sentiment_score < -0.1:
+            return '<span class="sentiment-indicator sentiment-negative">😔 Negative</span>'
+        else:
+            return '<span class="sentiment-indicator sentiment-neutral">😐 Neutral</span>'
+    def process_and_display(file, summary_type, summary_length, enable_ai_features):
+        """Summarize the uploaded document and render the result panels.
+        Text that comes from the document or from an error (file name,
+        summary, key points, outline entries, frequent words, error messages)
+        is HTML-escaped before being embedded, so markup inside an uploaded
+        file is displayed literally instead of being rendered.
+        Args:
+            file: Value of the upload component, or None when nothing is
+                uploaded.
+            summary_type: Requested summarization method; replaced by
+                "extractive" when enable_ai_features is falsy.
+            summary_length: Length preset handed to the summarizer unchanged.
+            enable_ai_features: Whether the requested summary_type is honoured.
+        Returns:
+            A 5-tuple of gr.update(...) values, in order: summary panel,
+            statistics panel, key-points panel, outline panel, message panel.
+            On success the message panel is hidden and the outline panel is
+            shown only when an outline exists; in every other case only the
+            message panel is shown.
+        """
+        # Imported locally so the function brings its own dependency with it
+        from html import escape
+        def _text(value):
+            """Return value converted to str and HTML-escaped."""
+            return escape(str(value))
+        def _message_only(message_html):
+            """Hide the four result panels and show message_html in the fifth."""
+            return (
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(value=message_html, visible=True)
+            )
+        if file is None:
+            return _message_only("""
+                    <div style="text-align: center; padding: 60px; color: #666;">
+                        <h3>🚀 CatalystGPT-4 Ready</h3>
+                        <p>Upload a document to begin advanced AI-powered analysis</p>
+                        <p><small>Supports: PDF, Word (.docx), Text (.txt, .md, .rtf)</small></p>
+                    </div>
+                """)
+        try:
+            # With the AI toggle off the radio selection is ignored
+            actual_summary_type = summary_type if enable_ai_features else "extractive"
+            # Use the upload's `.name` path when present; a plain path string is passed through as-is
+            file_path = getattr(file, "name", file)
+            result, error = summarizer.process_document(file_path, actual_summary_type, summary_length)
+            if error:
+                return _message_only(f'''
+                <div class="error-container">
+                    <h4>❌ Processing Error</h4>
+                    <p><strong>Error:</strong> {_text(error)}</p>
+                    <p><small>Please try a different file or check the file format.</small></p>
+                </div>
+                ''')
+            # NOTE(review): escaping assumes the summarizer returns plain text,
+            # not pre-rendered HTML - confirm against process_document
+            summary_html = f'''
+            <div class="summary-container">
+                <h3>🎯 Document Summary</h3>
+                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin-bottom: 15px;">
+                    <div><strong>📄 File:</strong> {_text(result["file_name"])}</div>
+                    <div><strong>📊 Size:</strong> {format_file_size(result["file_size"])}</div>
+                    <div><strong>🤖 Model:</strong> {_text(result["model_used"])}</div>
+                    <div><strong>📏 Length:</strong> {_text(result["summary_length"].title())}</div>
+                </div>
+                <div style="background: rgba(255,255,255,0.15); padding: 20px; border-radius: 10px; line-height: 1.6;">
+                    {_text(result["summary"])}
+                </div>
+            </div>
+            '''
+            stats = result["stats"]
+            readability = result["readability_score"]
+            # Thresholds: above 70 is easy, above 40 is moderate, the rest is complex
+            if readability > 70:
+                readability_color, readability_text = "#28a745", "Easy"
+            elif readability > 40:
+                readability_color, readability_text = "#ffc107", "Moderate"
+            else:
+                readability_color, readability_text = "#dc3545", "Complex"
+            # The score is used as a CSS width, so keep the bar within 0-100%
+            bar_width = min(100.0, max(0.0, readability))
+            stats_parts = [f'''
+            <div class="stats-container">
+                <h3>📈 Document Analytics</h3>
+                <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;">
+                    <div class="metric-card">
+                        <h4 style="margin: 0; color: #007bff;">📝 {stats["word_count"]:,}</h4>
+                        <small>Words</small>
+                    </div>
+                    <div class="metric-card">
+                        <h4 style="margin: 0; color: #28a745;">⏱️ {_text(stats["estimated_reading_time"])} min</h4>
+                        <small>Reading Time</small>
+                    </div>
+                    <div class="metric-card">
+                        <h4 style="margin: 0; color: #17a2b8;">📑 {stats["sentence_count"]:,}</h4>
+                        <small>Sentences</small>
+                    </div>
+                    <div class="metric-card">
+                        <h4 style="margin: 0; color: #6f42c1;">🧠 {_text(stats.get("unique_words", "N/A"))}</h4>
+                        <small>Unique Words</small>
+                    </div>
+                </div>
+                <div style="margin: 20px 0;">
+                    <h4>📖 Readability Score</h4>
+                    <div class="progress-bar">
+                        <div class="progress-fill" style="width: {bar_width}%; background-color: {readability_color};"></div>
+                    </div>
+                    <p><strong>{readability:.1f}/100</strong> - {readability_text} to read</p>
+                </div>
+            ''']
+            # Sentiment section is optional: only rendered when the stats carry it
+            sentiment = stats.get("sentiment")
+            if sentiment:
+                sentiment_html = get_sentiment_indicator(sentiment["compound"])
+                stats_parts.append(f'''
+                <div style="margin: 20px 0;">
+                    <h4>😊 Document Sentiment</h4>
+                    {sentiment_html}
+                    <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin-top: 10px;">
+                        <small>Positive: {sentiment["positive"]:.2f}</small>
+                        <small>Negative: {sentiment["negative"]:.2f}</small>
+                        <small>Neutral: {sentiment["neutral"]:.2f}</small>
+                    </div>
+                </div>
+                ''')
+            # Word-frequency section is optional too; at most ten entries are shown
+            if stats.get("top_words"):
+                word_chips = " ".join(
+                    f'<span style="background: rgba(255,255,255,0.2); padding: 6px 12px; border-radius: 15px; font-size: 13px;">{_text(word)} ({_text(count)})</span>'
+                    for word, count in stats["top_words"][:10]
+                )
+                stats_parts.append(f'''
+                <div style="margin: 20px 0;">
+                    <h4>🔤 Most Frequent Words</h4>
+                    <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px;">
+                        {word_chips}
+                    </div>
+                </div>
+                ''')
+            # Closes the stats-container div opened in the first part
+            stats_parts.append('</div>')
+            stats_html = "".join(stats_parts)
+            key_point_items = "".join(
+                f'<li style="margin-bottom: 12px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 8px;"><strong>{i}.</strong> {_text(point)}</li>'
+                for i, point in enumerate(result["key_points"], 1)
+            )
+            key_points_html = '''
+            <div class="key-points-container">
+                <h3>🎯 Key Insights</h3>
+                <ul style="list-style: none; padding: 0;">
+            ''' + key_point_items + '</ul></div>'
+            # The outline stays an empty string (and its panel hidden) when absent
+            outline_html = ""
+            if result.get("outline"):
+                outline_items = "".join(
+                    f'<li style="margin-bottom: 8px; padding: 5px 0;">{_text(item)}</li>'
+                    for item in result["outline"]
+                )
+                outline_html = '''
+                <div class="outline-container">
+                    <h3>📋 Document Structure</h3>
+                    <ol style="padding-left: 20px;">
+                ''' + outline_items + '</ol></div>'
+            return (
+                gr.update(value=summary_html, visible=True),
+                gr.update(value=stats_html, visible=True),
+                gr.update(value=key_points_html, visible=True),
+                gr.update(value=outline_html, visible=bool(outline_html)),
+                gr.update(visible=False)
+            )
+        except Exception as e:
+            # UI boundary: keep the traceback in the log and report the failure
+            # in the message panel instead of letting the event handler crash
+            logger.exception("Unexpected error while processing document")
+            return _message_only(f'''
+            <div class="error-container">
+                <h4>💥 Unexpected Error</h4>
+                <p><strong>Details:</strong> {_text(e)}</p>
+                <p><small>Please try again or contact support if the issue persists.</small></p>
+            </div>
+            ''')
+    # Create the main interface
+    with gr.Blocks(css=css, title="🚀 CatalystGPT-4 Document Summarizer", theme=gr.themes.Soft()) as demo:
+        # Header
+        gr.HTML("""
+        <div class="catalyst-header">
+            <h1 style="margin: 0; font-size: 3em; font-weight: bold;">🚀 CatalystGPT-4</h1>
+            <h2 style="margin: 10px 0; font-size: 1.5em; opacity: 0.9;">Advanced Document Summarizer</h2>
+            <p style="margin: 15px 0 0 0; font-size: 1.1em; opacity: 0.8;">
+                Powered by AI • Extractive & Abstractive Summarization • Comprehensive Analytics
+            </p>
+        </div>
+        """)
+        with gr.Row():
+            # Left column - Enhanced Controls
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.HTML('<div class="control-panel">')
+                    gr.Markdown("### 📁 Document Upload")
+                    file_upload = gr.File(
+                        label="Choose your document",
+                        file_types=[".pdf", ".docx", ".txt", ".md", ".rtf"],
+                        elem_classes="file-upload-area"
+                    )
+                    gr.Markdown("### ⚙️ Analysis Settings")
+                    enable_ai_features = gr.Checkbox(
+                        label="🤖 Enable AI Features",
+                        value=TRANSFORMERS_AVAILABLE,
+                        info="Use advanced AI models for better summarization",
+                        interactive=TRANSFORMERS_AVAILABLE
+                    )
+                    summary_type = gr.Radio(
+                        choices=[
+                            ("🧠 AI Summary (Neural)", "ai"),
+                            ("📝 Extractive Summary", "extractive")
+                        ],
+                        value="ai" if TRANSFORMERS_AVAILABLE else "extractive",
+                        label="Summarization Method",
+                        info="AI generates new text, Extractive selects key sentences"
+                    )
+                    summary_length = gr.Radio(
+                        choices=[
+                            ("⚡ Short & Concise", "short"),
+                            ("📄 Standard Length", "medium"),
+                            ("📖 Detailed Analysis", "long"),
+                            ("🔍 Comprehensive Report", "detailed")
+                        ],
+                        value="medium",
+                        label="Analysis Depth",
+                        info="Choose the level of detail for your analysis"
+                    )
+                    analyze_btn = gr.Button(
+                        "🚀 Analyze Document",
+                        variant="primary",
+                        size="lg",
+                        elem_classes="analyze-button"
+                    )
+                    gr.HTML('</div>')
+                # Enhanced Library Status
+                gr.Markdown(f"""
+                ### 📊 System Status
+                **Core Features:**
+                - 📄 **PDF Processing:** {"✅ PyMuPDF" if PYMUPDF_AVAILABLE else ("✅ PyPDF2" if PDF_AVAILABLE else "❌ Not Available")}
+                - 📝 **Word Documents:** {"✅ Available" if DOCX_AVAILABLE else "❌ Install python-docx"}
+                - 🤖 **AI Summarization:** {"✅ Available" if TRANSFORMERS_AVAILABLE else "❌ Install transformers"}
+                - 📈 **Advanced NLP:** {"✅ Available" if NLTK_AVAILABLE else "⚠️ Basic processing"}
+                - 😊 **Sentiment Analysis:** {"✅ Available" if (NLTK_AVAILABLE and summarizer.sentiment_analyzer) else "❌ Not Available"}
+                **Performance:**
+                - 🔧 **Device:** {"GPU" if DEVICE >= 0 else "CPU"}
+                - 💾 **Cache:** {"Enabled" if summarizer.cache is not None else "Disabled"}
+                """)
+            # Right column - Enhanced Results
+            with gr.Column(scale=2):
+                # Welcome message
+                welcome_msg = gr.HTML(
+                    value="""
+                    <div style="text-align: center; padding: 80px 20px; color: #666;">
+                        <div style="font-size: 4em; margin-bottom: 20px;">📚</div>
+                        <h2 style="color: #333; margin-bottom: 15px;">Ready for Analysis</h2>
+                        <p style="font-size: 1.1em; margin-bottom: 10px;">Upload any document to unlock AI-powered insights</p>
+                        <p><small style="color: #888;">Supports PDF, Word, Text, Markdown, and RTF files</small></p>
+                        <div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px; display: inline-block;">
+                            <strong>Features:</strong> AI Summarization • Key Points • Analytics • Sentiment Analysis
+                        </div>
+                    </div>
+                    """,
+                    visible=True
+                )
+                # Results sections
+                summary_display = gr.HTML(visible=False)
+                stats_display = gr.HTML(visible=False)
+                key_points_display = gr.HTML(visible=False)
+                outline_display = gr.HTML(visible=False)
+                error_display = gr.HTML(visible=False)
+        # Event handlers
+        def on_file_change(file):
+            """Reset the result column whenever the upload value changes.
+            Returns six gr.update(...) values: the first sets the welcome
+            panel visible only while no file is selected, the remaining five
+            hide the summary, statistics, key-points, outline and error panels.
+            """
+            show_welcome = file is None
+            hidden_panels = tuple(gr.update(visible=False) for _ in range(5))
+            return (gr.update(visible=show_welcome),) + hidden_panels
+        # Auto-hide welcome when file uploaded
+        file_upload.change(
+            fn=on_file_change,
+            inputs=[file_upload],
+            outputs=[welcome_msg, summary_display, stats_display, key_points_display, outline_display, error_display]
+        )
+        # Process document on button click
+        analyze_btn.click(
+            fn=process_and_display,
+            inputs=[file_upload, summary_type, summary_length, enable_ai_features],
+            outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
+        )
+        # Auto-process when settings change (if file uploaded)
+        for component in [summary_type, summary_length, enable_ai_features]:
+            component.change(
+                fn=process_and_display,
+                inputs=[file_upload, summary_type, summary_length, enable_ai_features],
+                outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
+            )
+        # Enhanced Footer
+        gr.HTML("""
+        <div style="margin-top: 50px; padding: 30px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
+                    border-radius: 15px; text-align: center; border-top: 3px solid #007bff;">
+            <h3 style="color: #333; margin-bottom: 20px;">🛠️ Installation & Setup</h3>
+            <div style="background: #343a40; color: #fff; padding: 15px; border-radius: 8px;
+                        font-family: 'Courier New', monospace; margin: 15px 0;">
+                <strong>Quick Install:</strong><br>
+                pip install gradio python-docx PyPDF2 transformers torch nltk PyMuPDF
+            </div>
+            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 20px;">
+                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <strong>🎯 Core Features</strong><br>
+                    <small>Multi-format support, AI summarization, key insights extraction</small>
+                </div>
+                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <strong>📊 Advanced Analytics</strong><br>
+                    <small>Sentiment analysis, readability scoring, word frequency</small>
+                </div>
+                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <strong>🚀 Performance</strong><br>
+                    <small>Intelligent caching, GPU acceleration, batch processing</small>
+                </div>
+            </div>
+            <p style="margin-top: 20px; color: #666;">
+                <strong>CatalystGPT-4</strong> - Advanced Document Analysis Platform
+            </p>
+        </div>
+        """)
+    return demo
+if __name__ == "__main__":
+    demo = create_catalyst_interface()
+    # `enable_queue` and `show_tips` are not accepted by launch() in Gradio 4.x
+    # (passing them raises TypeError at startup). Queuing is switched on
+    # through Blocks.queue(), which exists in both 3.x and 4.x.
+    demo.queue()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )