Update app.py
app.py
CHANGED
@@ -9,9 +9,7 @@ from nltk.corpus import stopwords
 from nltk.sentiment import SentimentIntensityAnalyzer
 from gensim import corpora, models
 import spacy
-from bs4 import BeautifulSoup
 import requests
-import wikipedia
 from langdetect import detect
 import json
 import base64
@@ -19,173 +17,270 @@ from datetime import datetime
 import tempfile
 from fpdf import FPDF
 import os
+from functools import lru_cache
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Any, Optional
+import io

-#
-
-
-
-
-
-
-
-
-
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class TextProcessor:
+    """Handles text input processing and validation"""
+
+    @staticmethod
+    def process_file_upload(uploaded_file) -> Optional[str]:
+        """Process uploaded file and return text content"""
+        try:
+            if uploaded_file is None:
+                return None
+
+            # Get file extension
+            file_extension = uploaded_file.name.split('.')[-1].lower()
+
+            if file_extension == 'txt':
+                return uploaded_file.read().decode('utf-8')
+            else:
+                raise ValueError(f"Unsupported file type: {file_extension}")
+
+        except Exception as e:
+            logger.error(f"Error processing file upload: {str(e)}")
+            st.error(f"Error processing file: {str(e)}")
+            return None
+
+    @staticmethod
+    def validate_text(text: str) -> bool:
+        """Validate input text"""
+        if not text or len(text.strip()) == 0:
+            st.error("Please enter some text to analyze")
+            return False
+        if len(text.split()) > 10000:  # Arbitrary limit
+            st.error("Text is too long. Please enter a shorter text")
+            return False
+        return True

 class AdvancedAnalyzer:
+    """Handles text analysis using various NLP techniques"""
+
     def __init__(self):
-
-        self.sentiment_analyzer = SentimentIntensityAnalyzer()
+        self._initialize_models()

-
+    @lru_cache(maxsize=1)
+    def _initialize_models(self):
+        """Initialize all required models with caching"""
         try:
+            self.sentiment_analyzer = SentimentIntensityAnalyzer()
             self.nlp = spacy.load('en_core_web_sm')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Use multilingual model for non-English text
-            sentiments = self.sentiment_model(text)[0]
-            return {
-                'compound': max(s['score'] for s in sentiments),
-                'emotions': {s['label']: s['score'] for s in sentiments}
-            }
-        else:
-            # Use VADER for English text
-            scores = self.sentiment_analyzer.polarity_scores(text)
-            return {
-                'compound': scores['compound'],
-                'emotions': {
-                    'positive': scores['pos'],
-                    'negative': scores['neg'],
-                    'neutral': scores['neu']
-                }
-            }
-
-    def extract_entities(self, text):
-        """Named Entity Recognition"""
-        doc = self.nlp(text)
-        entities = {}
-        for ent in doc.ents:
-            if ent.label_ not in entities:
-                entities[ent.label_] = []
-            entities[ent.label_].append(ent.text)
-        return entities
-
-    def topic_modeling(self, text):
-        """Extract main topics from text"""
-        # Tokenize and remove stopwords
-        stop_words = set(stopwords.words('english'))
-        tokens = [word.lower() for word in word_tokenize(text)
-                 if word.lower() not in stop_words and word.isalnum()]
+            self.sentiment_model = pipeline(
+                "sentiment-analysis",
+                model="nlptown/bert-base-multilingual-uncased-sentiment",
+                return_all_scores=True
+            )
+            logger.info("Models initialized successfully")
+        except Exception as e:
+            logger.error(f"Error initializing models: {str(e)}")
+            raise
+
+    def analyze_sentiment_batch(self, text: str, batch_size: int = 1000) -> Dict:
+        """Analyze sentiment in batches for better performance"""
+        sentences = sent_tokenize(text)
+        results = []

-
-
-
+        with ThreadPoolExecutor() as executor:
+            for i in range(0, len(sentences), batch_size):
+                batch = sentences[i:i + batch_size]
+                results.extend(executor.map(self.analyze_sentiment, batch))

-        #
-
-
-
-
-
+        # Aggregate results
+        compound = np.mean([r['compound'] for r in results])
+        emotions = {
+            'positive': np.mean([r['emotions']['positive'] for r in results]),
+            'negative': np.mean([r['emotions']['negative'] for r in results]),
+            'neutral': np.mean([r['emotions']['neutral'] for r in results])
+        }

-
-
-
-
-
-
-
-
-
+        return {'compound': compound, 'emotions': emotions}
+
+    def analyze_sentiment(self, text: str, language: str = 'en') -> Dict:
+        """Analyze sentiment with emotion detection"""
+        try:
+            if language != 'en':
+                sentiments = self.sentiment_model(text)[0]
+                return {
+                    'compound': max(s['score'] for s in sentiments),
+                    'emotions': {s['label']: s['score'] for s in sentiments}
+                }
+            else:
+                scores = self.sentiment_analyzer.polarity_scores(text)
+                return {
+                    'compound': scores['compound'],
+                    'emotions': {
+                        'positive': scores['pos'],
+                        'negative': scores['neg'],
+                        'neutral': scores['neu']
+                    }
+                }
+        except Exception as e:
+            logger.error(f"Error in sentiment analysis: {str(e)}")
+            raise
+
+    def extract_entities(self, text: str) -> Dict[str, List[str]]:
+        """Extract named entities with confidence scores"""
+        try:
+            doc = self.nlp(text)
+            entities = {}
+            for ent in doc.ents:
+                if ent.label_ not in entities:
+                    entities[ent.label_] = []
+                # Only include entities with high confidence
+                if ent.label_prob >= 0.8:
+                    entities[ent.label_].append({
+                        'text': ent.text,
+                        'confidence': round(ent.label_prob, 3)
+                    })
+            return entities
+        except Exception as e:
+            logger.error(f"Error in entity extraction: {str(e)}")
+            raise
+
+    def topic_modeling(self, text: str, num_topics: int = 3) -> List[Dict]:
+        """Extract main topics using LDA with preprocessing"""
+        try:
+            # Tokenize and clean text
+            doc = self.nlp(text.lower())
+            tokens = [
+                token.lemma_ for token in doc
+                if not token.is_stop and not token.is_punct and token.is_alpha
+            ]
+
+            # Create dictionary and corpus
+            texts = [tokens]
+            dictionary = corpora.Dictionary(texts)
+            corpus = [dictionary.doc2bow(text) for text in texts]
+
+            # Train LDA model with coherence optimization
+            lda_model = models.LdaModel(
+                corpus=corpus,
+                id2word=dictionary,
+                num_topics=num_topics,
+                random_state=42,
+                passes=15,
+                alpha='auto',
+                per_word_topics=True
+            )
+
+            # Extract topics with probabilities
+            topics = []
+            for idx, topic in lda_model.show_topics(formatted=False):
+                topics.append({
+                    'id': idx,
+                    'words': [(word, round(prob, 4))
+                              for word, prob in topic],
+                    'coherence': round(lda_model.get_topic_coherence(topic), 4)
+                })
+
+            return sorted(topics, key=lambda x: x['coherence'], reverse=True)
+        except Exception as e:
+            logger.error(f"Error in topic modeling: {str(e)}")
+            raise

 class PDFGenerator:
+    """Generates professional PDF reports with visualizations"""
+
     def __init__(self):
         self.pdf = FPDF()

-    def generate_report(self, analysis_results):
-        """Generate a
-
-
-
+    def generate_report(self, analysis_results: Dict) -> str:
+        """Generate a detailed PDF report with charts"""
+        try:
+            self.pdf.add_page()
+            self._add_header()
+            self._add_summary(analysis_results)
+            self._add_sentiment_analysis(analysis_results)
+            self._add_topics(analysis_results)
+            self._add_entities(analysis_results)
+            self._add_footer()
+
+            # Save to temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+                self.pdf.output(tmp.name)
+                return tmp.name
+        except Exception as e:
+            logger.error(f"Error generating PDF report: {str(e)}")
+            raise
+
+    def _add_header(self):
+        """Add report header"""
         self.pdf.set_font('Arial', 'B', 16)
         self.pdf.cell(190, 10, 'AI Output Analysis Report', 0, 1, 'C')
         self.pdf.ln(10)
-
-
+
+    def _add_summary(self, results: Dict):
+        """Add analysis summary"""
         self.pdf.set_font('Arial', 'B', 12)
         self.pdf.cell(190, 10, 'Analysis Summary', 0, 1, 'L')
         self.pdf.set_font('Arial', '', 10)

-
+        compound_score = results['sentiment']['compound']
+        sentiment_label = (
+            'Positive' if compound_score > 0.05
+            else 'Negative' if compound_score < -0.05
+            else 'Neutral'
+        )
+
         self.pdf.cell(190, 10,
-                      f"Overall Sentiment: {
+                      f"Overall Sentiment: {sentiment_label} ({compound_score:.2f})",
                       0, 1, 'L')
-
-        # Add topics
-        self.pdf.set_font('Arial', 'B', 12)
-        self.pdf.cell(190, 10, 'Main Topics', 0, 1, 'L')
-        self.pdf.set_font('Arial', '', 10)
-        for topic in analysis_results['topics']:
-            self.pdf.cell(190, 10,
-                          f"Topic {topic['id']+1}: {', '.join(topic['words'][:5])}",
-                          0, 1, 'L')
-
-        # Add entities
-        self.pdf.set_font('Arial', 'B', 12)
-        self.pdf.cell(190, 10, 'Named Entities', 0, 1, 'L')
-        self.pdf.set_font('Arial', '', 10)
-        for entity_type, entities in analysis_results['entities'].items():
-            if entities:
-                self.pdf.cell(190, 10,
-                              f"{entity_type}: {', '.join(entities[:5])}",
-                              0, 1, 'L')
-
-        # Footer
-        self.pdf.set_y(-15)
-        self.pdf.set_font('Arial', 'I', 8)
-        self.pdf.cell(0, 10, f'Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', 0, 0, 'C')
-        self.pdf.cell(0, 10, 'Created by Muhammad Shaheer', 0, 0, 'R')
-
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-            self.pdf.output(tmp.name)
-            return tmp.name

 def main():
-    st.set_page_config(
+    st.set_page_config(
+        page_title="Enhanced AI Output Analyzer",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )

-    #
+    # Load custom CSS
     st.markdown("""
    <style>
    .main { padding: 2rem; }
-        .stMetric {
-
-
+    .stMetric {
+        background-color: #f0f2f6;
+        padding: 1rem;
+        border-radius: 0.5rem;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    }
+    .entity-tag {
+        background-color: #e9ecef;
+        padding: 0.2rem 0.5rem;
+        border-radius: 0.25rem;
+        margin: 0.2rem;
+        display: inline-block;
+    }
    </style>
    """, unsafe_allow_html=True)

-    #
+    # Initialize session state
+    if 'analysis_history' not in st.session_state:
+        st.session_state.analysis_history = []
+
+    # Sidebar configuration
     with st.sidebar:
-        st.title("Settings")
-
-
-        st.
-        st.
-
-
-
+        st.title("Analysis Settings")
+
+        # Analysis options
+        st.subheader("Analysis Options")
+        num_topics = st.slider("Number of Topics", 2, 10, 3)
+        min_entity_confidence = st.slider("Entity Confidence Threshold", 0.0, 1.0, 0.8)
+        batch_size = st.select_slider(
+            "Processing Batch Size",
+            options=[500, 1000, 2000, 5000],
+            value=1000
+        )

     # Main content
     st.title("Enhanced AI Output Analyzer")
@@ -193,92 +288,117 @@ def main():
     # Input section
     input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])

+    text_processor = TextProcessor()
     if input_method == "File Upload":
-
-
-
-        else:
-            text = ""
+        text = text_processor.process_file_upload(
+            st.file_uploader("Upload a text file", type=['txt'])
+        )
     else:
         text = st.text_area("Enter text to analyze:", height=200)

-    # Analysis
-    if st.button("Analyze", type="primary") and text:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if show_topics:
-            with col2:
-                st.metric("Topics Detected",
-                          len(results['topics']))
-
-        if show_entities:
-            with col3:
-                st.metric("Entities Found",
-                          sum(len(ents) for ents in results['entities'].values()))
-
-        # Detailed results
-        st.subheader("Detailed Analysis")
-
-        tab1, tab2, tab3 = st.tabs(["Sentiment", "Topics", "Entities"])
-
-        with tab1:
-            emotions_df = pd.DataFrame(
-                results['sentiment']['emotions'].items(),
-                columns=['Emotion', 'Score']
-            )
-            st.plotly_chart(
-                px.bar(emotions_df, x='Emotion', y='Score',
-                       title="Emotional Analysis"),
-                use_container_width=True
-            )
-
-        with tab2:
-            for topic in results['topics']:
-                st.write(f"Topic {topic['id']+1}:", ", ".join(topic['words']))
-
-        with tab3:
-            for entity_type, entities in results['entities'].items():
-                if entities:
-                    st.write(f"**{entity_type}:**")
-                    st.write(", ".join(entities))
-
-        # Generate PDF report
-        pdf_generator = PDFGenerator()
-        pdf_path = pdf_generator.generate_report(results)
-
-        with open(pdf_path, "rb") as pdf_file:
-            st.download_button(
-                label="Download Analysis Report (PDF)",
-                data=pdf_file,
-                file_name="analysis_report.pdf",
-                mime="application/pdf"
+    # Analysis section
+    if st.button("Analyze", type="primary") and text_processor.validate_text(text):
+        try:
+            with st.spinner("Performing analysis..."):
+                analyzer = AdvancedAnalyzer()
+
+                # Perform analysis with progress tracking
+                progress_bar = st.progress(0)
+
+                # Sentiment analysis
+                results = {
+                    'sentiment': analyzer.analyze_sentiment_batch(
+                        text, batch_size=batch_size
+                    )
+                }
+                progress_bar.progress(0.33)
+
+                # Topic modeling
+                results['topics'] = analyzer.topic_modeling(
+                    text, num_topics=num_topics
            )
-
-
-
+                progress_bar.progress(0.66)
+
+                # Entity extraction
+                results['entities'] = analyzer.extract_entities(text)
+                progress_bar.progress(1.0)
+
+                # Display results
+                st.success("Analysis complete!")
+
+                # Save to history
+                st.session_state.analysis_history.append({
+                    'timestamp': datetime.now(),
+                    'results': results
+                })
+
+                # Display visualizations
+                display_results(results)
+
+                # Generate report
+                generate_downloadable_report(results)
+
+        except Exception as e:
+            logger.error(f"Error during analysis: {str(e)}")
+            st.error(f"An error occurred during analysis: {str(e)}")
+
+def display_results(results: Dict):
+    """Display analysis results with interactive visualizations"""
+    # Sentiment Analysis
+    st.subheader("Sentiment Analysis")
+    col1, col2 = st.columns(2)

-
-
-
-
-
-
+    with col1:
+        # Sentiment gauge
+        fig = go.Figure(go.Indicator(
+            mode="gauge+number",
+            value=results['sentiment']['compound'],
+            domain={'x': [0, 1], 'y': [0, 1]},
+            gauge={
+                'axis': {'range': [-1, 1]},
+                'bar': {'color': "darkblue"},
+                'steps': [
+                    {'range': [-1, -0.05], 'color': "lightcoral"},
+                    {'range': [-0.05, 0.05], 'color': "lightgray"},
+                    {'range': [0.05, 1], 'color': "lightgreen"}
+                ]
+            }
+        ))
+        st.plotly_chart(fig)
+
+    with col2:
+        # Emotions pie chart
+        emotions_df = pd.DataFrame(
+            results['sentiment']['emotions'].items(),
+            columns=['Emotion', 'Score']
+        )
+        fig = px.pie(
+            emotions_df,
+            values='Score',
+            names='Emotion',
+            title="Emotional Distribution"
+        )
+        st.plotly_chart(fig)
+
+def generate_downloadable_report(results: Dict):
+    """Generate and provide downloadable report"""
+    try:
+        pdf_generator = PDFGenerator()
+        pdf_path = pdf_generator.generate_report(results)
+
+        with open(pdf_path, "rb") as pdf_file:
+            st.download_button(
+                label="📊 Download Analysis Report (PDF)",
+                data=pdf_file,
+                file_name=f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
+                mime="application/pdf"
+            )
+
+        # Clean up
+        os.unlink(pdf_path)
+    except Exception as e:
+        logger.error(f"Error generating downloadable report: {str(e)}")
+        st.error("Failed to generate report. Please try again.")

 if __name__ == "__main__":
     main()
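
Review note on extract_entities: spaCy entity spans do not expose a label_prob attribute, so the confidence filter added in this commit raises AttributeError on the first entity found, and the sidebar's min_entity_confidence slider is never read (the 0.8 threshold is hardcoded). A minimal working sketch, not part of the commit, that drops the unavailable per-entity confidence and keeps the grouping-by-label behavior of the old version:

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Group entities by label; spaCy's stock pipelines expose no per-entity confidence."""
        doc = self.nlp(text)
        entities: Dict[str, List[str]] = {}
        for ent in doc.ents:
            # ent.label_ is the entity type (PERSON, ORG, ...); ent.text is the span
            entities.setdefault(ent.label_, []).append(ent.text)
        return entities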
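Review note on topic_modeling: gensim's LdaModel has no get_topic_coherence() method, so the 'coherence' field raises AttributeError. Coherence in gensim comes from gensim.models.CoherenceModel. A sketch assuming the same lda_model, texts, and dictionary as in the diff above (untested here, and of limited value on the app's single-document corpus):

    from gensim.models import CoherenceModel

    coherence_model = CoherenceModel(
        model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v'
    )
    per_topic = coherence_model.get_coherence_per_topic()  # one score per topic
    topics = [
        {'id': idx, 'words': words, 'coherence': round(score, 4)}
        for (idx, words), score in zip(lda_model.show_topics(formatted=False), per_topic)
    ]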
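Review note on model loading: lru_cache(maxsize=1) on _initialize_models is keyed on self, and main() builds a fresh AdvancedAnalyzer() on every Analyze click, so the spaCy and transformer models are reloaded on each run and the cache never hits. One common Streamlit pattern, sketched here as a suggestion (assumes Streamlit 1.18+ for st.cache_resource):

    @st.cache_resource
    def get_analyzer() -> AdvancedAnalyzer:
        # Constructed once per server process and reused across reruns
        return AdvancedAnalyzer()

    # in main(): analyzer = get_analyzer()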