Spaces:

shaheerawan3
/

AI_OutputAnalyzer

Sleeping

App Files Files Community

shaheerawan3 committed on Dec 23, 2024

Commit

c025acd

verified ·

1 Parent(s): c4a470c

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -351

app.py CHANGED Viewed

@@ -1,115 +1,63 @@
-# config.yaml
-config_yaml = """
-models:
-  spacy: en_core_web_sm
-  sentiment: nlptown/bert-base-multilingual-uncased-sentiment
-analysis:
-  batch_size: 1000
-  min_entity_confidence: 0.8
-  num_topics: 3
-  max_text_length: 50000
-security:
-  max_file_size: 5242880  # 5MB
-  allowed_extensions: [txt]
-logging:
-  level: INFO
-  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-"""
-# utils/config.py
-import yaml
-from pathlib import Path
-def load_config():
-    config_path = Path("config.yaml")
-    if not config_path.exists():
-        with open(config_path, "w") as f:
-            f.write(config_yaml)
-    with open(config_path) as f:
-        return yaml.safe_load(f)
-# utils/logger.py
-import logging
-from typing import Optional
-def setup_logger(name: Optional[str] = None) -> logging.Logger:
     config = load_config()
-    logger = logging.getLogger(name or __name__)
-    logger.setLevel(config['logging']['level'])
-    if not logger.handlers:
-        handler = logging.StreamHandler()
-        formatter = logging.Formatter(config['logging']['format'])
-        handler.setFormatter(formatter)
-        logger.addHandler(handler)
-    return logger
-# text_processing.py
-from typing import Optional, Dict
-import streamlit as st
-from pathlib import Path
-import logging
-logger = setup_logger("text_processing")
-class TextProcessor:
-    def __init__(self, config: Dict):
-        self.config = config
-        self.max_file_size = config['security']['max_file_size']
-        self.allowed_extensions = config['security']['allowed_extensions']
-    def validate_file(self, uploaded_file) -> bool:
-        if uploaded_file is None:
-            return False
-        # Check file size
-        if uploaded_file.size > self.max_file_size:
-            st.error(f"File size exceeds {self.max_file_size/1024/1024}MB limit")
-            return False
-        # Check extension
-        ext = Path(uploaded_file.name).suffix[1:].lower()
-        if ext not in self.allowed_extensions:
-            st.error(f"Unsupported file type. Allowed types: {', '.join(self.allowed_extensions)}")
-            return False
-        return True
-    def process_file_upload(self, uploaded_file) -> Optional[str]:
         try:
-            if not self.validate_file(uploaded_file):
-                return None
-            return uploaded_file.read().decode('utf-8')
         except Exception as e:
-            logger.error(f"Error processing file upload: {str(e)}")
-            st.error(f"Error processing file: {str(e)}")
-            return None
-    @staticmethod
-    def validate_text(text: str, max_length: int) -> bool:
-        if not text or len(text.strip()) == 0:
-            st.error("Please enter some text to analyze")
-            return False
-        if len(text) > max_length:
-            st.error(f"Text exceeds maximum length of {max_length} characters")
-            return False
-        return True
-# analysis.py
 import streamlit as st
-import pandas as pd
 import numpy as np
-from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 import logging
 logger = logging.getLogger(__name__)
@@ -119,52 +67,35 @@ class TopicModeler:
         self.num_topics = num_topics
         self.lemmatizer = WordNetLemmatizer()
         self.vectorizer = CountVectorizer(
-            max_df=0.95,
-            min_df=2,
-            stop_words='english',
-            max_features=1000
         )
         self.lda = LatentDirichletAllocation(
             n_components=num_topics,
-            random_state=42,
-            max_iter=10
         )
     def preprocess_text(self, text):
-        """Preprocess text for topic modeling"""
         try:
-            # Tokenize
             tokens = word_tokenize(text.lower())
-            # Remove stopwords and lemmatize
             stop_words = set(stopwords.words('english'))
             tokens = [
                 self.lemmatizer.lemmatize(token)
                 for token in tokens
                 if token.isalnum() and token not in stop_words
             ]
             return ' '.join(tokens)
         except Exception as e:
             logger.error(f"Error in text preprocessing: {str(e)}")
             raise
     def extract_topics(self, text):
-        """Extract topics using LDA"""
         try:
-            # Preprocess text
             processed_text = self.preprocess_text(text)
-            # Create document-term matrix
             dtm = self.vectorizer.fit_transform([processed_text])
-            # Fit LDA model
             self.lda.fit(dtm)
-            # Get feature names
             feature_names = self.vectorizer.get_feature_names_out()
-            # Extract topics
             topics = []
             for topic_idx, topic in enumerate(self.lda.components_):
                 top_words = [
@@ -176,27 +107,24 @@ class TopicModeler:
                     'words': top_words,
                     'coherence': float(np.mean(topic))
                 })
             return topics
         except Exception as e:
             logger.error(f"Error in topic modeling: {str(e)}")
             raise
 class AdvancedAnalyzer:
-    def __init__(self):
         self.topic_modeler = TopicModeler()
         self._initialize_models()
     @st.cache_resource
     def _initialize_models(self):
-        """Initialize all required models"""
         try:
-            self.sentiment_analyzer = SentimentIntensityAnalyzer()
-            self.nlp = spacy.load('en_core_web_sm')
             self.sentiment_model = pipeline(
                 "sentiment-analysis",
-                model="nlptown/bert-base-multilingual-uncased-sentiment",
                 return_all_scores=True
             )
             logger.info("Models initialized successfully")
@@ -204,34 +132,29 @@ class AdvancedAnalyzer:
             logger.error(f"Error initializing models: {str(e)}")
             raise
-    def analyze_text(self, text: str, num_topics: int = 3) -> Dict:
-        """Complete text analysis pipeline"""
         try:
-            # Update number of topics if needed
             if num_topics != self.topic_modeler.num_topics:
                 self.topic_modeler = TopicModeler(num_topics)
-            # Perform analysis
             results = {
                 'sentiment': self.analyze_sentiment_batch(text),
                 'topics': self.topic_modeler.extract_topics(text),
                 'entities': self.extract_entities(text)
             }
             return results
         except Exception as e:
             logger.error(f"Error in analysis pipeline: {str(e)}")
             raise
-    def analyze_sentiment_batch(self, text: str, batch_size: int = 1000) -> Dict:
-        """Analyze sentiment in batches"""
         sentences = sent_tokenize(text)
         results = []
         with ThreadPoolExecutor() as executor:
             futures = [
-                executor.submit(self.analyze_sentiment, sentence)
                 for sentence in sentences
             ]
             for future in futures:
@@ -244,229 +167,59 @@ class AdvancedAnalyzer:
         if not results:
             raise ValueError("No successful sentiment analysis results")
-        compound = np.mean([r['compound'] for r in results])
-        emotions = {
-            'positive': np.mean([r['emotions']['positive'] for r in results]),
-            'negative': np.mean([r['emotions']['negative'] for r in results]),
-            'neutral': np.mean([r['emotions']['neutral'] for r in results])
         }
-        return {'compound': compound, 'emotions': emotions}
-    # ... rest of the AdvancedAnalyzer methods remain the same ...
-class AdvancedAnalyzer:
-    def __init__(self, config: Dict):
-        self.config = config
-        self._initialize_models()
-    @st.cache_resource
-    def _initialize_models(self):
-        try:
-            self.sentiment_analyzer = SentimentIntensityAnalyzer()
-            self.nlp = spacy.load(self.config['models']['spacy'])
-            self.sentiment_model = pipeline(
-                "sentiment-analysis",
-                model=self.config['models']['sentiment'],
-                return_all_scores=True
-            )
-            logger.info("Models initialized successfully")
-        except Exception as e:
-            logger.error(f"Error initializing models: {str(e)}")
-            raise
-    def analyze_text(self, text: str) -> Dict:
-        """Complete text analysis pipeline"""
-        results = {}
-        # Use st.progress to show analysis progress
-        progress_bar = st.progress(0)
-        status_text = st.empty()
-        try:
-            # Sentiment Analysis
-            status_text.text("Analyzing sentiment...")
-            results['sentiment'] = self.analyze_sentiment_batch(
-                text,
-                self.config['analysis']['batch_size']
-            )
-            progress_bar.progress(0.33)
-            # Topic Modeling
-            status_text.text("Extracting topics...")
-            results['topics'] = self.topic_modeling(
-                text,
-                self.config['analysis']['num_topics']
-            )
-            progress_bar.progress(0.66)
-            # Entity Extraction
-            status_text.text("Identifying entities...")
-            results['entities'] = self.extract_entities(text)
-            progress_bar.progress(1.0)
-            status_text.text("Analysis complete!")
-            return results
-        except Exception as e:
-            logger.error(f"Error in analysis pipeline: {str(e)}")
-            raise
-        finally:
-            progress_bar.empty()
-            status_text.empty()
-    # Rest of the AdvancedAnalyzer methods remain the same...
-# ui.py
-import streamlit as st
-import plotly.graph_objects as go
-import plotly.express as px
-import pandas as pd
-from typing import Dict
-class UI:
-    @staticmethod
-    def setup_page():
-        st.set_page_config(
-            page_title="Enhanced AI Output Analyzer",
-            layout="wide",
-            initial_sidebar_state="expanded"
-        )
-        st.markdown("""
-            <style>
-            .main { padding: 2rem; }
-            .stMetric {
-                background-color: var(--background-color);
-                padding: 1rem;
-                border-radius: 0.5rem;
-                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
             }
-            .entity-tag {
-                background-color: var(--secondary-background-color);
-                padding: 0.2rem 0.5rem;
-                border-radius: 0.25rem;
-                margin: 0.2rem;
-                display: inline-block;
-            }
-            </style>
-        """, unsafe_allow_html=True)
-    @staticmethod
-    def display_results(results: Dict):
-        """Display analysis results with interactive visualizations"""
-        st.subheader("Analysis Results")
-        # Create tabs for different analyses
-        sentiment_tab, topics_tab, entities_tab = st.tabs([
-            "Sentiment Analysis",
-            "Topic Modeling",
-            "Named Entities"
-        ])
-        with sentiment_tab:
-            UI._display_sentiment(results['sentiment'])
-        with topics_tab:
-            UI._display_topics(results['topics'])
-        with entities_tab:
-            UI._display_entities(results['entities'])
-    @staticmethod
-    def _display_sentiment(sentiment_results: Dict):
-        col1, col2 = st.columns(2)
-        with col1:
-            # Sentiment gauge
-            fig = go.Figure(go.Indicator(
-                mode="gauge+number",
-                value=sentiment_results['compound'],
-                domain={'x': [0, 1], 'y': [0, 1]},
-                gauge={
-                    'axis': {'range': [-1, 1]},
-                    'bar': {'color': "darkblue"},
-                    'steps': [
-                        {'range': [-1, -0.05], 'color': "lightcoral"},
-                        {'range': [-0.05, 0.05], 'color': "lightgray"},
-                        {'range': [0.05, 1], 'color': "lightgreen"}
-                    ]
-                }
-            ))
-            st.plotly_chart(fig)
-        with col2:
-            # Emotions pie chart
-            emotions_df = pd.DataFrame(
-                sentiment_results['emotions'].items(),
-                columns=['Emotion', 'Score']
-            )
-            fig = px.pie(
-                emotions_df,
-                values='Score',
-                names='Emotion',
-                title="Emotional Distribution"
-            )
-            st.plotly_chart(fig)
-    # Rest of the UI methods...
-# main.py
-import streamlit as st
-from utils.config import load_config
-from text_processing import TextProcessor
-from analysis import AdvancedAnalyzer
-from ui import UI
-def main():
-    # Load configuration
-    config = load_config()
-    # Setup UI
-    UI.setup_page()
-    # Initialize processors
-    text_processor = TextProcessor(config)
-    analyzer = AdvancedAnalyzer(config)
-    # Sidebar configuration
-    with st.sidebar:
-        st.title("Analysis Settings")
-        config['analysis']['num_topics'] = st.slider(
-            "Number of Topics",
-            2, 10,
-            config['analysis']['num_topics']
-        )
-        config['analysis']['min_entity_confidence'] = st.slider(
-            "Entity Confidence Threshold",
-            0.0, 1.0,
-            config['analysis']['min_entity_confidence']
-        )
-    # Main content
-    st.title("Enhanced AI Output Analyzer")
-    # Input section
-    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
-    if input_method == "File Upload":
-        text = text_processor.process_file_upload(
-            st.file_uploader("Upload a text file", type=['txt'])
-        )
-    else:
-        text = st.text_area("Enter text to analyze:", height=200)
-    # Analysis section
-    if st.button("Analyze", type="primary") and text_processor.validate_text(
-        text, config['analysis']['max_text_length']
-    ):
-        try:
-            with st.spinner("Analyzing text..."):
-                results = analyzer.analyze_text(text)
-                UI.display_results(results)
-        except Exception as e:
-            st.error(f"An error occurred during analysis: {str(e)}")
-if __name__ == "__main__":
-    main()

+# src/main.py
+import streamlit as st
+from utils.config import load_config
+from core.text_processing import TextProcessor
+from core.analysis import AdvancedAnalyzer
+from core.ui import UI
+def main():
     config = load_config()
+    UI.setup_page()
+    text_processor = TextProcessor(config)
+    analyzer = AdvancedAnalyzer(config)
+    with st.sidebar:
+        st.title("Analysis Settings")
+        config['analysis']['num_topics'] = st.slider(
+            "Number of Topics", 2, 10,
+            config['analysis']['num_topics']
+        )
+        config['analysis']['min_entity_confidence'] = st.slider(
+            "Entity Confidence Threshold", 0.0, 1.0,
+            config['analysis']['min_entity_confidence']
+        )
+    st.title("Enhanced AI Output Analyzer")
+    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
+    if input_method == "File Upload":
+        text = text_processor.process_file_upload(
+            st.file_uploader("Upload a text file", type=['txt'])
+        )
+    else:
+        text = st.text_area("Enter text to analyze:", height=200)
+    if st.button("Analyze", type="primary") and text_processor.validate_text(
+        text, config['analysis']['max_text_length']
+    ):
         try:
+            with st.spinner("Analyzing text..."):
+                results = analyzer.analyze_text(text)
+                UI.display_results(results)
         except Exception as e:
+            st.error(f"An error occurred during analysis: {str(e)}")
+if __name__ == "__main__":
+    main()
+# src/core/analysis.py
 import streamlit as st
 import numpy as np
+from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
+from concurrent.futures import ThreadPoolExecutor
+from transformers import pipeline
+import spacy
+from typing import Dict
 import logging
 logger = logging.getLogger(__name__)
         self.num_topics = num_topics
         self.lemmatizer = WordNetLemmatizer()
         self.vectorizer = CountVectorizer(
+            max_df=0.95, min_df=2,
+            stop_words='english', max_features=1000
         )
         self.lda = LatentDirichletAllocation(
             n_components=num_topics,
+            random_state=42, max_iter=10
         )
     def preprocess_text(self, text):
         try:
             tokens = word_tokenize(text.lower())
             stop_words = set(stopwords.words('english'))
             tokens = [
                 self.lemmatizer.lemmatize(token)
                 for token in tokens
                 if token.isalnum() and token not in stop_words
             ]
             return ' '.join(tokens)
         except Exception as e:
             logger.error(f"Error in text preprocessing: {str(e)}")
             raise
     def extract_topics(self, text):
         try:
             processed_text = self.preprocess_text(text)
             dtm = self.vectorizer.fit_transform([processed_text])
             self.lda.fit(dtm)
             feature_names = self.vectorizer.get_feature_names_out()
             topics = []
             for topic_idx, topic in enumerate(self.lda.components_):
                 top_words = [
                     'words': top_words,
                     'coherence': float(np.mean(topic))
                 })
             return topics
         except Exception as e:
             logger.error(f"Error in topic modeling: {str(e)}")
             raise
 class AdvancedAnalyzer:
+    def __init__(self, config):
+        self.config = config
         self.topic_modeler = TopicModeler()
         self._initialize_models()
     @st.cache_resource
     def _initialize_models(self):
         try:
+            self.nlp = spacy.load(self.config['models']['spacy'])
             self.sentiment_model = pipeline(
                 "sentiment-analysis",
+                model=self.config['models']['sentiment'],
                 return_all_scores=True
             )
             logger.info("Models initialized successfully")
             logger.error(f"Error initializing models: {str(e)}")
             raise
+    def analyze_text(self, text: str) -> Dict:
         try:
+            num_topics = self.config['analysis']['num_topics']
             if num_topics != self.topic_modeler.num_topics:
                 self.topic_modeler = TopicModeler(num_topics)
             results = {
                 'sentiment': self.analyze_sentiment_batch(text),
                 'topics': self.topic_modeler.extract_topics(text),
                 'entities': self.extract_entities(text)
             }
             return results
         except Exception as e:
             logger.error(f"Error in analysis pipeline: {str(e)}")
             raise
+    def analyze_sentiment_batch(self, text: str) -> Dict:
         sentences = sent_tokenize(text)
         results = []
         with ThreadPoolExecutor() as executor:
             futures = [
+                executor.submit(self.sentiment_model, sentence)
                 for sentence in sentences
             ]
             for future in futures:
         if not results:
             raise ValueError("No successful sentiment analysis results")
+        # Process and aggregate results
+        scores = np.mean([r[0]['score'] for r in results])
+        return {
+            'score': float(scores),
+            'label': 'positive' if scores > 0.5 else 'negative'
         }
+    def extract_entities(self, text: str) -> list:
+        doc = self.nlp(text)
+        return [
+            {
+                'text': ent.text,
+                'label': ent.label_,
+                'confidence': float(ent._.confidence)
+                if hasattr(ent._, 'confidence') else 1.0
             }
+            for ent in doc.ents
+            if (hasattr(ent._, 'confidence') and
+                ent._.confidence >= self.config['analysis']['min_entity_confidence'])
+        ]
+# src/utils/config.py
+import yaml
+from pathlib import Path
+def load_config():
+    current_dir = Path(__file__).parent.parent.parent
+    config_path = current_dir / "config.yaml"
+    if not config_path.exists():
+        config = {
+            'models': {
+                'spacy': 'en_core_web_sm',
+                'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
+            },
+            'analysis': {
+                'batch_size': 1000,
+                'min_entity_confidence': 0.8,
+                'num_topics': 3,
+                'max_text_length': 50000
+            },
+            'security': {
+                'max_file_size': 5242880,
+                'allowed_extensions': ['txt']
+            },
+            'logging': {
+                'level': 'INFO',
+                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            }
+        }
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f, default_flow_style=False)
+    with open(config_path) as f:
+        return yaml.safe_load(f)