Spaces:

shaheerawan3
/

AI_OutputAnalyzer

Sleeping

App Files Files Community

shaheerawan3 committed on Dec 23, 2024

Commit

e7fcdfa

verified ·

1 Parent(s): 318abcf

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -9

app.py CHANGED Viewed

@@ -102,17 +102,158 @@ class TextProcessor:
         return True
 # analysis.py
-from typing import Dict, List
-import spacy
-from transformers import pipeline
-from nltk.sentiment import SentimentIntensityAnalyzer
-from nltk.tokenize import sent_tokenize
-from gensim import corpora, models
-import numpy as np
-from concurrent.futures import ThreadPoolExecutor
 import streamlit as st
-logger = setup_logger("analysis")
 class AdvancedAnalyzer:
     def __init__(self, config: Dict):

         return True
 # analysis.py
 import streamlit as st
+import pandas as pd
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+import logging
+logger = logging.getLogger(__name__)
+class TopicModeler:
+    """Extract LDA topics from free text with scikit-learn.
+
+    The text is split into sentences and every preprocessed sentence is used
+    as one document. A single document can never satisfy the vectorizer's
+    document-frequency pruning (``min_df=2`` together with ``max_df=0.95``),
+    so fitting on the whole text as one document always raised ``ValueError``.
+    """
+    def __init__(self, num_topics=3, num_top_words=10):
+        """Configure the vectorizer and the LDA model.
+
+        Args:
+            num_topics: Number of LDA components to fit.
+            num_top_words: How many highest-weighted words to report per topic.
+        """
+        self.num_topics = num_topics
+        self.num_top_words = num_top_words
+        self.lemmatizer = WordNetLemmatizer()
+        self.vectorizer = CountVectorizer(
+            max_df=0.95,
+            min_df=2,
+            stop_words='english',
+            max_features=1000
+        )
+        self.lda = LatentDirichletAllocation(
+            n_components=num_topics,
+            random_state=42,
+            max_iter=10
+        )
+        # Filled on first use so that constructing the object does not
+        # require the NLTK stopwords corpus to be available yet.
+        self._stop_words = None
+    def _get_stop_words(self):
+        """Return the English stop-word set, loading it only once."""
+        if self._stop_words is None:
+            self._stop_words = frozenset(stopwords.words('english'))
+        return self._stop_words
+    def preprocess_text(self, text):
+        """Lower-case, tokenize, drop stop words and lemmatize ``text``.
+
+        Returns:
+            The surviving alphanumeric tokens joined by single spaces.
+
+        Raises:
+            Exception: Any tokenizer/corpus error is logged and re-raised.
+        """
+        try:
+            stop_words = self._get_stop_words()
+            tokens = [
+                self.lemmatizer.lemmatize(token)
+                for token in word_tokenize(text.lower())
+                if token.isalnum() and token not in stop_words
+            ]
+            return ' '.join(tokens)
+        except Exception as e:
+            logger.error(f"Error in text preprocessing: {str(e)}")
+            raise
+    def _split_documents(self, text):
+        """Turn ``text`` into a list of non-empty, preprocessed sentences."""
+        # Imported here so the block does not rely on a module-level import.
+        from nltk.tokenize import sent_tokenize
+        documents = (self.preprocess_text(s) for s in sent_tokenize(text))
+        return [doc for doc in documents if doc]
+    def _vectorize(self, documents):
+        """Build the document-term matrix for ``documents``.
+
+        Returns:
+            A ``(vectorizer, matrix)`` pair; the vectorizer is the one that
+            produced the matrix and therefore owns the matching vocabulary.
+        """
+        try:
+            return self.vectorizer, self.vectorizer.fit_transform(documents)
+        except ValueError:
+            # Short inputs cannot meet min_df=2 / max_df=0.95 (too few
+            # documents, or every term pruned). Retry without pruning
+            # instead of failing the whole analysis.
+            params = self.vectorizer.get_params()
+            params.update(min_df=1, max_df=1.0)
+            relaxed = CountVectorizer(**params)
+            return relaxed, relaxed.fit_transform(documents)
+    def extract_topics(self, text):
+        """Extract topics from ``text`` using LDA.
+
+        Returns:
+            A list with one dict per topic: ``'id'`` (topic index),
+            ``'words'`` (top words, highest weight first) and ``'coherence'``.
+            The list is empty when ``text`` contains no usable tokens.
+
+        Raises:
+            Exception: Any vectorizer/LDA error is logged and re-raised.
+        """
+        try:
+            documents = self._split_documents(text)
+            if not documents:
+                logger.warning("No usable tokens found for topic modeling")
+                return []
+            vectorizer, dtm = self._vectorize(documents)
+            self.lda.fit(dtm)
+            feature_names = vectorizer.get_feature_names_out()
+            topics = []
+            for topic_idx, topic in enumerate(self.lda.components_):
+                # Descending order first, then cut: the previous slice
+                # ``[:-10:-1]`` returned only nine words.
+                top_indices = topic.argsort()[::-1][:self.num_top_words]
+                topics.append({
+                    'id': topic_idx,
+                    'words': [feature_names[i] for i in top_indices],
+                    # NOTE: this is the mean component weight of the topic,
+                    # not a true coherence score; the key is kept so that
+                    # existing consumers of the result keep working.
+                    'coherence': float(np.mean(topic))
+                })
+            return topics
+        except Exception as e:
+            logger.error(f"Error in topic modeling: {str(e)}")
+            raise
+class AdvancedAnalyzer:
+    """Run sentiment, topic and entity analysis over a text.
+
+    The heavy models are loaded through a cached, argument-free loader and
+    then attached to each instance, so every instance ends up with
+    ``sentiment_analyzer``, ``nlp`` and ``sentiment_model`` set.
+    """
+    def __init__(self):
+        self.topic_modeler = TopicModeler()
+        self._initialize_models()
+    @staticmethod
+    @st.cache_resource
+    def _load_models():
+        """Load the NLP models and return them as a tuple.
+
+        The cache decorator sits on this argument-free function rather than
+        on a method taking ``self``: a cached call returns the stored result
+        without running the body, so caching a method that only assigns
+        attributes would leave later instances without their models.
+
+        Returns:
+            ``(sentiment_analyzer, nlp, sentiment_model)``.
+        """
+        # Imported here so the loader does not depend on module-level
+        # imports being present.
+        import spacy
+        from nltk.sentiment import SentimentIntensityAnalyzer
+        from transformers import pipeline
+        sentiment_analyzer = SentimentIntensityAnalyzer()
+        nlp = spacy.load('en_core_web_sm')
+        sentiment_model = pipeline(
+            "sentiment-analysis",
+            model="nlptown/bert-base-multilingual-uncased-sentiment",
+            return_all_scores=True
+        )
+        return sentiment_analyzer, nlp, sentiment_model
+    def _initialize_models(self):
+        """Attach the (cached) models to this instance.
+
+        Raises:
+            Exception: Any loading error is logged and re-raised.
+        """
+        try:
+            (
+                self.sentiment_analyzer,
+                self.nlp,
+                self.sentiment_model,
+            ) = self._load_models()
+            logger.info("Models initialized successfully")
+        except Exception as e:
+            logger.error(f"Error initializing models: {str(e)}")
+            raise
+    def analyze_text(self, text: str, num_topics: int = 3) -> dict:
+        """Run the complete analysis pipeline on ``text``.
+
+        Args:
+            text: The text to analyze.
+            num_topics: Number of topics to extract; a new ``TopicModeler``
+                is created when it differs from the current one's setting.
+
+        Returns:
+            A dict with the keys ``'sentiment'``, ``'topics'`` and
+            ``'entities'``.
+
+        Raises:
+            Exception: Any error from a sub-analysis is logged and re-raised.
+        """
+        try:
+            if num_topics != self.topic_modeler.num_topics:
+                self.topic_modeler = TopicModeler(num_topics)
+            return {
+                'sentiment': self.analyze_sentiment_batch(text),
+                'topics': self.topic_modeler.extract_topics(text),
+                'entities': self.extract_entities(text)
+            }
+        except Exception as e:
+            logger.error(f"Error in analysis pipeline: {str(e)}")
+            raise
+    def analyze_sentiment_batch(self, text: str, batch_size: int = 1000) -> dict:
+        """Average per-sentence sentiment over the whole text.
+
+        Each sentence is passed to ``self.analyze_sentiment`` on a thread
+        pool; sentences whose analysis raises are logged and skipped.
+
+        Args:
+            text: The text to analyze.
+            batch_size: Currently unused; kept so existing callers that pass
+                it keep working.
+
+        Returns:
+            ``{'compound': float, 'emotions': {'positive': float,
+            'negative': float, 'neutral': float}}``.
+
+        Raises:
+            ValueError: If no sentence could be analyzed successfully.
+        """
+        # Imported here so the method does not depend on module-level
+        # imports being present.
+        from concurrent.futures import ThreadPoolExecutor
+        from nltk.tokenize import sent_tokenize
+        sentences = sent_tokenize(text)
+        results = []
+        with ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.analyze_sentiment, sentence)
+                for sentence in sentences
+            ]
+            for future in futures:
+                try:
+                    results.append(future.result())
+                except Exception as e:
+                    logger.error(f"Error in sentiment analysis: {str(e)}")
+        if not results:
+            raise ValueError("No successful sentiment analysis results")
+        # Convert NumPy scalars to plain floats, as TopicModeler does for
+        # its 'coherence' value.
+        compound = float(np.mean([r['compound'] for r in results]))
+        emotions = {
+            label: float(np.mean([r['emotions'][label] for r in results]))
+            for label in ('positive', 'negative', 'neutral')
+        }
+        return {'compound': compound, 'emotions': emotions}
+    # ... rest of the AdvancedAnalyzer methods remain the same ...
 class AdvancedAnalyzer:
     def __init__(self, config: Dict):