shaheerawan3 commited on
Commit
e7fcdfa
·
verified ·
1 Parent(s): 318abcf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -9
app.py CHANGED
@@ -102,17 +102,158 @@ class TextProcessor:
102
  return True
103
 
104
  # analysis.py
105
- from typing import Dict, List
106
- import spacy
107
- from transformers import pipeline
108
- from nltk.sentiment import SentimentIntensityAnalyzer
109
- from nltk.tokenize import sent_tokenize
110
- from gensim import corpora, models
111
- import numpy as np
112
- from concurrent.futures import ThreadPoolExecutor
113
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- logger = setup_logger("analysis")
116
 
117
  class AdvancedAnalyzer:
118
  def __init__(self, config: Dict):
 
102
  return True
103
 
104
  # analysis.py
 
 
 
 
 
 
 
 
105
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Dict

import numpy as np
import pandas as pd
import spacy
import streamlit as st
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
114
+
115
+ logger = logging.getLogger(__name__)
116
+
117
class TopicModeler:
    """Extract topics from a text using LDA over a bag-of-words model.

    NOTE(review): the LDA is fit on a single document (the whole input
    text), and CountVectorizer(min_df=2) will raise on inputs where no
    term occurs at least twice — confirm the intended corpus size.
    """

    def __init__(self, num_topics=3):
        self.num_topics = num_topics
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = CountVectorizer(
            max_df=0.95,
            min_df=2,
            stop_words='english',
            max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42,  # fixed seed so topics are reproducible across runs
            max_iter=10
        )

    def preprocess_text(self, text):
        """Preprocess text for topic modeling.

        Lowercases, tokenizes, drops stopwords and non-alphanumeric
        tokens, and lemmatizes; returns the cleaned tokens re-joined
        into one space-separated string.

        Raises:
            Whatever the NLTK calls raise (e.g. LookupError when the
            punkt/stopwords/wordnet corpora are not downloaded).
        """
        try:
            # Tokenize
            tokens = word_tokenize(text.lower())

            # Remove stopwords and lemmatize
            stop_words = set(stopwords.words('english'))
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in stop_words
            ]

            return ' '.join(tokens)
        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            raise

    def extract_topics(self, text):
        """Extract topics using LDA.

        Returns:
            list[dict]: one dict per topic with keys 'id' (topic index),
            'words' (the 10 highest-weighted terms), and 'coherence'
            (mean component weight — NOTE(review): this is not a true
            topic-coherence metric, kept under the original key for
            backward compatibility).
        """
        try:
            # Preprocess text
            processed_text = self.preprocess_text(text)

            # Create document-term matrix (single-document "corpus")
            dtm = self.vectorizer.fit_transform([processed_text])

            # Fit LDA model
            self.lda.fit(dtm)

            # Get feature names
            feature_names = self.vectorizer.get_feature_names_out()

            # Extract topics
            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                # [:-11:-1] yields the 10 top-weighted terms in descending
                # order; the original [:-10:-1] was off by one and only
                # returned 9 (sklearn's pattern is [:-n_top_words - 1:-1]).
                top_words = [
                    feature_names[i]
                    for i in topic.argsort()[:-11:-1]
                ]
                topics.append({
                    'id': topic_idx,
                    'words': top_words,
                    'coherence': float(np.mean(topic))
                })

            return topics

        except Exception as e:
            logger.error(f"Error in topic modeling: {str(e)}")
            raise
185
+
186
class AdvancedAnalyzer:
    """Orchestrates sentiment, topic, and entity analysis of a text."""

    def __init__(self):
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    def _initialize_models(self):
        """Initialize all required models (VADER, spaCy, transformer pipeline).

        FIX(review): the original decorated this method with
        @st.cache_resource. On an instance method that decorator hashes
        `self` (pinning every instance in Streamlit's global cache) and
        caches the None return value, while the actual work here is
        mutating `self` — so a cache hit would leave a new instance with
        no models at all. The decorator is removed; loading is
        per-instance, exactly as __init__ already implied.

        Raises:
            Any exception from model loading (logged, then re-raised).
        """
        try:
            self.sentiment_analyzer = SentimentIntensityAnalyzer()
            self.nlp = spacy.load('en_core_web_sm')
            # NOTE(review): return_all_scores is deprecated in newer
            # transformers releases (top_k=None is the modern spelling);
            # kept as-is for compatibility with the pinned version —
            # confirm before upgrading the dependency.
            self.sentiment_model = pipeline(
                "sentiment-analysis",
                model="nlptown/bert-base-multilingual-uncased-sentiment",
                return_all_scores=True
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def analyze_text(self, text: str, num_topics: int = 3) -> Dict:
        """Complete text analysis pipeline.

        Runs batch sentiment, topic extraction, and entity extraction,
        rebuilding the TopicModeler only when num_topics changed.

        Returns:
            Dict with keys 'sentiment', 'topics', 'entities'.
        """
        try:
            # Update number of topics if needed
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)

            # Perform analysis (extract_entities is defined elsewhere
            # in this class — see the trailing marker below)
            results = {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }

            return results

        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise

    def analyze_sentiment_batch(self, text: str, batch_size: int = 1000) -> Dict:
        """Analyze sentiment sentence-by-sentence and average the scores.

        NOTE(review): batch_size is currently unused (kept for interface
        stability); sentences are submitted to the thread pool one at a
        time. Per-sentence failures are logged and skipped.

        Returns:
            Dict with 'compound' (mean compound score) and 'emotions'
            (mean positive/negative/neutral scores).

        Raises:
            ValueError: if every per-sentence analysis failed.
        """
        sentences = sent_tokenize(text)
        results = []

        with ThreadPoolExecutor() as executor:
            # analyze_sentiment is defined elsewhere in this class
            futures = [
                executor.submit(self.analyze_sentiment, sentence)
                for sentence in sentences
            ]
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Error in sentiment analysis: {str(e)}")
                    continue

        if not results:
            raise ValueError("No successful sentiment analysis results")

        compound = np.mean([r['compound'] for r in results])
        emotions = {
            'positive': np.mean([r['emotions']['positive'] for r in results]),
            'negative': np.mean([r['emotions']['negative'] for r in results]),
            'neutral': np.mean([r['emotions']['neutral'] for r in results])
        }

        return {'compound': compound, 'emotions': emotions}

    # ... rest of the AdvancedAnalyzer methods remain the same ...
257
 
258
  class AdvancedAnalyzer:
259
  def __init__(self, config: Dict):