shaheerawan3 commited on
Commit
c025acd
·
verified ·
1 Parent(s): c4a470c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -351
app.py CHANGED
@@ -1,115 +1,63 @@
1
- # config.yaml
2
- config_yaml = """
3
- models:
4
- spacy: en_core_web_sm
5
- sentiment: nlptown/bert-base-multilingual-uncased-sentiment
6
-
7
- analysis:
8
- batch_size: 1000
9
- min_entity_confidence: 0.8
10
- num_topics: 3
11
- max_text_length: 50000
12
-
13
- security:
14
- max_file_size: 5242880 # 5MB
15
- allowed_extensions: [txt]
16
-
17
- logging:
18
- level: INFO
19
- format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
20
- """
21
-
22
- # utils/config.py
23
- import yaml
24
- from pathlib import Path
25
-
26
- def load_config():
27
- config_path = Path("config.yaml")
28
- if not config_path.exists():
29
- with open(config_path, "w") as f:
30
- f.write(config_yaml)
31
- with open(config_path) as f:
32
- return yaml.safe_load(f)
33
-
34
- # utils/logger.py
35
- import logging
36
- from typing import Optional
37
 
38
- def setup_logger(name: Optional[str] = None) -> logging.Logger:
39
  config = load_config()
40
- logger = logging.getLogger(name or __name__)
41
- logger.setLevel(config['logging']['level'])
42
 
43
- if not logger.handlers:
44
- handler = logging.StreamHandler()
45
- formatter = logging.Formatter(config['logging']['format'])
46
- handler.setFormatter(formatter)
47
- logger.addHandler(handler)
48
 
49
- return logger
50
-
51
- # text_processing.py
52
- from typing import Optional, Dict
53
- import streamlit as st
54
- from pathlib import Path
55
- import logging
56
-
57
- logger = setup_logger("text_processing")
58
-
59
- class TextProcessor:
60
- def __init__(self, config: Dict):
61
- self.config = config
62
- self.max_file_size = config['security']['max_file_size']
63
- self.allowed_extensions = config['security']['allowed_extensions']
64
 
65
- def validate_file(self, uploaded_file) -> bool:
66
- if uploaded_file is None:
67
- return False
68
-
69
- # Check file size
70
- if uploaded_file.size > self.max_file_size:
71
- st.error(f"File size exceeds {self.max_file_size/1024/1024}MB limit")
72
- return False
73
-
74
- # Check extension
75
- ext = Path(uploaded_file.name).suffix[1:].lower()
76
- if ext not in self.allowed_extensions:
77
- st.error(f"Unsupported file type. Allowed types: {', '.join(self.allowed_extensions)}")
78
- return False
79
-
80
- return True
81
 
82
- def process_file_upload(self, uploaded_file) -> Optional[str]:
 
 
83
  try:
84
- if not self.validate_file(uploaded_file):
85
- return None
86
-
87
- return uploaded_file.read().decode('utf-8')
88
-
89
  except Exception as e:
90
- logger.error(f"Error processing file upload: {str(e)}")
91
- st.error(f"Error processing file: {str(e)}")
92
- return None
93
 
94
- @staticmethod
95
- def validate_text(text: str, max_length: int) -> bool:
96
- if not text or len(text.strip()) == 0:
97
- st.error("Please enter some text to analyze")
98
- return False
99
- if len(text) > max_length:
100
- st.error(f"Text exceeds maximum length of {max_length} characters")
101
- return False
102
- return True
103
 
104
- # analysis.py
105
  import streamlit as st
106
- import pandas as pd
107
  import numpy as np
108
- from nltk.tokenize import word_tokenize
109
  from nltk.corpus import stopwords
110
  from nltk.stem import WordNetLemmatizer
111
  from sklearn.feature_extraction.text import CountVectorizer
112
  from sklearn.decomposition import LatentDirichletAllocation
 
 
 
 
113
  import logging
114
 
115
  logger = logging.getLogger(__name__)
@@ -119,52 +67,35 @@ class TopicModeler:
119
  self.num_topics = num_topics
120
  self.lemmatizer = WordNetLemmatizer()
121
  self.vectorizer = CountVectorizer(
122
- max_df=0.95,
123
- min_df=2,
124
- stop_words='english',
125
- max_features=1000
126
  )
127
  self.lda = LatentDirichletAllocation(
128
  n_components=num_topics,
129
- random_state=42,
130
- max_iter=10
131
  )
132
 
133
  def preprocess_text(self, text):
134
- """Preprocess text for topic modeling"""
135
  try:
136
- # Tokenize
137
  tokens = word_tokenize(text.lower())
138
-
139
- # Remove stopwords and lemmatize
140
  stop_words = set(stopwords.words('english'))
141
  tokens = [
142
  self.lemmatizer.lemmatize(token)
143
  for token in tokens
144
  if token.isalnum() and token not in stop_words
145
  ]
146
-
147
  return ' '.join(tokens)
148
  except Exception as e:
149
  logger.error(f"Error in text preprocessing: {str(e)}")
150
  raise
151
 
152
  def extract_topics(self, text):
153
- """Extract topics using LDA"""
154
  try:
155
- # Preprocess text
156
  processed_text = self.preprocess_text(text)
157
-
158
- # Create document-term matrix
159
  dtm = self.vectorizer.fit_transform([processed_text])
160
-
161
- # Fit LDA model
162
  self.lda.fit(dtm)
163
-
164
- # Get feature names
165
  feature_names = self.vectorizer.get_feature_names_out()
166
 
167
- # Extract topics
168
  topics = []
169
  for topic_idx, topic in enumerate(self.lda.components_):
170
  top_words = [
@@ -176,27 +107,24 @@ class TopicModeler:
176
  'words': top_words,
177
  'coherence': float(np.mean(topic))
178
  })
179
-
180
  return topics
181
-
182
  except Exception as e:
183
  logger.error(f"Error in topic modeling: {str(e)}")
184
  raise
185
 
186
  class AdvancedAnalyzer:
187
- def __init__(self):
 
188
  self.topic_modeler = TopicModeler()
189
  self._initialize_models()
190
 
191
  @st.cache_resource
192
  def _initialize_models(self):
193
- """Initialize all required models"""
194
  try:
195
- self.sentiment_analyzer = SentimentIntensityAnalyzer()
196
- self.nlp = spacy.load('en_core_web_sm')
197
  self.sentiment_model = pipeline(
198
  "sentiment-analysis",
199
- model="nlptown/bert-base-multilingual-uncased-sentiment",
200
  return_all_scores=True
201
  )
202
  logger.info("Models initialized successfully")
@@ -204,34 +132,29 @@ class AdvancedAnalyzer:
204
  logger.error(f"Error initializing models: {str(e)}")
205
  raise
206
 
207
- def analyze_text(self, text: str, num_topics: int = 3) -> Dict:
208
- """Complete text analysis pipeline"""
209
  try:
210
- # Update number of topics if needed
211
  if num_topics != self.topic_modeler.num_topics:
212
  self.topic_modeler = TopicModeler(num_topics)
213
 
214
- # Perform analysis
215
  results = {
216
  'sentiment': self.analyze_sentiment_batch(text),
217
  'topics': self.topic_modeler.extract_topics(text),
218
  'entities': self.extract_entities(text)
219
  }
220
-
221
  return results
222
-
223
  except Exception as e:
224
  logger.error(f"Error in analysis pipeline: {str(e)}")
225
  raise
226
 
227
- def analyze_sentiment_batch(self, text: str, batch_size: int = 1000) -> Dict:
228
- """Analyze sentiment in batches"""
229
  sentences = sent_tokenize(text)
230
  results = []
231
 
232
  with ThreadPoolExecutor() as executor:
233
  futures = [
234
- executor.submit(self.analyze_sentiment, sentence)
235
  for sentence in sentences
236
  ]
237
  for future in futures:
@@ -244,229 +167,59 @@ class AdvancedAnalyzer:
244
  if not results:
245
  raise ValueError("No successful sentiment analysis results")
246
 
247
- compound = np.mean([r['compound'] for r in results])
248
- emotions = {
249
- 'positive': np.mean([r['emotions']['positive'] for r in results]),
250
- 'negative': np.mean([r['emotions']['negative'] for r in results]),
251
- 'neutral': np.mean([r['emotions']['neutral'] for r in results])
252
  }
253
-
254
- return {'compound': compound, 'emotions': emotions}
255
-
256
- # ... rest of the AdvancedAnalyzer methods remain the same ...
257
 
258
- class AdvancedAnalyzer:
259
- def __init__(self, config: Dict):
260
- self.config = config
261
- self._initialize_models()
262
-
263
- @st.cache_resource
264
- def _initialize_models(self):
265
- try:
266
- self.sentiment_analyzer = SentimentIntensityAnalyzer()
267
- self.nlp = spacy.load(self.config['models']['spacy'])
268
- self.sentiment_model = pipeline(
269
- "sentiment-analysis",
270
- model=self.config['models']['sentiment'],
271
- return_all_scores=True
272
- )
273
- logger.info("Models initialized successfully")
274
- except Exception as e:
275
- logger.error(f"Error initializing models: {str(e)}")
276
- raise
277
-
278
- def analyze_text(self, text: str) -> Dict:
279
- """Complete text analysis pipeline"""
280
- results = {}
281
-
282
- # Use st.progress to show analysis progress
283
- progress_bar = st.progress(0)
284
- status_text = st.empty()
285
-
286
- try:
287
- # Sentiment Analysis
288
- status_text.text("Analyzing sentiment...")
289
- results['sentiment'] = self.analyze_sentiment_batch(
290
- text,
291
- self.config['analysis']['batch_size']
292
- )
293
- progress_bar.progress(0.33)
294
-
295
- # Topic Modeling
296
- status_text.text("Extracting topics...")
297
- results['topics'] = self.topic_modeling(
298
- text,
299
- self.config['analysis']['num_topics']
300
- )
301
- progress_bar.progress(0.66)
302
-
303
- # Entity Extraction
304
- status_text.text("Identifying entities...")
305
- results['entities'] = self.extract_entities(text)
306
- progress_bar.progress(1.0)
307
-
308
- status_text.text("Analysis complete!")
309
- return results
310
-
311
- except Exception as e:
312
- logger.error(f"Error in analysis pipeline: {str(e)}")
313
- raise
314
- finally:
315
- progress_bar.empty()
316
- status_text.empty()
317
-
318
- # Rest of the AdvancedAnalyzer methods remain the same...
319
-
320
- # ui.py
321
- import streamlit as st
322
- import plotly.graph_objects as go
323
- import plotly.express as px
324
- import pandas as pd
325
- from typing import Dict
326
-
327
- class UI:
328
- @staticmethod
329
- def setup_page():
330
- st.set_page_config(
331
- page_title="Enhanced AI Output Analyzer",
332
- layout="wide",
333
- initial_sidebar_state="expanded"
334
- )
335
-
336
- st.markdown("""
337
- <style>
338
- .main { padding: 2rem; }
339
- .stMetric {
340
- background-color: var(--background-color);
341
- padding: 1rem;
342
- border-radius: 0.5rem;
343
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
344
  }
345
- .entity-tag {
346
- background-color: var(--secondary-background-color);
347
- padding: 0.2rem 0.5rem;
348
- border-radius: 0.25rem;
349
- margin: 0.2rem;
350
- display: inline-block;
351
- }
352
- </style>
353
- """, unsafe_allow_html=True)
354
 
355
- @staticmethod
356
- def display_results(results: Dict):
357
- """Display analysis results with interactive visualizations"""
358
- st.subheader("Analysis Results")
359
-
360
- # Create tabs for different analyses
361
- sentiment_tab, topics_tab, entities_tab = st.tabs([
362
- "Sentiment Analysis",
363
- "Topic Modeling",
364
- "Named Entities"
365
- ])
366
-
367
- with sentiment_tab:
368
- UI._display_sentiment(results['sentiment'])
369
-
370
- with topics_tab:
371
- UI._display_topics(results['topics'])
372
-
373
- with entities_tab:
374
- UI._display_entities(results['entities'])
375
-
376
- @staticmethod
377
- def _display_sentiment(sentiment_results: Dict):
378
- col1, col2 = st.columns(2)
379
-
380
- with col1:
381
- # Sentiment gauge
382
- fig = go.Figure(go.Indicator(
383
- mode="gauge+number",
384
- value=sentiment_results['compound'],
385
- domain={'x': [0, 1], 'y': [0, 1]},
386
- gauge={
387
- 'axis': {'range': [-1, 1]},
388
- 'bar': {'color': "darkblue"},
389
- 'steps': [
390
- {'range': [-1, -0.05], 'color': "lightcoral"},
391
- {'range': [-0.05, 0.05], 'color': "lightgray"},
392
- {'range': [0.05, 1], 'color': "lightgreen"}
393
- ]
394
- }
395
- ))
396
- st.plotly_chart(fig)
397
-
398
- with col2:
399
- # Emotions pie chart
400
- emotions_df = pd.DataFrame(
401
- sentiment_results['emotions'].items(),
402
- columns=['Emotion', 'Score']
403
- )
404
- fig = px.pie(
405
- emotions_df,
406
- values='Score',
407
- names='Emotion',
408
- title="Emotional Distribution"
409
- )
410
- st.plotly_chart(fig)
411
-
412
- # Rest of the UI methods...
413
-
414
- # main.py
415
- import streamlit as st
416
- from utils.config import load_config
417
- from text_processing import TextProcessor
418
- from analysis import AdvancedAnalyzer
419
- from ui import UI
420
 
421
- def main():
422
- # Load configuration
423
- config = load_config()
424
-
425
- # Setup UI
426
- UI.setup_page()
427
-
428
- # Initialize processors
429
- text_processor = TextProcessor(config)
430
- analyzer = AdvancedAnalyzer(config)
431
-
432
- # Sidebar configuration
433
- with st.sidebar:
434
- st.title("Analysis Settings")
435
- config['analysis']['num_topics'] = st.slider(
436
- "Number of Topics",
437
- 2, 10,
438
- config['analysis']['num_topics']
439
- )
440
- config['analysis']['min_entity_confidence'] = st.slider(
441
- "Entity Confidence Threshold",
442
- 0.0, 1.0,
443
- config['analysis']['min_entity_confidence']
444
- )
445
-
446
- # Main content
447
- st.title("Enhanced AI Output Analyzer")
448
-
449
- # Input section
450
- input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
451
 
452
- if input_method == "File Upload":
453
- text = text_processor.process_file_upload(
454
- st.file_uploader("Upload a text file", type=['txt'])
455
- )
456
- else:
457
- text = st.text_area("Enter text to analyze:", height=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
- # Analysis section
460
- if st.button("Analyze", type="primary") and text_processor.validate_text(
461
- text, config['analysis']['max_text_length']
462
- ):
463
- try:
464
- with st.spinner("Analyzing text..."):
465
- results = analyzer.analyze_text(text)
466
- UI.display_results(results)
467
-
468
- except Exception as e:
469
- st.error(f"An error occurred during analysis: {str(e)}")
470
-
471
- if __name__ == "__main__":
472
- main()
 
1
+ # src/main.py
2
+ import streamlit as st
3
+ from utils.config import load_config
4
+ from core.text_processing import TextProcessor
5
+ from core.analysis import AdvancedAnalyzer
6
+ from core.ui import UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
def _configure_sidebar(config):
    # Sidebar sliders write the chosen values straight back into the shared
    # config dict, which the analyzer reads at analysis time.
    with st.sidebar:
        st.title("Analysis Settings")
        config['analysis']['num_topics'] = st.slider(
            "Number of Topics", 2, 10,
            config['analysis']['num_topics']
        )
        config['analysis']['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            config['analysis']['min_entity_confidence']
        )


def _collect_text(text_processor):
    # Returns the text to analyze; may be None when a file upload is absent
    # or fails validation inside process_file_upload.
    choice = st.radio("Choose input method:", ["Text Input", "File Upload"])
    if choice == "File Upload":
        uploaded = st.file_uploader("Upload a text file", type=['txt'])
        return text_processor.process_file_upload(uploaded)
    return st.text_area("Enter text to analyze:", height=200)


def main():
    """Streamlit entry point: wire config, UI, input, and the analysis pipeline."""
    config = load_config()
    UI.setup_page()

    text_processor = TextProcessor(config)
    analyzer = AdvancedAnalyzer(config)

    _configure_sidebar(config)

    st.title("Enhanced AI Output Analyzer")
    text = _collect_text(text_processor)

    # Short-circuit keeps validate_text (and its error messages) from running
    # until the button is actually pressed.
    if st.button("Analyze", type="primary") and text_processor.validate_text(
        text, config['analysis']['max_text_length']
    ):
        try:
            with st.spinner("Analyzing text..."):
                results = analyzer.analyze_text(text)
                UI.display_results(results)
        except Exception as e:
            st.error(f"An error occurred during analysis: {str(e)}")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
48
 
49
+ # src/core/analysis.py
50
  import streamlit as st
 
51
  import numpy as np
52
+ from nltk.tokenize import word_tokenize, sent_tokenize
53
  from nltk.corpus import stopwords
54
  from nltk.stem import WordNetLemmatizer
55
  from sklearn.feature_extraction.text import CountVectorizer
56
  from sklearn.decomposition import LatentDirichletAllocation
57
+ from concurrent.futures import ThreadPoolExecutor
58
+ from transformers import pipeline
59
+ import spacy
60
+ from typing import Dict
61
  import logging
62
 
63
  logger = logging.getLogger(__name__)
 
67
  self.num_topics = num_topics
68
  self.lemmatizer = WordNetLemmatizer()
69
  self.vectorizer = CountVectorizer(
70
+ max_df=0.95, min_df=2,
71
+ stop_words='english', max_features=1000
 
 
72
  )
73
  self.lda = LatentDirichletAllocation(
74
  n_components=num_topics,
75
+ random_state=42, max_iter=10
 
76
  )
77
 
78
def preprocess_text(self, text):
    """Lowercase, tokenize, drop stopwords/punctuation, lemmatize, and rejoin *text*."""
    try:
        stop_words = set(stopwords.words('english'))
        kept = []
        for token in word_tokenize(text.lower()):
            # Keep only alphanumeric tokens that are not English stopwords.
            if token.isalnum() and token not in stop_words:
                kept.append(self.lemmatizer.lemmatize(token))
        return ' '.join(kept)
    except Exception as e:
        logger.error(f"Error in text preprocessing: {str(e)}")
        raise
91
 
92
  def extract_topics(self, text):
 
93
  try:
 
94
  processed_text = self.preprocess_text(text)
 
 
95
  dtm = self.vectorizer.fit_transform([processed_text])
 
 
96
  self.lda.fit(dtm)
 
 
97
  feature_names = self.vectorizer.get_feature_names_out()
98
 
 
99
  topics = []
100
  for topic_idx, topic in enumerate(self.lda.components_):
101
  top_words = [
 
107
  'words': top_words,
108
  'coherence': float(np.mean(topic))
109
  })
 
110
  return topics
 
111
  except Exception as e:
112
  logger.error(f"Error in topic modeling: {str(e)}")
113
  raise
114
 
115
  class AdvancedAnalyzer:
116
def __init__(self, config):
    """Store the shared config, create a default topic modeler, and load models.

    The topic modeler is rebuilt later by analyze_text when the configured
    topic count differs from the current one.
    """
    self.config = config
    self.topic_modeler = TopicModeler()
    self._initialize_models()
120
 
121
  @st.cache_resource
122
  def _initialize_models(self):
 
123
  try:
124
+ self.nlp = spacy.load(self.config['models']['spacy'])
 
125
  self.sentiment_model = pipeline(
126
  "sentiment-analysis",
127
+ model=self.config['models']['sentiment'],
128
  return_all_scores=True
129
  )
130
  logger.info("Models initialized successfully")
 
132
  logger.error(f"Error initializing models: {str(e)}")
133
  raise
134
 
135
def analyze_text(self, text: str) -> Dict:
    """Run the full pipeline (sentiment, topics, entities) over *text*.

    Raises whatever the underlying analysis step raised, after logging it.
    """
    try:
        wanted = self.config['analysis']['num_topics']
        # Rebuild the topic modeler only when the configured topic count changed.
        if wanted != self.topic_modeler.num_topics:
            self.topic_modeler = TopicModeler(wanted)

        return {
            'sentiment': self.analyze_sentiment_batch(text),
            'topics': self.topic_modeler.extract_topics(text),
            'entities': self.extract_entities(text),
        }
    except Exception as e:
        logger.error(f"Error in analysis pipeline: {str(e)}")
        raise
150
 
151
+ def analyze_sentiment_batch(self, text: str) -> Dict:
 
152
  sentences = sent_tokenize(text)
153
  results = []
154
 
155
  with ThreadPoolExecutor() as executor:
156
  futures = [
157
+ executor.submit(self.sentiment_model, sentence)
158
  for sentence in sentences
159
  ]
160
  for future in futures:
 
167
  if not results:
168
  raise ValueError("No successful sentiment analysis results")
169
 
170
+ # Process and aggregate results
171
+ scores = np.mean([r[0]['score'] for r in results])
172
+ return {
173
+ 'score': float(scores),
174
+ 'label': 'positive' if scores > 0.5 else 'negative'
175
  }
 
 
 
 
176
 
177
def extract_entities(self, text: str) -> list:
    """Run spaCy NER over *text* and return entities above the configured threshold.

    Returns a list of {'text', 'label', 'confidence'} dicts. Entities whose
    pipeline provides no per-entity confidence are treated as confidence 1.0.
    """
    doc = self.nlp(text)
    threshold = self.config['analysis']['min_entity_confidence']
    entities = []
    for ent in doc.ents:
        # Stock spaCy pipelines expose no `ent._.confidence` extension, so the
        # original filter (which required hasattr) dropped every entity and its
        # `else 1.0` fallback was unreachable. Treat a missing score as 1.0 and
        # apply the threshold uniformly instead.
        score = float(ent._.confidence) if hasattr(ent._, 'confidence') else 1.0
        if score >= threshold:
            entities.append({
                'text': ent.text,
                'label': ent.label_,
                'confidence': score,
            })
    return entities
 
 
 
 
 
190
 
191
+ # src/utils/config.py
192
+ import yaml
193
+ from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
def load_config():
    """Load config.yaml from the project root, writing defaults on first run.

    Assumes this module lives at src/utils/config.py, so three `.parent` hops
    land on the project root — TODO confirm against the actual layout.
    """
    project_root = Path(__file__).parent.parent.parent
    config_path = project_root / "config.yaml"

    if not config_path.exists():
        defaults = {
            'models': {
                'spacy': 'en_core_web_sm',
                'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment',
            },
            'analysis': {
                'batch_size': 1000,
                'min_entity_confidence': 0.8,
                'num_topics': 3,
                'max_text_length': 50000,
            },
            'security': {
                'max_file_size': 5242880,  # 5 MB
                'allowed_extensions': ['txt'],
            },
            'logging': {
                'level': 'INFO',
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            },
        }
        with open(config_path, 'w') as f:
            yaml.dump(defaults, f, default_flow_style=False)

    # Always read back from disk so user edits to config.yaml take effect.
    with open(config_path) as f:
        return yaml.safe_load(f)