entropy25 commited on
Commit
3670fc5
·
verified ·
1 Parent(s): 797f9f3

Upload 10 files

Browse files
Files changed (10) hide show
  1. Dockerfile +34 -0
  2. README.md +512 -10
  3. analyzer.py +218 -0
  4. app.py +284 -0
  5. config.py +33 -0
  6. docker-compose.yml +42 -0
  7. models.py +49 -0
  8. requirements.txt +13 -0
  9. utils.py +157 -0
  10. visualizer.py +184 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies (curl is also required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

# Bind Gradio to all interfaces so the app is reachable through the published
# port even with a plain `docker run -p 7860:7860` (docker-compose already sets
# these variables, but a standalone container would otherwise listen only on
# 127.0.0.1 inside the container and the port mapping would not work).
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# Expose port
EXPOSE 7860

# Health check
HEALTHCHECK CMD curl --fail http://localhost:7860 || exit 1

# Run the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,512 @@
1
- ---
2
- title: Sentiment Analyzer3
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: Sentiment Analysis docker
2
+
3
+ emoji: 📊
4
+
5
+ colorFrom: gray
6
+
7
+ colorTo: gray
8
+
9
+ sdk: gradio
10
+
11
+ sdk_version: 5.34.1
12
+
13
+ app_file: app.py
14
+
15
+ pinned: false
16
+
17
+ license: mit
18
+
19
+ short_description: sentiment-analysis
20
+
21
+
22
+
23
+ \# 🎬 AI Movie Sentiment Analyzer
24
+
25
+
26
+
27
+ A sophisticated sentiment analysis application for movie reviews using advanced deep learning techniques with BERT, LIME, and SHAP explanations.
28
+
29
+
30
+
31
+ \## Features
32
+
33
+
34
+
35
+ \- \*\*Fast Sentiment Analysis\*\*: Quick movie review sentiment classification
36
+
37
+ \- \*\*Advanced Explanations\*\*: LIME and SHAP-based word importance analysis
38
+
39
+ \- \*\*Batch Processing\*\*: Analyze multiple reviews simultaneously
40
+
41
+ \- \*\*Interactive Visualizations\*\*: Charts, gauges, word clouds, and heatmaps
42
+
43
+ \- \*\*History Tracking\*\*: Keep track of all analyses with trend visualization
44
+
45
+ \- \*\*Data Export\*\*: Export results in CSV and JSON formats
46
+
47
+ \- \*\*File Upload Support\*\*: Process CSV and text files
48
+
49
+ \- \*\*Multiple Themes\*\*: Customizable color themes for visualizations
50
+
51
+
52
+
53
+ \## Project Structure
54
+
55
+
56
+
57
+ ```
58
+
59
+ sentiment\\\_analyzer/
60
+
61
+ ├── config.py # Configuration management
62
+
63
+ ├── models.py # Model loading and management
64
+
65
+ ├── analyzer.py # Core sentiment analysis logic
66
+
67
+ ├── visualizer.py # Visualization components
68
+
69
+ ├── utils.py # Utility functions and data handling
70
+
71
+ ├── app.py # Gradio interface and main application
72
+
73
+ ├── requirements.txt # Python dependencies
74
+
75
+ ├── Dockerfile # Docker container configuration
76
+
77
+ ├── docker-compose.yml # Docker Compose setup
78
+
79
+ └── README.md # Project documentation
80
+
81
+ ```
82
+
83
+
84
+
85
+ \## Installation
86
+
87
+
88
+
89
+ \### Local Installation
90
+
91
+
92
+
93
+ 1\. \*\*Clone the repository\*\*
94
+
95
+   ```bash
96
+
97
+   git clone <repository-url>
98
+
99
+   cd sentiment\_analyzer
100
+
101
+   ```
102
+
103
+
104
+
105
+ 2\. \*\*Create virtual environment\*\*
106
+
107
+   ```bash
108
+
109
+   python -m venv venv
110
+
111
+   source venv/bin/activate # On Windows: venv\\Scripts\\activate
112
+
113
+   ```
114
+
115
+
116
+
117
+ 3\. \*\*Install dependencies\*\*
118
+
119
+   ```bash
120
+
121
+   pip install -r requirements.txt
122
+
123
+   ```
124
+
125
+
126
+
127
+ 4\. \*\*Run the application\*\*
128
+
129
+   ```bash
130
+
131
+   python app.py
132
+
133
+   ```
134
+
135
+
136
+
137
+ \### Docker Installation
138
+
139
+
140
+
141
+ 1\. \*\*Using Docker Compose (Recommended)\*\*
142
+
143
+   ```bash
144
+
145
+   docker-compose up --build
146
+
147
+   ```
148
+
149
+
150
+
151
+ 2\. \*\*Using Docker directly\*\*
152
+
153
+   ```bash
154
+
155
+   docker build -t sentiment-analyzer .
156
+
157
+   docker run -p 7860:7860 sentiment-analyzer
158
+
159
+   ```
160
+
161
+
162
+
163
+ \## Usage
164
+
165
+
166
+
167
+ \### Web Interface
168
+
169
+
170
+
171
+ 1\. Open your browser and navigate to `http://localhost:7860`
172
+
173
+ 2\. Choose from three main tabs:
174
+
175
+   - \*\*Quick Analysis\*\*: Fast sentiment analysis with basic visualizations
176
+
177
+   - \*\*Advanced Analysis\*\*: Deep analysis with LIME/SHAP explanations
178
+
179
+   - \*\*Batch Analysis\*\*: Process multiple reviews at once
180
+
181
+
182
+
183
+ \### API Usage
184
+
185
+
186
+
187
+ The application can be extended to provide API endpoints for programmatic access.
188
+
189
+
190
+
191
+ \## Configuration
192
+
193
+
194
+
195
+ Modify `config.py` to customize:
196
+
197
+
198
+
199
+ \- \*\*Model Settings\*\*: Batch sizes, text length limits
200
+
201
+ \- \*\*Visualization\*\*: Figure sizes, color themes
202
+
203
+ \- \*\*Processing\*\*: Cache sizes, stop words
204
+
205
+ \- \*\*History\*\*: Maximum history size
206
+
207
+
208
+
209
+ \## Model Information
210
+
211
+
212
+
213
+ \- \*\*Base Model\*\*: BERT (entropy25/sentimentanalysis)
214
+
215
+ \- \*\*Classes\*\*: Positive, Negative
216
+
217
+ \- \*\*Explanation Methods\*\*: LIME, SHAP
218
+
219
+ \- \*\*Supported Languages\*\*: English
220
+
221
+
222
+
223
+ \## Features Detail
224
+
225
+
226
+
227
+ \### Quick Analysis
228
+
229
+ \- Fast sentiment classification
230
+
231
+ \- Confidence scoring
232
+
233
+ \- Probability visualization
234
+
235
+ \- Word cloud generation
236
+
237
+
238
+
239
+ \### Advanced Analysis
240
+
241
+ \- LIME-based word importance
242
+
243
+ \- SHAP value calculation
244
+
245
+ \- Interactive heatmap visualization
246
+
247
+ \- Detailed explanations
248
+
249
+
250
+
251
+ \### Batch Processing
252
+
253
+ \- CSV/TXT file upload
254
+
255
+ \- Bulk sentiment analysis
256
+
257
+ \- Comprehensive result visualization
258
+
259
+ \- Progress tracking
260
+
261
+
262
+
263
+ \### History \& Export
264
+
265
+ \- Analysis history tracking
266
+
267
+ \- Trend visualization
268
+
269
+ \- CSV/JSON export
270
+
271
+ \- Data persistence
272
+
273
+
274
+
275
+ \## Performance
276
+
277
+
278
+
279
+ \- \*\*GPU Support\*\*: Automatic CUDA detection
280
+
281
+ \- \*\*Memory Management\*\*: Efficient batch processing
282
+
283
+ \- \*\*Caching\*\*: LRU cache for text processing
284
+
285
+ \- \*\*Resource Optimization\*\*: Context managers for memory cleanup
286
+
287
+
288
+
289
+ \## Dependencies
290
+
291
+
292
+
293
+ \### Core Dependencies
294
+
295
+ \- `torch`: Deep learning framework
296
+
297
+ \- `transformers`: BERT model implementation
298
+
299
+ \- `gradio`: Web interface framework
300
+
301
+
302
+
303
+ \### Analysis \& Visualization
304
+
305
+ \- `lime`: Local interpretable model explanations
306
+
307
+ \- `shap`: Shapley additive explanations
308
+
309
+ \- `matplotlib`: Plotting and visualization
310
+
311
+ \- `wordcloud`: Word cloud generation
312
+
313
+
314
+
315
+ \### Data Processing
316
+
317
+ \- `pandas`: Data manipulation
318
+
319
+ \- `numpy`: Numerical computing
320
+
321
+
322
+
323
+ \## Development
324
+
325
+
326
+
327
+ \### Adding New Features
328
+
329
+
330
+
331
+ 1\. \*\*New Analyzers\*\*: Add to `analyzer.py`
332
+
333
+ 2\. \*\*Visualizations\*\*: Extend `visualizer.py`
334
+
335
+ 3\. \*\*UI Components\*\*: Modify `app.py`
336
+
337
+ 4\. \*\*Configuration\*\*: Update `config.py`
338
+
339
+
340
+
341
+ \### Testing
342
+
343
+
344
+
345
+ ```bash
346
+
347
+ \\# Run tests (if implemented)
348
+
349
+ python -m pytest tests/
350
+
351
+
352
+
353
+ \\# Manual testing
354
+
355
+ python -c "from analyzer import SentimentEngine; engine = SentimentEngine(); print(engine.analyze_single_fast('Great movie!'))"
356
+
357
+ ```
358
+
359
+
360
+
361
+ \## Deployment
362
+
363
+
364
+
365
+ \### Production Deployment
366
+
367
+
368
+
369
+ 1\. \*\*Environment Variables\*\*
370
+
371
+   ```bash
372
+
373
+   export GRADIO\_SERVER\_NAME=0.0.0.0
374
+
375
+   export GRADIO\_SERVER\_PORT=7860
376
+
377
+   ```
378
+
379
+
380
+
381
+ 2\. \*\*Resource Requirements\*\*
382
+
383
+   - CPU: 2+ cores recommended
384
+
385
+   - RAM: 4GB+ recommended
386
+
387
+   - GPU: Optional (CUDA support)
388
+
389
+
390
+
391
+ 3\. \*\*Monitoring\*\*
392
+
393
+   - Health checks included in Docker setup
394
+
395
+   - Logging configured for production use
396
+
397
+
398
+
399
+ \## Troubleshooting
400
+
401
+
402
+
403
+ \### Common Issues
404
+
405
+
406
+
407
+ 1\. \*\*CUDA Out of Memory\*\*
408
+
409
+   - Reduce `BATCH_PROCESSING_SIZE` in config
410
+
411
+   - Use CPU-only mode
412
+
413
+
414
+
415
+ 2\. \*\*Model Loading Errors\*\*
416
+
417
+   - Check internet connection
418
+
419
+   - Verify Hugging Face model availability
420
+
421
+
422
+
423
+ 3\. \*\*File Processing Issues\*\*
424
+
425
+   - Ensure proper file encoding (UTF-8 recommended)
426
+
427
+   - Check CSV format and column structure
428
+
429
+
430
+
431
+ \### Performance Optimization
432
+
433
+
434
+
435
+ \- Use GPU if available
436
+
437
+ \- Adjust batch sizes based on available memory
438
+
439
+ \- Enable caching for repeated analyses
440
+
441
+ \- Use Docker for consistent performance
442
+
443
+
444
+
445
+ \## Contributing
446
+
447
+
448
+
449
+ 1\. Fork the repository
450
+
451
+ 2\. Create a feature branch
452
+
453
+ 3\. Make your changes
454
+
455
+ 4\. Add tests if applicable
456
+
457
+ 5\. Submit a pull request
458
+
459
+
460
+
461
+ \## License
462
+
463
+
464
+
465
+ This project is licensed under the MIT License - see the LICENSE file for details.
466
+
467
+
468
+
469
+ \## Acknowledgments
470
+
471
+
472
+
473
+ \- Hugging Face for BERT model hosting
474
+
475
+ \- LIME and SHAP libraries for explainable AI
476
+
477
+ \- Gradio for the intuitive web interface
478
+
479
+ \- The open-source community for various dependencies
480
+
481
+
482
+
483
+ \## Support
484
+
485
+
486
+
487
+ For issues and questions:
488
+
489
+ 1\. Check the troubleshooting section
490
+
491
+ 2\. Review existing GitHub issues
492
+
493
+ 3\. Create a new issue with detailed information
494
+
495
+
496
+
497
+ \## Changelog
498
+
499
+
500
+
501
+ \### v1.0.0
502
+
503
+ \- Initial release with core functionality
504
+
505
+ \- BERT-based sentiment analysis
506
+
507
+ \- LIME and SHAP explanations
508
+
509
+ \- Gradio web interface
510
+
511
+ \- Docker support
512
+
analyzer.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import re
3
+ import logging
4
+ from typing import List, Dict, Tuple
5
+ from functools import lru_cache
6
+ from lime.lime_text import LimeTextExplainer
7
+
8
+ from config import config
9
+ from models import ModelManager
10
+ from utils import handle_errors
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class TextProcessor:
    """Lightweight, cached text-normalisation helpers."""

    @staticmethod
    @lru_cache(maxsize=config.CACHE_SIZE)
    def clean_text(text: str) -> Tuple[str, ...]:
        """Lowercase *text*, keep alphanumeric words of 3+ characters,
        and drop configured stop words.

        Returns a tuple (immutable and hashable) so results can live in
        the lru_cache.
        """
        tokens = re.findall(r'\b\w{3,}\b', text.lower())
        kept = [token for token in tokens if token not in config.STOP_WORDS]
        return tuple(kept)
24
+
25
+
26
class SentimentEngine:
    """Sentiment-analysis engine combining a BERT classifier with two
    explanation strategies.

    - LIME: perturbation-based explanations via ``lime.lime_text``.
    - "SHAP": a leave-one-word-out occlusion score (see
      ``extract_key_words_shap``) — it does not use the shap library.

    Probability index convention throughout: ``probs[0]`` = Negative,
    ``probs[1]`` = Positive.
    """
    def __init__(self):
        # ModelManager lazily loads tokenizer/model on first property access.
        self.model_manager = ModelManager()
        self.lime_explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])
        # NOTE(review): never assigned or read elsewhere in this class —
        # appears to be an unused placeholder.
        self.shap_explainer = None

    def predict_proba(self, texts):
        """Return softmax class probabilities, shape (n_texts, 2).

        Accepts a single string or a list of strings; LIME calls this with
        batches of perturbed samples.
        """
        if isinstance(texts, str):
            texts = [texts]

        inputs = self.model_manager.tokenizer(
            texts, return_tensors="pt", padding=True,
            truncation=True, max_length=config.MAX_TEXT_LENGTH
        ).to(self.model_manager.device)

        with torch.no_grad():
            outputs = self.model_manager.model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

        return probs

    @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0})
    def analyze_single_fast(self, text: str) -> Dict:
        """Classify one text quickly, without keyword extraction.

        Returns a dict with 'sentiment', 'confidence', 'pos_prob',
        'neg_prob'. Blank input raises ValueError, which the
        handle_errors decorator converts to the default return.
        """
        if not text.strip():
            raise ValueError("Empty text")

        probs = self.predict_proba([text])[0]
        sentiment = "Positive" if probs[1] > probs[0] else "Negative"

        return {
            'sentiment': sentiment,
            'confidence': float(probs.max()),
            'pos_prob': float(probs[1]),
            'neg_prob': float(probs[0])
        }

    def extract_key_words_lime(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Return the top-k influential words per LIME as (word, |weight|).

        NOTE(review): abs() discards the direction of influence, so these
        scores cannot distinguish positive from negative contributions.
        Returns [] on any explainer failure (logged).
        """
        try:
            explanation = self.lime_explainer.explain_instance(
                text, self.predict_proba, num_features=top_k, num_samples=200
            )

            word_scores = []
            for word, score in explanation.as_list():
                if len(word.strip()) >= config.MIN_WORD_LENGTH:
                    word_scores.append((word.strip().lower(), abs(score)))

            word_scores.sort(key=lambda x: x[1], reverse=True)
            return word_scores[:top_k]

        except Exception as e:
            logger.error(f"LIME extraction failed: {e}")
            return []

    def extract_key_words_shap(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Return the top-k influential words by leave-one-word-out occlusion.

        Despite the name, this does not use the shap library: each word's
        importance is |P(pos | full text) - P(pos | text minus that word)|.
        Costs one model forward pass per word, so it is O(n_words) in
        inference calls. Returns [] on failure (logged).
        """
        try:
            # Simple SHAP-style implementation using model predictions only.
            words = text.split()
            word_scores = []

            # Get baseline prediction
            baseline_prob = self.predict_proba([text])[0][1]  # Positive probability

            # Calculate importance by removing each word
            for i, word in enumerate(words):
                # Create text without this word
                modified_words = words[:i] + words[i+1:]
                modified_text = ' '.join(modified_words)

                if modified_text.strip():
                    modified_prob = self.predict_proba([modified_text])[0][1]
                    importance = abs(baseline_prob - modified_prob)

                    clean_word = re.sub(r'[^\w]', '', word.lower())
                    if len(clean_word) >= config.MIN_WORD_LENGTH:
                        word_scores.append((clean_word, importance))

            # Remove duplicates (keep the max score per word) and sort
            unique_scores = {}
            for word, score in word_scores:
                if word in unique_scores:
                    unique_scores[word] = max(unique_scores[word], score)
                else:
                    unique_scores[word] = score

            sorted_scores = sorted(unique_scores.items(), key=lambda x: x[1], reverse=True)
            return sorted_scores[:top_k]

        except Exception as e:
            logger.error(f"SHAP extraction failed: {e}")
            return []

    def create_heatmap_html(self, text: str, word_scores: Dict[str, float]) -> str:
        """Render *text* as inline HTML, shading each word by its score.

        Positive scores shade green, negative shade red, unknown/zero stay
        transparent. Words are matched against word_scores by their
        punctuation-stripped, lowercased form.

        NOTE(review): callers pass LIME scores already run through abs(),
        so the red (negative) branch is unreachable in practice — confirm
        whether signed scores were intended here.
        """
        words = text.split()
        html_parts = ['<div style="font-family: Arial; font-size: 16px; line-height: 1.6;">']

        if word_scores:
            max_score = max(abs(score) for score in word_scores.values())
            min_score = min(word_scores.values())
        else:
            max_score = min_score = 0

        for word in words:
            clean_word = re.sub(r'[^\w]', '', word.lower())
            score = word_scores.get(clean_word, 0)

            if score > 0:
                # Scale green intensity by score relative to the strongest word.
                intensity = min(255, int(180 * (score / max_score) if max_score > 0 else 0))
                color = f"rgba(0, {intensity}, 0, 0.3)"
            elif score < 0:
                intensity = min(255, int(180 * (abs(score) / abs(min_score)) if min_score < 0 else 0))
                color = f"rgba({intensity}, 0, 0, 0.3)"
            else:
                color = "transparent"

            html_parts.append(
                f'<span style="background-color: {color}; padding: 2px; margin: 1px; '
                f'border-radius: 3px;" title="Score: {score:.3f}">{word}</span> '
            )

        html_parts.append('</div>')
        return ''.join(html_parts)

    @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'lime_words': [], 'shap_words': [], 'heatmap_html': ''})
    def analyze_single_advanced(self, text: str) -> Dict:
        """Classify one text and attach LIME/SHAP keywords plus an HTML heatmap.

        Extends the analyze_single_fast result dict with 'lime_words',
        'shap_words' and 'heatmap_html' (heatmap built from LIME scores).
        """
        if not text.strip():
            raise ValueError("Empty text")

        probs = self.predict_proba([text])[0]
        sentiment = "Positive" if probs[1] > probs[0] else "Negative"

        # Extract key words using both LIME and SHAP
        lime_words = self.extract_key_words_lime(text)
        shap_words = self.extract_key_words_shap(text)

        # Create heatmap HTML using LIME results
        word_scores_dict = dict(lime_words)
        heatmap_html = self.create_heatmap_html(text, word_scores_dict)

        return {
            'sentiment': sentiment,
            'confidence': float(probs.max()),
            'pos_prob': float(probs[1]),
            'neg_prob': float(probs[0]),
            'lime_words': lime_words,
            'shap_words': shap_words,
            'heatmap_html': heatmap_html
        }

    @handle_errors(default_return=[])
    def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
        """Classify many texts in mini-batches of config.BATCH_PROCESSING_SIZE.

        Silently truncates the input to config.BATCH_SIZE_LIMIT texts.
        progress_callback, if given, receives a fraction in (0, 1] after
        each mini-batch. Each result carries a 50-char 'text' preview plus
        the untruncated 'full_text'.
        """
        if len(texts) > config.BATCH_SIZE_LIMIT:
            texts = texts[:config.BATCH_SIZE_LIMIT]

        results = []
        batch_size = config.BATCH_PROCESSING_SIZE

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]

            if progress_callback:
                progress_callback((i + len(batch)) / len(texts))

            inputs = self.model_manager.tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=config.MAX_TEXT_LENGTH
            ).to(self.model_manager.device)

            with torch.no_grad():
                outputs = self.model_manager.model(**inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

            for text, prob in zip(batch, probs):
                sentiment = "Positive" if prob[1] > prob[0] else "Negative"

                results.append({
                    'text': text[:50] + '...' if len(text) > 50 else text,
                    'full_text': text,
                    'sentiment': sentiment,
                    'confidence': float(prob.max()),
                    'pos_prob': float(prob[1]),
                    'neg_prob': float(prob[0])
                })

        return results
app.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import logging
4
+ from collections import Counter
5
+
6
+ from config import config
7
+ from analyzer import SentimentEngine
8
+ from visualizer import PlotFactory, ThemeContext
9
+ from utils import HistoryManager, DataHandler, handle_errors, managed_figure
10
+
11
+
12
class SentimentApp:
    """Main application orchestrator.

    Wires the inference engine, the history store and the data handler to
    the Gradio callbacks defined below. Every callback is wrapped in
    @handle_errors, so UI actions never raise; failures surface as the
    decorator's default return values instead.
    """

    def __init__(self):
        self.engine = SentimentEngine()
        self.history = HistoryManager()
        self.data_handler = DataHandler()

        # Sample reviews for the UI's click-to-fill gr.Examples widgets
        # (each entry is a one-element list: one value per input component).
        self.examples = [
            ["While the film's visual effects were undeniably impressive, the story lacked emotional weight, and the pacing felt inconsistent throughout."],
            ["An extraordinary achievement in filmmaking — the direction was masterful, the script was sharp, and every performance added depth and realism."],
            ["Despite a promising start, the film quickly devolved into a series of clichés, with weak character development and an ending that felt rushed and unearned."],
            ["A beautifully crafted story with heartfelt moments and a soundtrack that perfectly captured the emotional tone of each scene."],
            ["The movie was far too long, with unnecessary subplots and dull dialogue that made it difficult to stay engaged until the end."]
        ]

    @handle_errors(default_return=("Please enter text", None, None, None))
    def analyze_single_fast(self, text: str, theme: str = 'default'):
        """Quick-analysis callback.

        Returns (result text, probability bar plot, confidence gauge,
        word-cloud plot) for the Quick Analysis tab.
        """
        if not text.strip():
            return "Please enter text", None, None, None

        result = self.engine.analyze_single_fast(text)

        # Record in history: truncated preview plus full text plus scores.
        self.history.add({
            'text': text[:100],
            'full_text': text,
            **result
        })

        theme_ctx = ThemeContext(theme)
        # Order matters: [negative, positive] is what create_sentiment_bars expects.
        probs = np.array([result['neg_prob'], result['pos_prob']])

        prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
        gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
        cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)

        result_text = f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})"

        return result_text, prob_plot, gauge_plot, cloud_plot

    @handle_errors(default_return=("Please enter text", None, None, None))
    def analyze_single_advanced(self, text: str, theme: str = 'default'):
        """Advanced-analysis callback with LIME and SHAP explanations.

        Returns (result text, LIME keyword chart, SHAP keyword chart,
        heatmap HTML) for the Advanced Analysis tab.
        """
        if not text.strip():
            return "Please enter text", None, None, None

        result = self.engine.analyze_single_advanced(text)

        self.history.add({
            'text': text[:100],
            'full_text': text,
            **result
        })

        theme_ctx = ThemeContext(theme)

        lime_plot = PlotFactory.create_lime_keyword_chart(result['lime_words'], result['sentiment'], theme_ctx)
        shap_plot = PlotFactory.create_shap_keyword_chart(result['shap_words'], result['sentiment'], theme_ctx)

        # Summarise the five strongest words from each explainer in the text box.
        lime_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['lime_words'][:5]])
        shap_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['shap_words'][:5]])

        result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
                       f"LIME Key Words: {lime_words_str}\n"
                       f"SHAP Key Words: {shap_words_str}")

        return result_text, lime_plot, shap_plot, result['heatmap_html']

    @handle_errors(default_return=None)
    def analyze_batch(self, reviews: str, progress=None):
        """Batch callback: expects one review per line, at least two lines.

        Returns a summary plot, or None for empty/too-small input.
        """
        if not reviews.strip():
            return None

        texts = [r.strip() for r in reviews.split('\n') if r.strip()]
        if len(texts) < 2:
            return None

        results = self.engine.analyze_batch(texts, progress)

        # Batch results are added to history individually.
        for result in results:
            self.history.add(result)

        theme_ctx = ThemeContext('default')
        return PlotFactory.create_batch_analysis(results, theme_ctx)

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Render sentiment/confidence trends over the stored history.

        Returns (matplotlib figure or None, status string). Needs at
        least two recorded analyses to draw a trend.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        # NOTE(review): the figure is returned from inside the managed_figure
        # context — confirm managed_figure does not close the figure on
        # __exit__, otherwise Gradio receives a dead figure.
        with managed_figure(figsize=(12, 8)) as fig:
            gs = fig.add_gridspec(2, 1, hspace=0.3)

            indices = list(range(len(history)))
            pos_probs = [item['pos_prob'] for item in history]
            confs = [item['confidence'] for item in history]

            # Sentiment trend: points colored by side of the 0.5 line.
            ax1 = fig.add_subplot(gs[0, 0])
            colors = [theme_ctx.colors['pos'] if p > 0.5 else theme_ctx.colors['neg']
                      for p in pos_probs]
            ax1.scatter(indices, pos_probs, c=colors, alpha=0.7, s=60)
            ax1.plot(indices, pos_probs, alpha=0.5, linewidth=2)
            ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
            ax1.set_title('Sentiment History')
            ax1.set_ylabel('Positive Probability')
            ax1.grid(True, alpha=0.3)

            # Confidence trend
            ax2 = fig.add_subplot(gs[1, 0])
            ax2.bar(indices, confs, alpha=0.7, color='lightblue', edgecolor='navy')
            ax2.set_title('Confidence Over Time')
            ax2.set_xlabel('Analysis Number')
            ax2.set_ylabel('Confidence')
            ax2.grid(True, alpha=0.3)

            fig.tight_layout()
            return fig, f"History: {len(history)} analyses"
136
+
137
+
138
def create_interface():
    """Build the Gradio Blocks UI and wire it to a fresh SentimentApp.

    Four tabs: Quick Analysis, Advanced Analysis (LIME/SHAP), Batch
    Analysis (file upload or pasted lines), and History & Export.
    Returns the un-launched gr.Blocks object.
    """
    app = SentimentApp()

    with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
        gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
        gr.Markdown("Fast sentiment analysis with advanced deep learning explanations")

        with gr.Tab("Quick Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Movie Review",
                        placeholder="Enter your movie review...",
                        lines=5
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("Analyze", variant="primary")
                        theme_selector = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    gr.Examples(
                        examples=app.examples,
                        inputs=text_input
                    )

                with gr.Column():
                    result_output = gr.Textbox(label="Result", lines=3)

            with gr.Row():
                prob_plot = gr.Plot(label="Probabilities")
                gauge_plot = gr.Plot(label="Confidence")

            with gr.Row():
                wordcloud_plot = gr.Plot(label="Word Cloud")

        with gr.Tab("Advanced Analysis"):
            with gr.Row():
                with gr.Column():
                    adv_text_input = gr.Textbox(
                        label="Movie Review",
                        placeholder="Enter your movie review for deep analysis...",
                        lines=5
                    )
                    with gr.Row():
                        adv_analyze_btn = gr.Button("Deep Analyze", variant="primary")
                        adv_theme_selector = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    gr.Examples(
                        examples=app.examples,
                        inputs=adv_text_input
                    )

                with gr.Column():
                    adv_result_output = gr.Textbox(label="Analysis Result", lines=4)

            with gr.Row():
                lime_plot = gr.Plot(label="LIME: Key Contributing Words")
                shap_plot = gr.Plot(label="SHAP: Key Contributing Words")

            with gr.Row():
                heatmap_output = gr.HTML(label="Word Importance Heatmap (LIME-based)")

        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(label="Upload File", file_types=[".csv", ".txt"])
                    batch_input = gr.Textbox(
                        label="Reviews (one per line)",
                        lines=8
                    )

                with gr.Column():
                    load_btn = gr.Button("Load File")
                    batch_btn = gr.Button("Analyze Batch", variant="primary")

            batch_plot = gr.Plot(label="Batch Results")

        with gr.Tab("History & Export"):
            with gr.Row():
                refresh_btn = gr.Button("Refresh")
                clear_btn = gr.Button("Clear", variant="stop")

            with gr.Row():
                csv_btn = gr.Button("Export CSV")
                json_btn = gr.Button("Export JSON")

            history_status = gr.Textbox(label="Status")
            history_plot = gr.Plot(label="History Trends")
            csv_file = gr.File(label="CSV Download", visible=True)
            json_file = gr.File(label="JSON Download", visible=True)

        # Event bindings for Quick Analysis
        analyze_btn.click(
            app.analyze_single_fast,
            inputs=[text_input, theme_selector],
            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
        )

        # Event bindings for Advanced Analysis
        adv_analyze_btn.click(
            app.analyze_single_advanced,
            inputs=[adv_text_input, adv_theme_selector],
            outputs=[adv_result_output, lime_plot, shap_plot, heatmap_output]
        )

        # Event bindings for Batch Analysis
        load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
        batch_btn.click(app.analyze_batch, inputs=batch_input, outputs=batch_plot)

        # Event bindings for History & Export.
        # NOTE(review): refresh reads theme_selector from the Quick Analysis
        # tab, not a selector on this tab — confirm that is intended.
        refresh_btn.click(
            lambda theme: app.plot_history(theme),
            inputs=theme_selector,
            outputs=[history_plot, history_status]
        )

        # NOTE(review): relies on HistoryManager.clear() returning the number
        # of removed entries — verify against utils.py.
        clear_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status
        )

        # export_data is expected to return (file path, status message).
        csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_file, history_status]
        )

        json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_file, history_status]
        )

    return demo
278
+
279
+
280
# Application Entry Point
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo = create_interface()
    # NOTE(review): share=True opens a public Gradio tunnel; for the Docker
    # deployment described in the README (localhost:7860) this is unnecessary
    # and requires outbound internet access — confirm it is intended.
    demo.launch(share=True)
config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass
from typing import ClassVar, Dict, Tuple
3
+
4
+
5
@dataclass
class Config:
    """Central application configuration.

    A single shared instance (``config`` below) is imported throughout the
    app. Per-instance settings are dataclass fields; the shared constant
    tables are annotated ``ClassVar`` so the dataclass machinery explicitly
    leaves them as class attributes (previously they were unannotated,
    which had the same effect but only implicitly).
    """

    # Analysis limits
    MAX_HISTORY_SIZE: int = 1000       # entries retained by HistoryManager
    BATCH_SIZE_LIMIT: int = 50         # hard cap on texts per batch request
    MAX_TEXT_LENGTH: int = 512         # tokenizer truncation length
    MIN_WORD_LENGTH: int = 2           # shortest word kept in keyword charts
    CACHE_SIZE: int = 128              # lru_cache size for text cleaning
    BATCH_PROCESSING_SIZE: int = 8     # texts per model forward pass

    # Visualization settings (inches, matplotlib figsize)
    FIGURE_SIZE_SINGLE: Tuple[int, int] = (8, 5)
    FIGURE_SIZE_BATCH: Tuple[int, int] = (12, 8)
    WORDCLOUD_SIZE: Tuple[int, int] = (10, 5)

    # Color themes for plots: positive/negative hex colors per theme name.
    THEMES: ClassVar[Dict[str, Dict[str, str]]] = {
        'default': {'pos': '#4ecdc4', 'neg': '#ff6b6b'},
        'ocean': {'pos': '#0077be', 'neg': '#ff6b35'},
        'forest': {'pos': '#228b22', 'neg': '#dc143c'},
        'sunset': {'pos': '#ff8c00', 'neg': '#8b0000'}
    }

    # Words ignored during keyword extraction / word clouds.
    STOP_WORDS: ClassVar[set] = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
        'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should'
    }


# Shared singleton imported by the rest of the application.
config = Config()
docker-compose.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose definition for the sentiment-analyzer Gradio app.
version: '3.8'  # NOTE(review): the top-level `version` key is obsolete and ignored by Compose v2

services:
  sentiment-analyzer:
    build: .
    ports:
      - "7860:7860"  # Gradio UI
    environment:
      - PYTHONPATH=/app
      - GRADIO_SERVER_NAME=0.0.0.0  # listen on all interfaces inside the container
      - GRADIO_SERVER_PORT=7860
    volumes:
      - ./data:/app/data  # persisted datasets
      - ./logs:/app/logs  # persisted logs
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7860"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s  # grace period for first-boot model download
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  # Optional: Add Redis for caching (uncomment if needed)
  # redis:
  #   image: redis:7-alpine
  #   ports:
  #     - "6379:6379"
  #   command: redis-server --appendonly yes
  #   volumes:
  #     - redis_data:/data
  #   restart: unless-stopped

# volumes:
#   redis_data:
models.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ from transformers import BertTokenizer, BertForSequenceClassification
4
+
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class ModelManager:
    """Singleton that defers model/tokenizer loading until first use.

    All state lives on the class, so every ModelManager() call returns
    the same instance and shares one loaded model.
    """

    _instance = None
    _model = None
    _tokenizer = None
    _device = None

    def __new__(cls):
        # Create the shared instance on first call, reuse it afterwards.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @property
    def device(self):
        """Torch device, resolved lazily (CUDA when available)."""
        if self._device is None:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return self._device

    @property
    def model(self):
        """Sequence-classification model; triggers a one-time load."""
        if self._model is None:
            self._load_model()
        return self._model

    @property
    def tokenizer(self):
        """Tokenizer paired with the model; loads both on first access."""
        if self._tokenizer is None:
            self._load_model()
        return self._tokenizer

    def _load_model(self):
        """Fetch tokenizer and model from the hub and move the model onto the device."""
        try:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self._tokenizer = BertTokenizer.from_pretrained("entropy25/sentimentanalysis")
            self._model = BertForSequenceClassification.from_pretrained("entropy25/sentimentanalysis")
            self._model.to(self._device)
            logger.info(f"Model loaded on {self._device}")
        except Exception as exc:
            logger.error(f"Model loading failed: {exc}")
            raise
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model inference
torch>=1.9.0
transformers>=4.20.0
# Web UI
gradio>=3.45.0
# Visualization
matplotlib>=3.5.0
numpy>=1.21.0
wordcloud>=1.9.0
pandas>=1.3.0
# Explainability
lime>=0.2.0
shap>=0.41.0
scikit-learn>=1.0.0
# Misc utilities
Pillow>=8.3.0
requests>=2.25.0
tqdm>=4.62.0
utils.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import pandas as pd
3
+ import csv
4
+ import json
5
+ import tempfile
6
+ import gc
7
+ import logging
8
+ from datetime import datetime
9
+ from functools import wraps
10
+ from contextlib import contextmanager
11
+ from typing import List, Dict, Optional, Tuple, Any, Callable
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # Decorators and Context Managers
18
def handle_errors(default_return=None):
    """Decorator factory: log any exception and return a fallback value.

    When *default_return* is None, failures instead yield the string
    "Error: <message>" so callers always receive something displayable.
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                logger.error(f"{func.__name__} failed: {exc}")
                if default_return is not None:
                    return default_return
                return f"Error: {exc}"
        return wrapper
    return decorator
30
+
31
+
32
@contextmanager
def managed_figure(*args, **kwargs):
    """Context manager for matplotlib figures to prevent memory leaks.

    Yields a freshly created figure; on exit (even when the body raises)
    the figure is closed and a GC pass is forced so pyplot's global
    figure registry does not accumulate open figures.
    """
    fig = plt.figure(*args, **kwargs)
    try:
        yield fig
    finally:
        # Always release the figure, then collect, so repeated plotting
        # in a long-running server keeps a flat memory profile.
        plt.close(fig)
        gc.collect()
41
+
42
+
43
class HistoryManager:
    """In-memory, size-capped store of past analysis results."""

    def __init__(self):
        self._history: List[Dict] = []

    def add(self, entry: Dict):
        """Store *entry* stamped with the current time, trimming old items."""
        from config import config  # local import avoids a circular dependency
        stamped = dict(entry)
        stamped['timestamp'] = datetime.now().isoformat()
        self._history.append(stamped)
        # Keep only the newest MAX_HISTORY_SIZE entries.
        excess = len(self._history) - config.MAX_HISTORY_SIZE
        if excess > 0:
            del self._history[:excess]

    def get_all(self) -> List[Dict]:
        """Return a shallow copy of all stored entries."""
        return list(self._history)

    def clear(self) -> int:
        """Drop every entry and report how many were removed."""
        removed = len(self._history)
        del self._history[:]
        return removed

    def size(self) -> int:
        """Current number of stored entries."""
        return len(self._history)
64
+
65
+
66
class DataHandler:
    """Handles all data operations: exporting history, reading uploads."""

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Export *data* to a temporary CSV or JSON file.

        Args:
            data: history entries (dicts with timestamp/text/sentiment/
                confidence/pos_prob/neg_prob keys; missing keys default).
            format_type: 'csv' or 'json'.

        Returns:
            (file path, status message), or (None, message) when there is
            nothing to export or the format is not supported.
        """
        if not data:
            return None, "No data to export"
        if format_type not in ('csv', 'json'):
            # Fix: an unknown format previously created an empty temp file
            # and reported "Exported N entries" anyway.
            return None, f"Unsupported export format: {format_type}"

        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                suffix=f'.{format_type}', encoding='utf-8')
        try:
            if format_type == 'csv':
                writer = csv.writer(temp_file)
                writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob'])
                for entry in data:
                    writer.writerow([
                        entry.get('timestamp', ''),
                        entry.get('text', ''),
                        entry.get('sentiment', ''),
                        f"{entry.get('confidence', 0):.4f}",
                        f"{entry.get('pos_prob', 0):.4f}",
                        f"{entry.get('neg_prob', 0):.4f}"
                    ])
            else:  # json
                json.dump(data, temp_file, indent=2, ensure_ascii=False)
        finally:
            # Close even when writing fails so the handle is not leaked.
            temp_file.close()
        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Extract newline-separated review texts from an uploaded file.

        CSV uploads: tries several encodings, heuristically selects the
        column that looks most like free text, and keeps rows longer than
        10 characters. Other uploads are returned verbatim after trying
        several encodings. Returns an "Error: ..." string on failure.
        """
        if not file:
            return ""

        try:
            file_path = file.name

            if file_path.endswith('.csv'):
                for encoding in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding)

                        # A column qualifies as "text" when >70% of its first
                        # 10 non-null values are strings longer than 10 chars.
                        text_columns = []
                        for col in df.columns:
                            sample_values = df[col].dropna().head(10)
                            if len(sample_values) > 0:
                                text_count = sum(1 for val in sample_values
                                                 if isinstance(val, str) and len(str(val).strip()) > 10)
                                if text_count > len(sample_values) * 0.7:
                                    text_columns.append(col)

                        # Fall back to the first column when nothing qualifies.
                        selected_column = text_columns[0] if text_columns else df.columns[0]

                        reviews = df[selected_column].dropna().astype(str).tolist()

                        cleaned_reviews = []
                        for review in reviews:
                            review = review.strip()
                            # Drop short rows and pandas' stringified NaNs.
                            if len(review) > 10 and review.lower() != 'nan':
                                cleaned_reviews.append(review)

                        if cleaned_reviews:
                            logger.info(f"Successfully read {len(cleaned_reviews)} reviews from CSV")
                            return '\n'.join(cleaned_reviews)

                    except Exception:
                        # Wrong encoding or parse failure: try the next one.
                        continue

                return "Error: Could not read CSV file. Please check the file format and encoding."

            else:
                for encoding in ['utf-8', 'latin-1', 'cp1252']:
                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            content = f.read().strip()
                        if content:
                            return content
                    except Exception:
                        continue

                return "Error: Could not read text file. Please check the file encoding."

        except Exception as e:
            logger.error(f"File processing error: {e}")
            return f"Error processing file: {str(e)}"
visualizer.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ from wordcloud import WordCloud
4
+ from collections import Counter
5
+ from typing import List, Dict, Tuple, Optional
6
+ import gc
7
+
8
+ from config import config
9
+ from utils import handle_errors, managed_figure
10
+
11
+
12
class ThemeContext:
    """Resolve a theme name into its positive/negative color pair."""

    def __init__(self, theme: str = 'default'):
        self.theme = theme
        # Unknown theme names silently fall back to the default palette.
        fallback = config.THEMES['default']
        self.colors = config.THEMES.get(theme, fallback)
17
+
18
+
19
class PlotFactory:
    """Factory for creating plots with proper memory management.

    Every figure is built inside ``managed_figure`` so pyplot's registry
    releases it after it is handed back; every public method is wrapped
    in ``handle_errors`` and returns None instead of raising.
    """

    @staticmethod
    @handle_errors(default_return=None)
    def create_sentiment_bars(probs: np.ndarray, theme: ThemeContext) -> plt.Figure:
        """Create sentiment probability bars.

        Args:
            probs: probabilities ordered [negative, positive].
            theme: color theme context.
        """
        with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
            ax = fig.add_subplot(111)
            labels = ["Negative", "Positive"]
            colors = [theme.colors['neg'], theme.colors['pos']]

            bars = ax.bar(labels, probs, color=colors, alpha=0.8)
            ax.set_title("Sentiment Probabilities", fontweight='bold')
            ax.set_ylabel("Probability")
            ax.set_ylim(0, 1)

            # Annotate each bar with its exact probability.
            for bar, prob in zip(bars, probs):
                ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.02,
                        f'{prob:.3f}', ha='center', va='bottom', fontweight='bold')

            fig.tight_layout()
            return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_confidence_gauge(confidence: float, sentiment: str, theme: ThemeContext) -> plt.Figure:
        """Create a semi-circular confidence gauge with a needle."""
        with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
            ax = fig.add_subplot(111)

            # Half-disc background: negative hue on the left, positive on the right.
            theta = np.linspace(0, np.pi, 100)
            colors = [theme.colors['neg'] if i < 50 else theme.colors['pos'] for i in range(100)]

            for i in range(len(theta)-1):
                ax.fill_between([theta[i], theta[i+1]], [0, 0], [0.8, 0.8],
                                color=colors[i], alpha=0.7)

            # Needle: midpoint +/- up to 0.4*pi scaled by confidence.
            pos = np.pi * (0.5 + (0.4 if sentiment == 'Positive' else -0.4) * confidence)
            ax.plot([pos, pos], [0, 0.6], 'k-', linewidth=6)
            ax.plot(pos, 0.6, 'ko', markersize=10)

            ax.set_xlim(0, np.pi)
            ax.set_ylim(0, 1)
            ax.set_title(f'{sentiment} - Confidence: {confidence:.3f}', fontweight='bold')
            ax.set_xticks([0, np.pi/2, np.pi])
            ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
            ax.axis('off')

            fig.tight_layout()
            return fig

    @staticmethod
    def _keyword_chart(word_scores: List[Tuple[str, float]], sentiment: str,
                       theme: ThemeContext, xlabel: str, title: str) -> plt.Figure:
        """Shared horizontal bar chart used by the LIME and SHAP views."""
        with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
            ax = fig.add_subplot(111)

            words = [word for word, score in word_scores]
            scores = [score for word, score in word_scores]
            color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']

            bars = ax.barh(range(len(words)), scores, color=color, alpha=0.7)
            ax.set_yticks(range(len(words)))
            ax.set_yticklabels(words)
            ax.set_xlabel(xlabel)
            ax.set_title(title, fontweight='bold')

            # Print each score just past the end of its bar.
            for bar, score in zip(bars, scores):
                ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2.,
                        f'{score:.3f}', ha='left', va='center', fontsize=9)

            ax.invert_yaxis()  # highest-ranked word on top
            ax.grid(axis='x', alpha=0.3)
            fig.tight_layout()
            return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_lime_keyword_chart(lime_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
        """Create horizontal bar chart for LIME key contributing words."""
        if not lime_words:
            return None
        return PlotFactory._keyword_chart(lime_words, sentiment, theme,
                                          'LIME Attention Weight',
                                          f'LIME: Top Contributing Words ({sentiment})')

    @staticmethod
    @handle_errors(default_return=None)
    def create_shap_keyword_chart(shap_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
        """Create horizontal bar chart for SHAP key contributing words."""
        if not shap_words:
            return None
        return PlotFactory._keyword_chart(shap_words, sentiment, theme,
                                          'SHAP Value',
                                          f'SHAP: Top Contributing Words ({sentiment})')

    @staticmethod
    @handle_errors(default_return=None)
    def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
        """Create word cloud; returns None for texts shorter than 3 words."""
        if len(text.split()) < 3:
            return None

        colormap = 'Greens' if sentiment == 'Positive' else 'Reds'
        wc = WordCloud(width=800, height=400, background_color='white',
                       colormap=colormap, max_words=30).generate(text)

        with managed_figure(figsize=config.WORDCLOUD_SIZE) as fig:
            ax = fig.add_subplot(111)
            ax.imshow(wc, interpolation='bilinear')
            ax.axis('off')
            ax.set_title(f'{sentiment} Word Cloud', fontweight='bold')
            fig.tight_layout()
            return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_batch_analysis(results: List[Dict], theme: ThemeContext) -> plt.Figure:
        """Create comprehensive batch visualization (pie, histogram, scatter)."""
        with managed_figure(figsize=config.FIGURE_SIZE_BATCH) as fig:
            gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)

            # Sentiment distribution. Colors are keyed by label: previously
            # the first slice always got the positive color, even when it
            # represented 'Negative'.
            ax1 = fig.add_subplot(gs[0, 0])
            sent_counts = Counter([r['sentiment'] for r in results])
            color_by_label = {'Positive': theme.colors['pos'], 'Negative': theme.colors['neg']}
            pie_colors = [color_by_label.get(label, 'gray') for label in sent_counts.keys()]
            ax1.pie(sent_counts.values(), labels=sent_counts.keys(),
                    autopct='%1.1f%%', colors=pie_colors)
            ax1.set_title('Sentiment Distribution')

            # Confidence histogram
            ax2 = fig.add_subplot(gs[0, 1])
            confs = [r['confidence'] for r in results]
            ax2.hist(confs, bins=8, alpha=0.7, color='skyblue', edgecolor='black')
            ax2.set_title('Confidence Distribution')
            ax2.set_xlabel('Confidence')

            # Positive probability per review, in input order.
            ax3 = fig.add_subplot(gs[1, :])
            pos_probs = [r['pos_prob'] for r in results]
            indices = range(len(results))
            colors_scatter = [theme.colors['pos'] if r['sentiment'] == 'Positive'
                              else theme.colors['neg'] for r in results]
            ax3.scatter(indices, pos_probs, c=colors_scatter, alpha=0.7, s=60)
            ax3.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)  # decision boundary
            ax3.set_title('Sentiment Progression')
            ax3.set_xlabel('Review Index')
            ax3.set_ylabel('Positive Probability')

            return fig