Upload 3 files

- Multimodal_Text_Analytics.ipynb  +1346 -0
- app.py  +1271 -0
- requirements.txt  +34 -0

Multimodal_Text_Analytics.ipynb (ADDED)
@@ -0,0 +1,1346 @@
# Text Data Analysis AI Assistant with Gradio
 - Intelligent Customer Feedback Analysis System with Multiple AI APIs
In [1]:
# ===== IMPORTS SECTION =====
# Core libraries
import os
import warnings
warnings.filterwarnings('ignore')

# Environment and API
from dotenv import load_dotenv
from anthropic import Anthropic

# Additional AI APIs
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None

try:
    from groq import Groq
except ImportError:
    Groq = None

try:
    import google.generativeai as genai
except ImportError:
    genai = None

# Data processing
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import gc  # For garbage collection

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import re
from collections import Counter

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Web interface
import gradio as gr

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # New tokenizer format
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)  # For WordNet lemmatizer
nltk.download('brown', quiet=True)  # Required for TextBlob

# Download TextBlob corpora
try:
    from textblob import download_corpora
    download_corpora.main()
except:
    # Alternative method if the above doesn't work
    import subprocess
    import sys
    try:
        subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
                       capture_output=True, text=True, timeout=30)
    except:
        print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
        print("Please run: python -m textblob.download_corpora")

Output (stderr):
[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/fola-ai/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to /Users/fola-ai/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /Users/fola-ai/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /Users/fola-ai/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.

Output (stdout):
Finished.
In [2]:
load_dotenv(override=True)

Out[2]:
True
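Not part of the uploaded notebook: a minimal check, under the assumption that load_dotenv() has just run, of which provider keys are visible. The variable names are the ones the AIModelManager cell below reads; the values come from your own .env file.

# Illustrative sketch only. Prints whether each expected API key is present.
import os

for key in ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "DEEPSEEK_API_KEY",
            "GROQ_API_KEY", "GOOGLE_API_KEY"]:
    print(f"{key}: {'set' if os.getenv(key) else 'not set'}")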
In [3]:
# ===== SMART COLUMN DETECTOR =====
class SmartColumnDetector:
    """Intelligently detect and extract relevant columns from uploaded data"""

    def __init__(self):
        # Keywords for detecting different column types
        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
                              'response', 'opinion', 'message', 'notes', 'remarks']
        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
                            'reference', 'index', 'uuid']
        self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
                                 'category', 'brand', 'name', 'sku']
        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']

    def detect_column_types(self, df):
        """Detect column types based on column names and content"""
        detected = {
            'text_columns': [],
            'id_columns': [],
            'product_columns': [],
            'date_columns': [],
            'other_columns': []
        }

        for col in df.columns:
            col_lower = col.lower()

            # Check for text columns
            if any(keyword in col_lower for keyword in self.text_keywords):
                detected['text_columns'].append(col)
            # Check for ID columns
            elif any(keyword in col_lower for keyword in self.id_keywords):
                detected['id_columns'].append(col)
            # Check for product columns
            elif any(keyword in col_lower for keyword in self.product_keywords):
                detected['product_columns'].append(col)
            # Check for date columns
            elif any(keyword in col_lower for keyword in self.date_keywords):
                detected['date_columns'].append(col)
            else:
                # Analyze content to determine type
                sample = df[col].dropna().head(100)
                if len(sample) > 0:
                    # Check if mostly text
                    if df[col].dtype == 'object':
                        avg_length = sample.astype(str).str.len().mean()
                        if avg_length > 50:  # Likely text content
                            detected['text_columns'].append(col)
                        elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
                            detected['id_columns'].append(col)
                        else:
                            detected['product_columns'].append(col)
                    else:
                        detected['other_columns'].append(col)

        return detected

    def extract_relevant_data(self, df):
        """Extract only relevant columns and create optimized dataset"""
        detected = self.detect_column_types(df)

        # Create new dataframe with relevant columns
        extracted_data = pd.DataFrame()

        # Add unique identifier
        if detected['id_columns'] and len(detected['id_columns']) > 0:
            extracted_data['unique_id'] = df[detected['id_columns'][0]]
        else:
            extracted_data['unique_id'] = range(1, len(df) + 1)

        # Add product information
        if detected['product_columns'] and len(detected['product_columns']) > 0:
            # Convert to list if needed and limit to 2 product columns
            product_cols = list(detected['product_columns'])[:2]
            for col in product_cols:
                extracted_data[f'product_{col}'] = df[col]

        # Combine text columns
        if detected['text_columns'] and len(detected['text_columns']) > 0:
            text_cols = list(detected['text_columns'])  # Ensure it's a list
            text_data = []
            for idx in df.index:
                combined_text = ' '.join([
                    str(df.loc[idx, col])
                    for col in text_cols
                    if col in df.columns and pd.notna(df.loc[idx, col])
                ])
                text_data.append(combined_text)
            extracted_data['combined_text'] = text_data
        else:
            # If no text columns detected, create empty combined_text
            extracted_data['combined_text'] = [''] * len(df)

        # Add date columns
        if detected['date_columns'] and len(detected['date_columns']) > 0:
            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')

        return extracted_data, detected
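Not part of the uploaded notebook: a minimal usage sketch of the detector above, assuming the imports cell has already run. The toy column names and rows are made up for illustration.

# Illustrative sketch only: exercise SmartColumnDetector on a tiny hypothetical DataFrame.
sample = pd.DataFrame({
    "ticket_id": [101, 102],
    "product_name": ["Widget A", "Widget B"],
    "customer_feedback": ["Delivery was late and the box arrived damaged.",
                          "Great quality, easy to use and works as described."],
    "created_date": ["2024-01-05", "2024-02-11"],
})

detector = SmartColumnDetector()
extracted, detected = detector.extract_relevant_data(sample)
print(detected)                    # column names grouped by detected type
print(extracted.columns.tolist())  # unique_id, product_*, combined_text, date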
In [4]:
# ===== ENHANCED TEXT PROCESSOR =====
class EnhancedTextProcessor:
    """Enhanced text preprocessing with actionable insights extraction"""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Initialize actionable insights dictionary with common customer feedback phrases
        self.actionable_dictionary = {
            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
        }

    def clean_text(self, text):
        """Clean and normalize text"""
        if pd.isna(text) or text == '':
            return ""

        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = ' '.join(text.split())
        return text

    def extract_actionable_insights(self, text):
        """Extract actionable insights using dictionary matching"""
        if pd.isna(text) or text == '':
            return ""

        text_lower = text.lower()
        found_insights = []

        # Check each actionable item against the text
        for action, keywords in self.actionable_dictionary.items():
            for keyword in keywords:
                if keyword in text_lower:
                    found_insights.append(action)
                    break  # Only add each action once

        # Return top 3 most relevant insights
        if found_insights:
            return ', '.join(found_insights[:3])
        return ""

    def extract_specific_topics(self, text):
        """Extract specific topics from text using keyword extraction"""
        if pd.isna(text) or text == '' or len(text) < 10:
            return ['', '', '']

        # Clean text first
        text_lower = text.lower()

        # Remove stopwords for better topic extraction
        words = word_tokenize(text_lower)
        filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]

        # Extract noun phrases and important terms
        blob = TextBlob(text)
        noun_phrases = blob.noun_phrases

        # Combine noun phrases with high-frequency meaningful words
        topics = []

        # Add noun phrases (these are usually good topics)
        for phrase in noun_phrases[:5]:  # Limit to top 5 noun phrases
            if len(phrase.split()) <= 3:  # Only short phrases
                topics.append(phrase)

        # Add frequent meaningful words if we don't have enough topics
        if len(topics) < 3:
            word_freq = Counter(filtered_words)
            for word, _ in word_freq.most_common(5):
                if word not in str(topics):  # Avoid duplicates
                    topics.append(word)
                    if len(topics) >= 3:
                        break

        # Ensure we always return 3 items (empty string if not enough topics)
        topics = topics[:3]
        while len(topics) < 3:
            topics.append('')

        return topics

    def determine_topic(self, text):
        """Legacy method kept for compatibility - returns first specific topic"""
        topics = self.extract_specific_topics(text)
        return topics[0] if topics[0] else 'General'
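Not part of the uploaded notebook: a minimal usage sketch of the processor above, assuming the imports and NLTK/TextBlob downloads have already run. The feedback sentence is made up.

# Illustrative sketch only: dictionary-matched insights and extracted topics for one review.
processor = EnhancedTextProcessor()

feedback = "The app keeps crashing and support was unhelpful, but delivery was quick."
print(processor.extract_actionable_insights(feedback))  # e.g. "fix bugs, improve support"
print(processor.extract_specific_topics(feedback))      # up to three topic strings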
In [5]:
# ===== SEARCH ENGINE =====
class TextSearchEngine:
    """Advanced search functionality for text data with semantic capabilities"""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 3),  # Include unigrams, bigrams, and trigrams for better matching
            stop_words='english',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True  # Apply sublinear tf scaling
        )
        self.tfidf_matrix = None
        self.data = None

        # Synonym dictionary for semantic search
        self.synonyms = {
            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
            'help': ['support', 'assistance', 'aid', 'service'],
            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
            'quality': ['standard', 'grade', 'condition', 'caliber'],
            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
            'hard': ['difficult', 'complex', 'complicated', 'challenging'],
            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
            'love': ['like', 'enjoy', 'appreciate', 'adore'],
            'hate': ['dislike', 'despise', 'detest'],
            'feature': ['function', 'capability', 'option', 'characteristic'],
            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
        }

    def expand_query_with_synonyms(self, query):
        """Expand search query with synonyms for better semantic matching"""
        query_words = query.lower().split()
        expanded_terms = []

        for word in query_words:
            # Add the original word
            expanded_terms.append(word)

            # Add synonyms if available
            if word in self.synonyms:
                expanded_terms.extend(self.synonyms[word])

            # Check if word is a synonym of something else
            for key, syns in self.synonyms.items():
                if word in syns:
                    expanded_terms.append(key)
                    expanded_terms.extend([s for s in syns if s != word])

        # Remove duplicates while preserving order
        seen = set()
        unique_terms = []
        for term in expanded_terms:
            if term not in seen:
                unique_terms.append(term)
                seen.add(term)

        return ' '.join(unique_terms)

    def build_index(self, df, text_column):
        """Build search index from text data"""
        self.data = df.copy()
        texts = df[text_column].fillna('').tolist()

        # Add other searchable columns to improve search
        if 'topic_1' in df.columns:
            texts = [f"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
                     for i, text in enumerate(texts)]
        if 'actionable_insights' in df.columns:
            texts = [f"{texts[i]} {df.iloc[i]['actionable_insights']}"
                     for i in range(len(texts))]

        self.tfidf_matrix = self.vectorizer.fit_transform(texts)

    def search(self, query, top_k=10):
        """Enhanced search with semantic understanding"""
        if self.tfidf_matrix is None:
            return pd.DataFrame()

        # Expand query with synonyms
        expanded_query = self.expand_query_with_synonyms(query)

        # Vectorize both original and expanded queries
        query_vector = self.vectorizer.transform([query])
        expanded_vector = self.vectorizer.transform([expanded_query])

        # Calculate similarities for both
        similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()

        # Combine scores (weighted average - original query gets more weight)
        combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)

        # Get top results
        top_indices = combined_similarities.argsort()[-top_k:][::-1]
        top_scores = combined_similarities[top_indices]

        # Filter results with score > 0.05 (lower threshold for better recall)
        valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]

        if valid_indices:
            results = self.data.iloc[valid_indices].copy()
            results['search_score'] = [combined_similarities[idx] for idx in valid_indices]

            # Boost results that have exact matches
            query_lower = query.lower()
            for idx in results.index:
                if 'combined_text' in results.columns:
                    if query_lower in str(results.at[idx, 'combined_text']).lower():
                        results.at[idx, 'search_score'] *= 1.5  # Boost exact matches

            return results.sort_values('search_score', ascending=False)

        return pd.DataFrame()
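Not part of the uploaded notebook: a minimal usage sketch of the search engine above, assuming the imports cell has run. The two documents are made up; whether a given query clears the 0.05 score threshold depends on the TF-IDF vocabulary of your data.

# Illustrative sketch only: index two toy rows and run a synonym-expanded query.
engine = TextSearchEngine()
docs = pd.DataFrame({"combined_text": [
    "Shipping was delayed and nobody answered my emails.",
    "Fantastic product, great value for the price.",
]})
engine.build_index(docs, "combined_text")

hits = engine.search("slow delivery", top_k=5)
if not hits.empty:
    print(hits[["combined_text", "search_score"]])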
In [6]:
# ===== API CONFIGURATION =====
class AIModelManager:
    """Manages multiple AI model APIs and provides unified interface"""

    def __init__(self):
        self.available_models = {}
        self.clients = {}
        self.current_model = None
        self.initialize_apis()

    def initialize_apis(self):
        """Initialize all available AI APIs"""

        # Anthropic
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
        if ANTHROPIC_API_KEY:
            try:
                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)
                self.available_models['Claude 3 Haiku'] = {
                    'provider': 'anthropic',
                    'model': 'claude-3-haiku-20240307'
                }
                print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Anthropic: {e}")
        else:
            print("Anthropic API Key not set")

        # OpenAI
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if OPENAI_API_KEY and OpenAI:
            try:
                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
                self.available_models['GPT-4o-mini'] = {
                    'provider': 'openai',
                    'model': 'gpt-4o-mini'
                }
                self.available_models['GPT-3.5 Turbo'] = {
                    'provider': 'openai',
                    'model': 'gpt-3.5-turbo'
                }
                print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing OpenAI: {e}")
        else:
            print("OpenAI API Key not set or library not installed")

        # Deepseek (uses OpenAI-compatible API)
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
        if DEEPSEEK_API_KEY and OpenAI:
            try:
                self.clients['deepseek'] = OpenAI(
                    api_key=DEEPSEEK_API_KEY,
                    base_url="https://api.deepseek.com"
                )
                self.available_models['Deepseek Chat'] = {
                    'provider': 'deepseek',
                    'model': 'deepseek-chat'
                }
                print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing Deepseek: {e}")
        else:
            print("Deepseek API Key not set or OpenAI library not installed")

        # Groq
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        if GROQ_API_KEY and Groq:
            try:
                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
                self.available_models['Llama 3.3 70B'] = {
                    'provider': 'groq',
                    'model': 'llama-3.3-70b-versatile'
                }
                self.available_models['Mixtral 8x7B'] = {
                    'provider': 'groq',
                    'model': 'mixtral-8x7b-32768'
                }
                print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Groq: {e}")
        else:
            print("Groq API Key not set or library not installed")

        # Google Gemini
        GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        if GOOGLE_API_KEY and genai:
            try:
                genai.configure(api_key=GOOGLE_API_KEY)
                self.clients['google'] = genai
                self.available_models['Gemini 1.5 Flash'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-flash'
                }
                self.available_models['Gemini 1.5 Pro'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-pro'
                }
                print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
            except Exception as e:
                print(f"Error initializing Google Gemini: {e}")
        else:
            print("Google API Key not set or library not installed")

        # Set default model
        if self.available_models:
            self.current_model = list(self.available_models.keys())[0]

    def get_available_models(self):
        """Return list of available model names"""
        return list(self.available_models.keys())

    def set_model(self, model_name):
        """Set the current model"""
        if model_name in self.available_models:
            self.current_model = model_name
            return True
        return False

    def generate_text(self, prompt, max_tokens=1000):
        """Generate text using the current model"""
        if not self.current_model or self.current_model not in self.available_models:
            return None

        model_info = self.available_models[self.current_model]
        provider = model_info['provider']
        model = model_info['model']

        try:
            if provider == 'anthropic':
                client = self.clients['anthropic']
                response = client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

            elif provider in ['openai', 'deepseek']:
                client = self.clients[provider]
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            elif provider == 'groq':
                client = self.clients['groq']
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            elif provider == 'google':
                model_obj = genai.GenerativeModel(model)
                response = model_obj.generate_content(prompt)
                return response.text

        except Exception as e:
            print(f"Error generating text with {self.current_model}: {e}")
            return None
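Not part of the uploaded notebook: a minimal usage sketch of the manager above. It only produces output for providers whose API keys are set, and the generate_text call will hit the selected provider's API.

# Illustrative sketch only: list configured models, pick one, and request a short completion.
mm = AIModelManager()
print(mm.get_available_models())

if mm.get_available_models():
    mm.set_model(mm.get_available_models()[0])
    reply = mm.generate_text("Summarise: the checkout page times out on mobile.", max_tokens=100)
    print(reply)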
In [7]:
# Initialize the model manager globally
model_manager = AIModelManager()

Output:
Anthropic API Key exists and begins sk-a
OpenAI API Key exists and begins sk-proj
Deepseek API Key exists and begins sk-1099
Groq API Key exists and begins gsk_
Google API Key exists and begins AI
In [8]:
# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====

class EnhancedTextAnalyzer:
    """Main analysis engine with all enhanced features and multi-model support"""

    def __init__(self, model_manager=None):
        self.model_manager = model_manager
        self.column_detector = SmartColumnDetector()
        self.text_processor = EnhancedTextProcessor()
        self.search_engine = TextSearchEngine()
        self.original_df = None
        self.processed_df = None
        self.results = {}
        self.visualizations = {}

    def load_file(self, file):
        """Load data from various file formats"""
        try:
            if file.name.endswith('.csv'):
                df = pd.read_csv(file.name)
            elif file.name.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file.name)
            elif file.name.endswith('.json'):
                df = pd.read_json(file.name)
            else:
                return None, "Unsupported file format"

            return df, f"File loaded: {len(df)} records"
        except Exception as e:
            return None, f"Error loading file: {str(e)}"

    def process_data(self, df):
        """Process data with smart extraction and analysis"""
        # Extract relevant columns
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)

        # Store for reference
        self.processed_df = extracted_df

        # Clear original from memory
        del df
        gc.collect()

        # Add analysis columns
        if 'combined_text' in extracted_df.columns:
            # Sentiment analysis
            sentiments = []
            polarities = []
            topics_1 = []
            topics_2 = []
            topics_3 = []
            insights = []

            for text in extracted_df['combined_text']:
                # Sentiment
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity
                if polarity > 0.1:
                    sentiment = 'Positive'
                elif polarity < -0.1:
                    sentiment = 'Negative'
                else:
                    sentiment = 'Neutral'

                sentiments.append(sentiment)
                polarities.append(polarity)

                # Extract specific topics (3 separate topics)
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])
                topics_2.append(specific_topics[1])
                topics_3.append(specific_topics[2])

                # Actionable insights using dictionary matching
                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)

            extracted_df['sentiment'] = sentiments
            extracted_df['sentiment_score'] = polarities
            extracted_df['topic_1'] = topics_1
            extracted_df['topic_2'] = topics_2
            extracted_df['topic_3'] = topics_3
            extracted_df['actionable_insights'] = insights

        # Build search index with enhanced search capabilities
        self.search_engine.build_index(extracted_df, 'combined_text')

        # Save processed data
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)

        return extracted_df, detected_columns, output_file

    def generate_ai_insights(self, df, num_samples=5):
        """Generate AI-powered insights using selected model"""
        if not self.model_manager or not self.model_manager.current_model:
            return "No AI model available for generating insights"

        if 'combined_text' not in df.columns or df.empty:
            return "No text data available for AI analysis"

        # Sample some texts for analysis
        sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
        if not sample_texts:
            return "No valid text samples found"

        # Create prompt for AI analysis
        prompt = f"""Analyze the following customer feedback samples and provide key insights:

Samples:
{chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}

Please provide:
1. Main themes and patterns
2. Key sentiment indicators
3. Actionable recommendations
4. Areas of concern

Keep the response concise and focused on actionable insights."""

        # Generate insights using selected model
        try:
            response = self.model_manager.generate_text(prompt, max_tokens=500)
            if response:
                return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
            else:
                return "Failed to generate AI insights. Please check your API configuration."
        except Exception as e:
            return f"Error generating AI insights: {str(e)}"

    def generate_visualizations(self, df):
        """Generate various visualizations"""
        visualizations = {}

        if 'sentiment' in df.columns:
            # Sentiment distribution
            sentiment_counts = df['sentiment'].value_counts()
            fig_sentiment = px.pie(
                values=sentiment_counts.values,
                names=sentiment_counts.index,
                title="Sentiment Distribution",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Distribution'] = fig_sentiment

        if 'topic_1' in df.columns:
            # Combine all topics for overall topic distribution
            all_topics = []
            for col in ['topic_1', 'topic_2', 'topic_3']:
                if col in df.columns:
                    topics = df[col].dropna().tolist()
                    all_topics.extend([t for t in topics if t != ''])

            if all_topics:
                topic_counts = Counter(all_topics)
                top_topics = dict(topic_counts.most_common(15))

                fig_topics = px.bar(
                    x=list(top_topics.values()),
                    y=list(top_topics.keys()),
                    orientation='h',
                    title="Top 15 Specific Topics",
                    labels={'x': 'Count', 'y': 'Topic'}
                )
                visualizations['Topic Distribution'] = fig_topics

        if 'sentiment' in df.columns and 'topic_1' in df.columns:
            # Sentiment by primary topic (topic_1)
            df_temp = df[df['topic_1'] != ''].copy()
            if not df_temp.empty:
                # Get top 10 topics for cleaner visualization
                top_topics = df_temp['topic_1'].value_counts().head(10).index
                df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]

                pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
                fig_heatmap = px.imshow(
                    pivot_table,
                    labels=dict(x="Sentiment", y="Primary Topic", color="Count"),
                    title="Sentiment by Primary Topic Heatmap",
                    color_continuous_scale="RdYlGn"
                )
                visualizations['Sentiment by Topic'] = fig_heatmap

        if 'date' in df.columns and 'sentiment' in df.columns:
            # Sentiment over time
            df_time = df.copy()
            df_time['date'] = pd.to_datetime(df_time['date'])
            time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')

            fig_timeline = px.line(
                time_data,
                x='date',
                y='count',
                color='sentiment',
                title="Sentiment Trends Over Time",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Timeline'] = fig_timeline

        if 'actionable_insights' in df.columns:
            # Top actionable insights
            all_insights = []
            for insight in df['actionable_insights']:
                if insight and insight != "":
                    # Split by comma as we're now using comma-separated insights
                    all_insights.extend([i.strip() for i in insight.split(',')])

            if all_insights:
                insight_counts = Counter(all_insights)
                top_insights = dict(insight_counts.most_common(10))

                fig_insights = px.bar(
                    x=list(top_insights.values()),
                    y=list(top_insights.keys()),
                    orientation='h',
                    title="Top 10 Actionable Insights",
                    labels={'x': 'Frequency', 'y': 'Insight'}
                )
                visualizations['Top Insights'] = fig_insights

        return visualizations
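Not part of the uploaded notebook: a minimal end-to-end sketch that drives the analyzer above directly from a local file path instead of a Gradio upload. SimpleNamespace mimics the .name attribute that load_file() expects, and 'feedback.csv' is a placeholder path.

# Illustrative sketch only: load, process, and visualize a hypothetical CSV.
from types import SimpleNamespace

analyzer_demo = EnhancedTextAnalyzer(model_manager)
df_raw, msg = analyzer_demo.load_file(SimpleNamespace(name="feedback.csv"))
print(msg)
if df_raw is not None:
    processed, detected, saved_path = analyzer_demo.process_data(df_raw)
    charts = analyzer_demo.generate_visualizations(processed)
    print(saved_path, list(charts.keys()))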
In [9]:
# ===== GRADIO INTERFACE =====
# Global variables
analyzer = None
current_data = None
current_visualizations = None

def update_model(model_name):
    """Update the selected AI model"""
    global model_manager

    if model_manager.set_model(model_name):
        return f"Model switched to: {model_name}"
    else:
        return f"Failed to switch to: {model_name}"

def process_file(file, model_name):
    """Process uploaded file with selected model"""
    global analyzer, current_data, current_visualizations, model_manager

    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])

    try:
        # Update model if changed
        if model_name and model_manager:
            model_manager.set_model(model_name)

        analyzer = EnhancedTextAnalyzer(model_manager)

        # Load file
        df, message = analyzer.load_file(file)
        if df is None:
            return message, None, None, None, None, None, gr.update(choices=[])

        # Process data
        processed_df, detected_cols, output_file = analyzer.process_data(df)
        current_data = processed_df

        # Generate visualizations
        visualizations = analyzer.generate_visualizations(processed_df)
        current_visualizations = visualizations

        # Generate AI insights
        ai_insights = analyzer.generate_ai_insights(processed_df)

        # Create summary - safely handle detected columns
        text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []
        id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []
        product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []

        summary = f"""
    ### File Processing Complete!

    **Detected Columns:**
    - Text Columns: {', '.join(text_cols) if text_cols else 'None'}
    - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
    - Product Columns: {', '.join(product_cols) if product_cols else 'None'}

    **Analysis Results:**
    - Total Records: {len(processed_df)}
    - Processed File Saved: {output_file}
    - AI Model Used: {model_manager.current_model if model_manager else 'None'}
    """

        # Data preview
        preview = processed_df.head(10)

        # Get first visualization
        first_viz = list(visualizations.values())[0] if visualizations else None

        return (
            summary,
            preview,
            output_file,
            ai_insights,
            first_viz,
            "Ready for search",
            gr.update(choices=list(visualizations.keys()))
        )

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])

def search_data(query):
    """Search through the data with enhanced semantic search"""
    global analyzer, current_data

    if analyzer is None or current_data is None:
        return "Please process a file first", None, None

    if not query:
        return "Please enter a search query", None, None

    try:
        results = analyzer.search_engine.search(query, top_k=10)

        if results.empty:
            return "No results found", None, None

        # Select relevant columns for display (updated to include new topic columns)
        display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        display_cols = [col for col in display_cols if col in results.columns]

        results_display = results[display_cols]

        # Save search results
        search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        results_display.to_excel(search_output, index=False)

        return f"Found {len(results)} results", results_display.head(10), search_output

    except Exception as e:
        return f"Search error: {str(e)}", None, None

def update_visualization(viz_type):
    """Update displayed visualization"""
    global current_visualizations

    if current_visualizations and viz_type in current_visualizations:
        return current_visualizations[viz_type]
    return None

def export_results(format_type):
    """Export processed data in different formats"""
    global current_data

    if current_data is None:
        return "No data to export", None

    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if format_type == "Excel":
            output_file = f"analysis_results_{timestamp}.xlsx"
            current_data.to_excel(output_file, index=False)
        else:  # CSV
            output_file = f"analysis_results_{timestamp}.csv"
            current_data.to_csv(output_file, index=False)

        return f"Data exported to {output_file}", output_file

    except Exception as e:
        return f"Export error: {str(e)}", None
| 1086 |
+
{
|
| 1087 |
+
"cell_type": "code",
|
| 1088 |
+
"execution_count": 10,
|
| 1089 |
+
"id": "38bf0375-9ef8-488c-821f-288c4f59ff5d",
|
| 1090 |
+
"metadata": {},
|
| 1091 |
+
"outputs": [],
|
| 1092 |
+
"source": [
|
| 1093 |
+
"# Create Gradio interface\n",
|
| 1094 |
+
"def create_interface():\n",
|
| 1095 |
+
" \"\"\"Create the Gradio interface with model selection\"\"\"\n",
|
| 1096 |
+
" \n",
|
| 1097 |
+
" with gr.Blocks(theme=gr.themes.Soft()) as app:\n",
|
| 1098 |
+
" gr.Markdown(\n",
|
| 1099 |
+
" \"\"\"\n",
|
| 1100 |
+
" # π Enhanced Text Analytics AI Agent\n",
|
| 1101 |
+
" ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models\n",
|
| 1102 |
+
" \n",
|
| 1103 |
+
" **Features:**\n",
|
| 1104 |
+
" - π€ Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)\n",
|
| 1105 |
+
" - π Automatic detection of text, ID, and product columns\n",
|
| 1106 |
+
" - πΎ Memory-efficient processing with automatic file cleanup\n",
|
| 1107 |
+
" - π Sentiment analysis with scoring\n",
|
| 1108 |
+
" - π― Topic/theme extraction\n",
|
| 1109 |
+
" - π‘ Actionable insights generation\n",
|
| 1110 |
+
" - π Advanced text search with similarity scoring\n",
|
| 1111 |
+
" - π Multiple visualization options\n",
|
| 1112 |
+
" - π₯ Export results in Excel or CSV format\n",
|
| 1113 |
+
" \"\"\"\n",
|
| 1114 |
+
" )\n",
|
| 1115 |
+
" \n",
|
| 1116 |
+
" with gr.Tab(\"π€ Upload & Process\"):\n",
|
| 1117 |
+
" with gr.Row():\n",
|
| 1118 |
+
" with gr.Column(scale=1):\n",
|
| 1119 |
+
" # Model selection dropdown\n",
|
| 1120 |
+
" model_dropdown = gr.Dropdown(\n",
|
| 1121 |
+
" label=\"π€ Select AI Model\",\n",
|
| 1122 |
+
" choices=model_manager.get_available_models(),\n",
|
| 1123 |
+
" value=model_manager.current_model if model_manager.current_model else None,\n",
|
| 1124 |
+
" interactive=True\n",
|
| 1125 |
+
" )\n",
|
| 1126 |
+
" \n",
|
| 1127 |
+
" file_upload = gr.File(\n",
|
| 1128 |
+
" label=\"Upload Data File\",\n",
|
| 1129 |
+
" file_types=[\".csv\", \".xlsx\", \".xls\", \".json\"]\n",
|
| 1130 |
+
" )\n",
|
| 1131 |
+
" process_btn = gr.Button(\"π Process File\", variant=\"primary\")\n",
|
| 1132 |
+
" \n",
|
| 1133 |
+
" with gr.Column(scale=2):\n",
|
| 1134 |
+
" status_output = gr.Markdown(label=\"Processing Status\")\n",
|
| 1135 |
+
" ai_insights = gr.Markdown(label=\"AI-Generated Insights\")\n",
|
| 1136 |
+
" \n",
|
| 1137 |
+
" with gr.Row():\n",
|
| 1138 |
+
" data_preview = gr.Dataframe(\n",
|
| 1139 |
+
" label=\"Data Preview (First 10 rows)\",\n",
|
| 1140 |
+
" interactive=False\n",
|
| 1141 |
+
" )\n",
|
| 1142 |
+
" \n",
|
| 1143 |
+
" processed_file = gr.File(\n",
|
| 1144 |
+
" label=\"π Processed Data File\",\n",
|
| 1145 |
+
" interactive=False\n",
|
| 1146 |
+
" )\n",
|
| 1147 |
+
" \n",
|
| 1148 |
+
" with gr.Tab(\"π Search\"):\n",
|
| 1149 |
+
" gr.Markdown(\"### Search through your text data\")\n",
|
| 1150 |
+
" \n",
|
| 1151 |
+
" with gr.Row():\n",
|
| 1152 |
+
" search_input = gr.Textbox(\n",
|
| 1153 |
+
" label=\"Enter search query\",\n",
|
| 1154 |
+
" placeholder=\"Type keywords to search...\"\n",
|
| 1155 |
+
" )\n",
|
| 1156 |
+
" search_btn = gr.Button(\"π Search\", variant=\"primary\")\n",
|
| 1157 |
+
" \n",
|
| 1158 |
+
" search_status = gr.Markdown(label=\"Search Status\")\n",
|
| 1159 |
+
" search_results = gr.Dataframe(\n",
|
| 1160 |
+
" label=\"Search Results\",\n",
|
| 1161 |
+
" interactive=False\n",
|
| 1162 |
+
" )\n",
|
| 1163 |
+
" search_file = gr.File(\n",
|
| 1164 |
+
" label=\"π₯ Download Search Results\",\n",
|
| 1165 |
+
" interactive=False\n",
|
| 1166 |
+
" )\n",
|
| 1167 |
+
" \n",
|
| 1168 |
+
" with gr.Tab(\"π Visualizations\"):\n",
|
| 1169 |
+
" with gr.Row():\n",
|
| 1170 |
+
" viz_selector = gr.Dropdown(\n",
|
| 1171 |
+
" label=\"Select Visualization\",\n",
|
| 1172 |
+
" choices=[],\n",
|
| 1173 |
+
" interactive=True\n",
|
| 1174 |
+
" )\n",
|
| 1175 |
+
" \n",
|
| 1176 |
+
" viz_plot = gr.Plot(label=\"Visualization\")\n",
|
| 1177 |
+
" \n",
|
| 1178 |
+
" with gr.Tab(\"π₯ Export\"):\n",
|
| 1179 |
+
" gr.Markdown(\"### Export your analyzed data\")\n",
|
| 1180 |
+
" \n",
|
| 1181 |
+
" with gr.Row():\n",
|
| 1182 |
+
" export_format = gr.Radio(\n",
|
| 1183 |
+
" choices=[\"Excel\", \"CSV\"],\n",
|
| 1184 |
+
" value=\"Excel\",\n",
|
| 1185 |
+
" label=\"Export Format\"\n",
|
| 1186 |
+
" )\n",
|
| 1187 |
+
" export_btn = gr.Button(\"π₯ Export Data\", variant=\"primary\")\n",
|
| 1188 |
+
" \n",
|
| 1189 |
+
" export_status = gr.Markdown(label=\"Export Status\")\n",
|
| 1190 |
+
" export_file = gr.File(\n",
|
| 1191 |
+
" label=\"π Download Exported File\",\n",
|
| 1192 |
+
" interactive=False\n",
|
| 1193 |
+
" )\n",
|
| 1194 |
+
" \n",
|
| 1195 |
+
" # Event handlers\n",
|
| 1196 |
+
" model_dropdown.change(\n",
|
| 1197 |
+
" fn=update_model,\n",
|
| 1198 |
+
" inputs=[model_dropdown],\n",
|
| 1199 |
+
" outputs=[status_output]\n",
|
| 1200 |
+
" )\n",
|
| 1201 |
+
" \n",
|
| 1202 |
+
" process_btn.click(\n",
|
| 1203 |
+
" fn=process_file,\n",
|
| 1204 |
+
" inputs=[file_upload, model_dropdown],\n",
|
| 1205 |
+
" outputs=[\n",
|
| 1206 |
+
" status_output,\n",
|
| 1207 |
+
" data_preview,\n",
|
| 1208 |
+
" processed_file,\n",
|
| 1209 |
+
" ai_insights,\n",
|
| 1210 |
+
" viz_plot,\n",
|
| 1211 |
+
" search_status,\n",
|
| 1212 |
+
" viz_selector\n",
|
| 1213 |
+
" ]\n",
|
| 1214 |
+
" )\n",
|
| 1215 |
+
" \n",
|
| 1216 |
+
" search_btn.click(\n",
|
| 1217 |
+
" fn=search_data,\n",
|
| 1218 |
+
" inputs=[search_input],\n",
|
| 1219 |
+
" outputs=[search_status, search_results, search_file]\n",
|
| 1220 |
+
" )\n",
|
| 1221 |
+
" \n",
|
| 1222 |
+
" viz_selector.change(\n",
|
| 1223 |
+
" fn=update_visualization,\n",
|
| 1224 |
+
" inputs=[viz_selector],\n",
|
| 1225 |
+
" outputs=[viz_plot]\n",
|
| 1226 |
+
" )\n",
|
| 1227 |
+
" \n",
|
| 1228 |
+
" export_btn.click(\n",
|
| 1229 |
+
" fn=export_results,\n",
|
| 1230 |
+
" inputs=[export_format],\n",
|
| 1231 |
+
" outputs=[export_status, export_file]\n",
|
| 1232 |
+
" )\n",
|
| 1233 |
+
" \n",
|
| 1234 |
+
" return app"
|
| 1235 |
+
]
|
| 1236 |
+
},
|
| 1237 |
+
{
|
| 1238 |
+
"cell_type": "code",
|
| 1239 |
+
"execution_count": 11,
|
| 1240 |
+
"id": "6c5a0767-a788-43a8-911c-04e81814f4c4",
|
| 1241 |
+
"metadata": {},
|
| 1242 |
+
"outputs": [
|
| 1243 |
+
{
|
| 1244 |
+
"name": "stdout",
|
| 1245 |
+
"output_type": "stream",
|
| 1246 |
+
"text": [
|
| 1247 |
+
"* Running on local URL: http://127.0.0.1:7861\n",
|
| 1248 |
+
"* Running on public URL: https://8190830de481785995.gradio.live\n",
|
| 1249 |
+
"\n",
|
| 1250 |
+
"This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
|
| 1251 |
+
]
|
| 1252 |
+
},
|
| 1253 |
+
{
|
| 1254 |
+
"data": {
|
| 1255 |
+
"text/html": [
|
| 1256 |
+
"<div><iframe src=\"https://8190830de481785995.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 1257 |
+
],
|
| 1258 |
+
"text/plain": [
|
| 1259 |
+
"<IPython.core.display.HTML object>"
|
| 1260 |
+
]
|
| 1261 |
+
},
|
| 1262 |
+
"metadata": {},
|
| 1263 |
+
"output_type": "display_data"
|
| 1264 |
+
},
|
| 1265 |
+
{
|
| 1266 |
+
"name": "stdout",
|
| 1267 |
+
"output_type": "stream",
|
| 1268 |
+
"text": [
|
| 1269 |
+
"Keyboard interruption in main thread... closing server.\n",
|
| 1270 |
+
"Killing tunnel 127.0.0.1:7861 <> https://8190830de481785995.gradio.live\n"
|
| 1271 |
+
]
|
| 1272 |
+
}
|
| 1273 |
+
],
|
| 1274 |
+
"source": [
|
| 1275 |
+
"# Launch the application\n",
|
| 1276 |
+
"if __name__ == \"__main__\":\n",
|
| 1277 |
+
" app = create_interface()\n",
|
| 1278 |
+
" app.launch(share=True, debug=True)"
|
| 1279 |
+
]
|
| 1280 |
+
},
|
| 1281 |
+
{
|
| 1282 |
+
"cell_type": "code",
|
| 1283 |
+
"execution_count": 12,
|
| 1284 |
+
"id": "4f382d04-cee3-40ea-9687-5f2dff2282f7",
|
| 1285 |
+
"metadata": {},
|
| 1286 |
+
"outputs": [
|
| 1287 |
+
{
|
| 1288 |
+
"ename": "SyntaxError",
|
| 1289 |
+
"evalue": "invalid syntax (2621292756.py, line 1)",
|
| 1290 |
+
"output_type": "error",
|
| 1291 |
+
"traceback": [
|
| 1292 |
+
"\u001b[0;36m Cell \u001b[0;32mIn[12], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m python -m textblob.download_corpora\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
|
| 1293 |
+
]
|
| 1294 |
+
}
|
| 1295 |
+
],
|
| 1296 |
+
"source": [
|
| 1297 |
+
"python -m textblob.download_corpora"
|
| 1298 |
+
]
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"cell_type": "code",
|
| 1302 |
+
"execution_count": null,
|
| 1303 |
+
"id": "63afdaca-562b-4846-8fb2-c699f7ab6615",
|
| 1304 |
+
"metadata": {},
|
| 1305 |
+
"outputs": [],
|
| 1306 |
+
"source": []
|
| 1307 |
+
},
|
| 1308 |
+
{
|
| 1309 |
+
"cell_type": "code",
|
| 1310 |
+
"execution_count": null,
|
| 1311 |
+
"id": "d82bb0bb-053e-4c29-af8b-b732dfcb47ad",
|
| 1312 |
+
"metadata": {},
|
| 1313 |
+
"outputs": [],
|
| 1314 |
+
"source": []
|
| 1315 |
+
},
|
| 1316 |
+
{
|
| 1317 |
+
"cell_type": "code",
|
| 1318 |
+
"execution_count": null,
|
| 1319 |
+
"id": "12da3957-a063-48f8-8916-e552cc317280",
|
| 1320 |
+
"metadata": {},
|
| 1321 |
+
"outputs": [],
|
| 1322 |
+
"source": []
|
| 1323 |
+
}
|
| 1324 |
+
],
|
| 1325 |
+
"metadata": {
|
| 1326 |
+
"kernelspec": {
|
| 1327 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1328 |
+
"language": "python",
|
| 1329 |
+
"name": "python3"
|
| 1330 |
+
},
|
| 1331 |
+
"language_info": {
|
| 1332 |
+
"codemirror_mode": {
|
| 1333 |
+
"name": "ipython",
|
| 1334 |
+
"version": 3
|
| 1335 |
+
},
|
| 1336 |
+
"file_extension": ".py",
|
| 1337 |
+
"mimetype": "text/x-python",
|
| 1338 |
+
"name": "python",
|
| 1339 |
+
"nbconvert_exporter": "python",
|
| 1340 |
+
"pygments_lexer": "ipython3",
|
| 1341 |
+
"version": "3.13.5"
|
| 1342 |
+
}
|
| 1343 |
+
},
|
| 1344 |
+
"nbformat": 4,
|
| 1345 |
+
"nbformat_minor": 5
|
| 1346 |
+
}
|
app.py
ADDED
|
@@ -0,0 +1,1271 @@
|
| 1 |
+
# ===== MULTIMODAL TEXT ANALYTICS AI ASSISTANT =====
|
| 2 |
+
# This is a comprehensive text analytics system with multiple AI API integrations
|
| 3 |
+
# and smart column detection capabilities for customer feedback analysis
|
| 4 |
+
|
| 5 |
+
# ===== IMPORTS SECTION =====
|
| 6 |
+
# Core Python libraries for basic functionality
|
| 7 |
+
import os # Operating system interface for environment variables and file operations
|
| 8 |
+
import warnings # Python warnings control to suppress unnecessary warnings
|
| 9 |
+
warnings.filterwarnings('ignore') # Suppress all warnings to keep output clean
|
| 10 |
+
|
| 11 |
+
# Environment and API management
|
| 12 |
+
from dotenv import load_dotenv # Load environment variables from .env file for API keys
|
| 13 |
+
from anthropic import Anthropic # Anthropic's Claude AI API client
|
| 14 |
+
|
| 15 |
+
# Additional AI APIs - using try/except to handle missing dependencies gracefully
|
| 16 |
+
try:
|
| 17 |
+
from openai import OpenAI # OpenAI's GPT API client
|
| 18 |
+
except ImportError:
|
| 19 |
+
OpenAI = None # Set to None if not installed, will be checked later
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from groq import Groq # Groq's fast inference API client
|
| 23 |
+
except ImportError:
|
| 24 |
+
Groq = None # Set to None if not installed
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
import google.generativeai as genai # Google's Gemini API client
|
| 28 |
+
except ImportError:
|
| 29 |
+
genai = None # Set to None if not installed
|
| 30 |
+
|
| 31 |
+
# Data processing and manipulation libraries
|
| 32 |
+
import pandas as pd # Primary data manipulation library for DataFrames
|
| 33 |
+
import numpy as np # Numerical computing library for array operations
|
| 34 |
+
from datetime import datetime, timedelta # Date and time handling utilities
|
| 35 |
+
import json # JSON data format handling
|
| 36 |
+
import gc # Garbage collection for memory management - important for large datasets
|
| 37 |
+
|
| 38 |
+
# Natural Language Processing libraries
|
| 39 |
+
import nltk # Natural Language Toolkit - comprehensive NLP library
|
| 40 |
+
from nltk.corpus import stopwords # Common words to filter out (the, and, or, etc.)
|
| 41 |
+
from nltk.tokenize import word_tokenize # Split text into individual words/tokens
|
| 42 |
+
from nltk.stem import WordNetLemmatizer # Reduce words to their root form (running -> run)
|
| 43 |
+
from textblob import TextBlob # Simple API for diving into common NLP tasks
|
| 44 |
+
import re # Regular expressions for text pattern matching and cleaning
|
| 45 |
+
from collections import Counter # Efficient counting of hashable objects
|
| 46 |
+
|
| 47 |
+
# Machine Learning libraries for text analysis
|
| 48 |
+
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # Convert text to numerical features
|
| 49 |
+
from sklearn.decomposition import LatentDirichletAllocation # Topic modeling algorithm
|
| 50 |
+
from sklearn.cluster import KMeans # Clustering algorithm for grouping similar texts
|
| 51 |
+
from sklearn.preprocessing import StandardScaler # Normalize numerical features
|
| 52 |
+
from sklearn.metrics.pairwise import cosine_similarity # Measure similarity between text vectors
|
| 53 |
+
|
| 54 |
+
# Visualization libraries for creating charts and graphs
|
| 55 |
+
import plotly.express as px # High-level plotting interface
|
| 56 |
+
import plotly.graph_objects as go # Low-level plotting interface for custom charts
|
| 57 |
+
from plotly.subplots import make_subplots # Create multiple charts in one figure
|
| 58 |
+
import matplotlib.pyplot as plt # Traditional plotting library
|
| 59 |
+
import seaborn as sns # Statistical data visualization built on matplotlib
|
| 60 |
+
|
| 61 |
+
# Web interface framework
|
| 62 |
+
import gradio as gr # Create web interfaces for machine learning models
|
| 63 |
+
|
| 64 |
+
# Download required NLTK data packages - these contain language models and corpora
|
| 65 |
+
nltk.download('punkt', quiet=True) # Sentence tokenizer models
|
| 66 |
+
nltk.download('punkt_tab', quiet=True) # New tokenizer format for latest NLTK versions
|
| 67 |
+
nltk.download('stopwords', quiet=True) # Lists of common words to filter out
|
| 68 |
+
nltk.download('wordnet', quiet=True) # Lexical database for lemmatization
|
| 69 |
+
nltk.download('averaged_perceptron_tagger', quiet=True) # Part-of-speech tagger
|
| 70 |
+
nltk.download('omw-1.4', quiet=True) # Open Multilingual Wordnet for lemmatizer
|
| 71 |
+
nltk.download('brown', quiet=True) # Brown corpus required for TextBlob
|
| 72 |
+
|
| 73 |
+
# Download TextBlob corpora for sentiment analysis
|
| 74 |
+
try:
|
| 75 |
+
from textblob import download_corpora # Import corpora downloader
|
| 76 |
+
download_corpora.main() # Download all required corpora
|
| 77 |
+
except:
|
| 78 |
+
# Alternative method if the above doesn't work - use subprocess
|
| 79 |
+
import subprocess # Execute shell commands from Python
|
| 80 |
+
import sys # System-specific parameters and functions
|
| 81 |
+
try:
|
| 82 |
+
# Run TextBlob download command as subprocess with timeout
|
| 83 |
+
subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
|
| 84 |
+
capture_output=True, text=True, timeout=30)
|
| 85 |
+
except:
|
| 86 |
+
# If download fails, print warning but continue execution
|
| 87 |
+
print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
|
| 88 |
+
print("Please run: python -m textblob.download_corpora")
|
| 89 |
+
|
| 90 |
+
# Load environment variables from .env file, override existing ones
|
| 91 |
+
load_dotenv(override=True)
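# Illustrative sketch, not part of the original app: the API keys read below via
# os.getenv() are expected to come from the .env file loaded above, e.g. lines like
#   ANTHROPIC_API_KEY=...  OPENAI_API_KEY=...  DEEPSEEK_API_KEY=...
#   GROQ_API_KEY=...       GOOGLE_API_KEY=...
# (the key names are the ones used later in this file; the values are your own).
# Hypothetical helper, never called anywhere: report which provider keys are visible.
def _example_check_env_keys():
    for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "DEEPSEEK_API_KEY",
                "GROQ_API_KEY", "GOOGLE_API_KEY"):
        print(f"{key}: {'set' if os.getenv(key) else 'missing'}")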
|
| 92 |
+
|
| 93 |
+
# ===== SMART COLUMN DETECTOR CLASS =====
|
| 94 |
+
class SmartColumnDetector:
|
| 95 |
+
"""
|
| 96 |
+
Intelligently detect and extract relevant columns from uploaded data
|
| 97 |
+
This class automatically identifies what type of data each column contains
|
| 98 |
+
"""
|
| 99 |
+
|
| 100 |
+
def __init__(self):
|
| 101 |
+
"""Initialize the detector with keyword lists for different column types"""
|
| 102 |
+
# Keywords for detecting text/feedback columns - these usually contain the main content
|
| 103 |
+
self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
|
| 104 |
+
'response', 'opinion', 'message', 'notes', 'remarks']
|
| 105 |
+
|
| 106 |
+
# Keywords for detecting ID/identifier columns - these uniquely identify records
|
| 107 |
+
self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
|
| 108 |
+
'reference', 'index', 'uuid']
|
| 109 |
+
|
| 110 |
+
# Keywords for detecting product/category columns - these describe what's being reviewed
|
| 111 |
+
self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
|
| 112 |
+
'category', 'brand', 'name', 'sku']
|
| 113 |
+
|
| 114 |
+
# Keywords for detecting date/time columns - these show when feedback was given
|
| 115 |
+
self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']
|
| 116 |
+
|
| 117 |
+
def detect_column_types(self, df):
|
| 118 |
+
"""
|
| 119 |
+
Detect column types based on column names and content analysis
|
| 120 |
+
Returns a dictionary categorizing each column by its likely purpose
|
| 121 |
+
"""
|
| 122 |
+
# Initialize results dictionary with empty lists for each category
|
| 123 |
+
detected = {
|
| 124 |
+
'text_columns': [], # Columns containing feedback/comments
|
| 125 |
+
'id_columns': [], # Columns containing unique identifiers
|
| 126 |
+
'product_columns': [], # Columns describing products/categories
|
| 127 |
+
'date_columns': [], # Columns containing dates/timestamps
|
| 128 |
+
'other_columns': [] # Everything else
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
# Iterate through each column in the dataframe
|
| 132 |
+
for col in df.columns:
|
| 133 |
+
col_lower = col.lower() # Convert to lowercase for case-insensitive matching
|
| 134 |
+
|
| 135 |
+
# Check if column name contains text-related keywords
|
| 136 |
+
if any(keyword in col_lower for keyword in self.text_keywords):
|
| 137 |
+
detected['text_columns'].append(col)
|
| 138 |
+
# Check if column name contains ID-related keywords
|
| 139 |
+
elif any(keyword in col_lower for keyword in self.id_keywords):
|
| 140 |
+
detected['id_columns'].append(col)
|
| 141 |
+
# Check if column name contains product-related keywords
|
| 142 |
+
elif any(keyword in col_lower for keyword in self.product_keywords):
|
| 143 |
+
detected['product_columns'].append(col)
|
| 144 |
+
# Check if column name contains date-related keywords
|
| 145 |
+
elif any(keyword in col_lower for keyword in self.date_keywords):
|
| 146 |
+
detected['date_columns'].append(col)
|
| 147 |
+
else:
|
| 148 |
+
# If no keywords match, analyze the actual content to determine type
|
| 149 |
+
sample = df[col].dropna().head(100) # Get first 100 non-null values
|
| 150 |
+
if len(sample) > 0: # If we have sample data
|
| 151 |
+
# Check if column contains text data (object dtype in pandas)
|
| 152 |
+
if df[col].dtype == 'object':
|
| 153 |
+
# Calculate average length of text in this column
|
| 154 |
+
avg_length = sample.astype(str).str.len().mean()
|
| 155 |
+
if avg_length > 50: # Long text likely indicates feedback/comments
|
| 156 |
+
detected['text_columns'].append(col)
|
| 157 |
+
elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
|
| 158 |
+
# Short, mostly unique values likely indicate IDs
|
| 159 |
+
detected['id_columns'].append(col)
|
| 160 |
+
else:
|
| 161 |
+
# Short, non-unique text likely indicates categories/products
|
| 162 |
+
detected['product_columns'].append(col)
|
| 163 |
+
else:
|
| 164 |
+
# Non-text columns go to 'other' category
|
| 165 |
+
detected['other_columns'].append(col)
|
| 166 |
+
|
| 167 |
+
return detected # Return the categorized column dictionary
|
| 168 |
+
|
| 169 |
+
def extract_relevant_data(self, df):
|
| 170 |
+
"""
|
| 171 |
+
Extract only relevant columns and create optimized dataset for analysis
|
| 172 |
+
This reduces memory usage and focuses on important data
|
| 173 |
+
"""
|
| 174 |
+
# First, detect what type each column is
|
| 175 |
+
detected = self.detect_column_types(df)
|
| 176 |
+
|
| 177 |
+
# Create new dataframe with only relevant columns
|
| 178 |
+
extracted_data = pd.DataFrame()
|
| 179 |
+
|
| 180 |
+
# Add unique identifier column - use existing ID or create one
|
| 181 |
+
if detected['id_columns'] and len(detected['id_columns']) > 0:
|
| 182 |
+
# Use first detected ID column
|
| 183 |
+
extracted_data['unique_id'] = df[detected['id_columns'][0]]
|
| 184 |
+
else:
|
| 185 |
+
# Create sequential ID numbers if no ID column exists
|
| 186 |
+
extracted_data['unique_id'] = range(1, len(df) + 1)
|
| 187 |
+
|
| 188 |
+
# Add product information columns (limit to first 2 to avoid too many columns)
|
| 189 |
+
if detected['product_columns'] and len(detected['product_columns']) > 0:
|
| 190 |
+
# Convert to list if needed and limit to 2 product columns
|
| 191 |
+
product_cols = list(detected['product_columns'])[:2]
|
| 192 |
+
for col in product_cols:
|
| 193 |
+
# Add with 'product_' prefix to make purpose clear
|
| 194 |
+
extracted_data[f'product_{col}'] = df[col]
|
| 195 |
+
|
| 196 |
+
# Combine all text columns into a single 'combined_text' column
|
| 197 |
+
if detected['text_columns'] and len(detected['text_columns']) > 0:
|
| 198 |
+
text_cols = list(detected['text_columns']) # Ensure it's a list
|
| 199 |
+
text_data = [] # Initialize list to store combined text
|
| 200 |
+
|
| 201 |
+
# For each row, combine all text columns
|
| 202 |
+
for idx in df.index:
|
| 203 |
+
combined_text = ' '.join([
|
| 204 |
+
str(df.loc[idx, col]) # Convert to string
|
| 205 |
+
for col in text_cols # For each text column
|
| 206 |
+
if col in df.columns and pd.notna(df.loc[idx, col]) # If column exists and value is not null
|
| 207 |
+
])
|
| 208 |
+
text_data.append(combined_text) # Add to our list
|
| 209 |
+
extracted_data['combined_text'] = text_data # Add as new column
|
| 210 |
+
else:
|
| 211 |
+
# If no text columns detected, create empty combined_text column
|
| 212 |
+
extracted_data['combined_text'] = [''] * len(df)
|
| 213 |
+
|
| 214 |
+
# Add date column if available (use first detected date column)
|
| 215 |
+
if detected['date_columns'] and len(detected['date_columns']) > 0:
|
| 216 |
+
# Convert to datetime format, handle errors gracefully
|
| 217 |
+
extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')
|
| 218 |
+
|
| 219 |
+
# Return both the extracted data and the detection results
|
| 220 |
+
return extracted_data, detected
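# Illustrative usage sketch, never called by the app: how the detector classifies a
# small, made-up feedback table. The column names and values are assumptions chosen
# to trigger each heuristic (keyword match on names, content analysis otherwise).
def _example_column_detection():
    demo = pd.DataFrame({
        'customer_id': [101, 102, 103],                # name contains 'id' -> id column
        'product_name': ['Lamp', 'Lamp', 'Desk Fan'],  # name contains 'product' -> product column
        'feedback': ['Arrived late and the packaging was damaged.',
                     'Great value, easy to use.',
                     'The switch feels flimsy and broke after a week.'],  # 'feedback' -> text column
        'created_date': ['2024-01-02', '2024-01-05', '2024-01-09'],       # 'date' -> date column
    })
    detector = SmartColumnDetector()
    detected = detector.detect_column_types(demo)
    # detected -> {'text_columns': ['feedback'], 'id_columns': ['customer_id'],
    #              'product_columns': ['product_name'], 'date_columns': ['created_date'], ...}
    extracted, _ = detector.extract_relevant_data(demo)
    # extracted has 'unique_id', 'product_product_name', 'combined_text' and 'date' columns
    return detected, extracted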
|
| 221 |
+
|
| 222 |
+
# ===== ENHANCED TEXT PROCESSOR CLASS =====
|
| 223 |
+
class EnhancedTextProcessor:
|
| 224 |
+
"""
|
| 225 |
+
Enhanced text preprocessing with actionable insights extraction
|
| 226 |
+
This class handles text cleaning and extracts meaningful patterns from customer feedback
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
def __init__(self):
|
| 230 |
+
"""Initialize the text processor with NLP tools and insight dictionaries"""
|
| 231 |
+
self.lemmatizer = WordNetLemmatizer() # Tool to reduce words to root form
|
| 232 |
+
self.stop_words = set(stopwords.words('english')) # Common words to ignore
|
| 233 |
+
|
| 234 |
+
# Dictionary mapping actionable items to keywords that indicate them
|
| 235 |
+
# This helps identify what customers want improved
|
| 236 |
+
self.actionable_dictionary = {
|
| 237 |
+
'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
|
| 238 |
+
'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
|
| 239 |
+
'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
|
| 240 |
+
'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
|
| 241 |
+
'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
|
| 242 |
+
'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
|
| 243 |
+
'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
|
| 244 |
+
'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
|
| 245 |
+
'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
|
| 246 |
+
'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
|
| 247 |
+
'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
|
| 248 |
+
'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
|
| 249 |
+
'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
|
| 250 |
+
'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
|
| 251 |
+
'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
def clean_text(self, text):
|
| 255 |
+
"""
|
| 256 |
+
Clean and normalize text for analysis
|
| 257 |
+
Removes special characters and standardizes format
|
| 258 |
+
"""
|
| 259 |
+
# Handle null or empty text
|
| 260 |
+
if pd.isna(text) or text == '':
|
| 261 |
+
return ""
|
| 262 |
+
|
| 263 |
+
text = str(text).lower() # Convert to lowercase string
|
| 264 |
+
text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters, keep only letters, numbers, spaces
|
| 265 |
+
text = ' '.join(text.split()) # Remove extra whitespace
|
| 266 |
+
return text
|
| 267 |
+
|
| 268 |
+
def extract_actionable_insights(self, text):
|
| 269 |
+
"""
|
| 270 |
+
Extract actionable insights using dictionary matching
|
| 271 |
+
Returns comma-separated list of suggested improvements
|
| 272 |
+
"""
|
| 273 |
+
# Handle null or empty text
|
| 274 |
+
if pd.isna(text) or text == '':
|
| 275 |
+
return ""
|
| 276 |
+
|
| 277 |
+
text_lower = text.lower() # Convert to lowercase for matching
|
| 278 |
+
found_insights = [] # List to store found actionable items
|
| 279 |
+
|
| 280 |
+
# Check each actionable item against the text
|
| 281 |
+
for action, keywords in self.actionable_dictionary.items():
|
| 282 |
+
for keyword in keywords:
|
| 283 |
+
if keyword in text_lower: # If keyword found in text
|
| 284 |
+
found_insights.append(action) # Add the actionable item
|
| 285 |
+
break # Only add each action once per text
|
| 286 |
+
|
| 287 |
+
# Return top 3 most relevant insights to avoid overwhelming output
|
| 288 |
+
if found_insights:
|
| 289 |
+
return ', '.join(found_insights[:3])
|
| 290 |
+
return ""
|
| 291 |
+
|
| 292 |
+
def extract_specific_topics(self, text):
|
| 293 |
+
"""
|
| 294 |
+
Extract specific topics from text using keyword extraction and noun phrase detection
|
| 295 |
+
Returns list of 3 topics (may include empty strings if not enough topics found)
|
| 296 |
+
"""
|
| 297 |
+
# Handle null, empty, or very short text
|
| 298 |
+
if pd.isna(text) or text == '' or len(text) < 10:
|
| 299 |
+
return ['', '', ''] # Return 3 empty strings
|
| 300 |
+
|
| 301 |
+
text_lower = text.lower() # Convert to lowercase
|
| 302 |
+
|
| 303 |
+
# Remove stopwords for better topic extraction
|
| 304 |
+
words = word_tokenize(text_lower) # Split into individual words
|
| 305 |
+
# Filter out stopwords and very short words
|
| 306 |
+
filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]
|
| 307 |
+
|
| 308 |
+
# Extract noun phrases using TextBlob (these are usually good topics)
|
| 309 |
+
blob = TextBlob(text)
|
| 310 |
+
noun_phrases = blob.noun_phrases # Get noun phrases from text
|
| 311 |
+
|
| 312 |
+
topics = [] # Initialize topics list
|
| 313 |
+
|
| 314 |
+
# Add noun phrases (these are usually good topics)
|
| 315 |
+
for phrase in noun_phrases[:5]: # Limit to top 5 noun phrases
|
| 316 |
+
if len(phrase.split()) <= 3: # Only include short phrases (3 words or less)
|
| 317 |
+
topics.append(phrase)
|
| 318 |
+
|
| 319 |
+
# Add frequent meaningful words if we don't have enough topics
|
| 320 |
+
if len(topics) < 3:
|
| 321 |
+
word_freq = Counter(filtered_words) # Count word frequencies
|
| 322 |
+
for word, _ in word_freq.most_common(5): # Get top 5 most common words
|
| 323 |
+
if word not in str(topics): # Avoid duplicates
|
| 324 |
+
topics.append(word)
|
| 325 |
+
if len(topics) >= 3: # Stop when we have 3 topics
|
| 326 |
+
break
|
| 327 |
+
|
| 328 |
+
# Ensure we always return exactly 3 items
|
| 329 |
+
topics = topics[:3] # Take only first 3
|
| 330 |
+
while len(topics) < 3: # Add empty strings if needed
|
| 331 |
+
topics.append('')
|
| 332 |
+
|
| 333 |
+
return topics
|
| 334 |
+
|
| 335 |
+
def determine_topic(self, text):
|
| 336 |
+
"""
|
| 337 |
+
Legacy method kept for compatibility - returns first specific topic
|
| 338 |
+
This maintains backward compatibility with older versions
|
| 339 |
+
"""
|
| 340 |
+
topics = self.extract_specific_topics(text) # Get all topics
|
| 341 |
+
return topics[0] if topics[0] else 'General' # Return first topic or 'General'
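# Illustrative usage sketch, never called by the app: one piece of feedback run through
# the processor. The sample sentence is an assumption made up for this example.
def _example_text_processing():
    processor = EnhancedTextProcessor()
    sample = "The app keeps crashing and the support team was unhelpful."
    insights = processor.extract_actionable_insights(sample)
    # 'crash' matches the 'fix bugs' keywords and 'unhelpful' matches 'improve support',
    # so insights comes back as a comma-separated string like "fix bugs, improve support".
    topic_1, topic_2, topic_3 = processor.extract_specific_topics(sample)
    # Always three slots: noun phrases first, then frequent non-stopword words, padded with ''.
    return insights, (topic_1, topic_2, topic_3)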
|
| 342 |
+
|
| 343 |
+
# ===== SEARCH ENGINE CLASS =====
|
| 344 |
+
class TextSearchEngine:
|
| 345 |
+
"""
|
| 346 |
+
Advanced search functionality for text data with semantic capabilities
|
| 347 |
+
Uses TF-IDF vectorization and cosine similarity for intelligent text search
|
| 348 |
+
"""
|
| 349 |
+
|
| 350 |
+
def __init__(self):
|
| 351 |
+
"""Initialize the search engine with TF-IDF vectorizer and synonym dictionary"""
|
| 352 |
+
# TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
|
| 353 |
+
# Converts text to numerical vectors for similarity calculations
|
| 354 |
+
self.vectorizer = TfidfVectorizer(
|
| 355 |
+
max_features=1000, # Limit to top 1000 most important terms
|
| 356 |
+
ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams for better matching
|
| 357 |
+
stop_words='english', # Remove common English words
|
| 358 |
+
use_idf=True, # Use inverse document frequency weighting
|
| 359 |
+
smooth_idf=True, # Add smoothing to IDF
|
| 360 |
+
sublinear_tf=True # Apply sublinear tf scaling for better performance
|
| 361 |
+
)
|
| 362 |
+
self.tfidf_matrix = None # Will store the TF-IDF matrix after building index
|
| 363 |
+
self.data = None # Will store the original data
|
| 364 |
+
|
| 365 |
+
# Synonym dictionary for semantic search - helps find related terms
|
| 366 |
+
self.synonyms = {
|
| 367 |
+
'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
|
| 368 |
+
'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
|
| 369 |
+
'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
|
| 370 |
+
'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
|
| 371 |
+
'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
|
| 372 |
+
'help': ['support', 'assistance', 'aid', 'service'],
|
| 373 |
+
'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
|
| 374 |
+
'quality': ['standard', 'grade', 'condition', 'caliber'],
|
| 375 |
+
'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
|
| 376 |
+
'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
|
| 377 |
+
'hard': ['difficult', 'complex', 'complicated', 'challenging'],
|
| 378 |
+
'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
|
| 379 |
+
'love': ['like', 'enjoy', 'appreciate', 'adore'],
|
| 380 |
+
'hate': ['dislike', 'despise', 'detest'],
|
| 381 |
+
'feature': ['function', 'capability', 'option', 'characteristic'],
|
| 382 |
+
'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
def expand_query_with_synonyms(self, query):
|
| 386 |
+
"""
|
| 387 |
+
Expand search query with synonyms for better semantic matching
|
| 388 |
+
This helps find relevant results even when different words are used
|
| 389 |
+
"""
|
| 390 |
+
query_words = query.lower().split() # Split query into individual words
|
| 391 |
+
expanded_terms = [] # List to store original words and synonyms
|
| 392 |
+
|
| 393 |
+
for word in query_words:
|
| 394 |
+
expanded_terms.append(word) # Add the original word
|
| 395 |
+
|
| 396 |
+
# Add synonyms if available for this word
|
| 397 |
+
if word in self.synonyms:
|
| 398 |
+
expanded_terms.extend(self.synonyms[word])
|
| 399 |
+
|
| 400 |
+
# Check if word is a synonym of something else and add related terms
|
| 401 |
+
for key, syns in self.synonyms.items():
|
| 402 |
+
if word in syns: # If current word is a synonym
|
| 403 |
+
expanded_terms.append(key) # Add the main term
|
| 404 |
+
expanded_terms.extend([s for s in syns if s != word]) # Add other synonyms
|
| 405 |
+
|
| 406 |
+
# Remove duplicates while preserving order
|
| 407 |
+
seen = set()
|
| 408 |
+
unique_terms = []
|
| 409 |
+
for term in expanded_terms:
|
| 410 |
+
if term not in seen:
|
| 411 |
+
unique_terms.append(term)
|
| 412 |
+
seen.add(term)
|
| 413 |
+
|
| 414 |
+
return ' '.join(unique_terms) # Return expanded query as single string
|
| 415 |
+
|
| 416 |
+
def build_index(self, df, text_column):
|
| 417 |
+
"""
|
| 418 |
+
Build search index from text data
|
| 419 |
+
Creates TF-IDF vectors for all documents to enable fast similarity search
|
| 420 |
+
"""
|
| 421 |
+
self.data = df.copy() # Store copy of the data
|
| 422 |
+
texts = df[text_column].fillna('').tolist() # Get all text, fill nulls with empty string
|
| 423 |
+
|
| 424 |
+
# Add other searchable columns to improve search accuracy
|
| 425 |
+
if 'topic_1' in df.columns:
|
| 426 |
+
# Combine main text with topic information for better searchability
|
| 427 |
+
texts = [f"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
|
| 428 |
+
for i, text in enumerate(texts)]
|
| 429 |
+
if 'actionable_insights' in df.columns:
|
| 430 |
+
# Also include actionable insights in searchable text
|
| 431 |
+
texts = [f"{texts[i]} {df.iloc[i]['actionable_insights']}"
|
| 432 |
+
for i in range(len(texts))]
|
| 433 |
+
|
| 434 |
+
# Create TF-IDF matrix from all texts
|
| 435 |
+
self.tfidf_matrix = self.vectorizer.fit_transform(texts)
|
| 436 |
+
|
| 437 |
+
def search(self, query, top_k=10):
|
| 438 |
+
"""
|
| 439 |
+
Enhanced search with semantic understanding
|
| 440 |
+
Returns top matching documents with similarity scores
|
| 441 |
+
"""
|
| 442 |
+
# Check if index has been built
|
| 443 |
+
if self.tfidf_matrix is None:
|
| 444 |
+
return pd.DataFrame() # Return empty DataFrame if no index
|
| 445 |
+
|
| 446 |
+
# Expand query with synonyms for better semantic matching
|
| 447 |
+
expanded_query = self.expand_query_with_synonyms(query)
|
| 448 |
+
|
| 449 |
+
# Vectorize both original and expanded queries
|
| 450 |
+
query_vector = self.vectorizer.transform([query]) # Original query vector
|
| 451 |
+
expanded_vector = self.vectorizer.transform([expanded_query]) # Expanded query vector
|
| 452 |
+
|
| 453 |
+
# Calculate similarities for both queries against all documents
|
| 454 |
+
similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
|
| 455 |
+
similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()
|
| 456 |
+
|
| 457 |
+
# Combine scores (weighted average - original query gets more weight)
|
| 458 |
+
combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)
|
| 459 |
+
|
| 460 |
+
# Get top results
|
| 461 |
+
top_indices = combined_similarities.argsort()[-top_k:][::-1] # Get indices of top scores, reverse order
|
| 462 |
+
top_scores = combined_similarities[top_indices] # Get the actual scores
|
| 463 |
+
|
| 464 |
+
# Filter results with score > 0.05 (lower threshold for better recall)
|
| 465 |
+
valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]
|
| 466 |
+
|
| 467 |
+
if valid_indices:
|
| 468 |
+
# Create results dataframe from valid matches
|
| 469 |
+
results = self.data.iloc[valid_indices].copy()
|
| 470 |
+
results['search_score'] = [combined_similarities[idx] for idx in valid_indices]
|
| 471 |
+
|
| 472 |
+
# Boost results that have exact matches in the text
|
| 473 |
+
query_lower = query.lower()
|
| 474 |
+
for idx in results.index:
|
| 475 |
+
if 'combined_text' in results.columns:
|
| 476 |
+
# If exact query appears in text, boost the score
|
| 477 |
+
if query_lower in str(results.at[idx, 'combined_text']).lower():
|
| 478 |
+
results.at[idx, 'search_score'] *= 1.5 # 50% boost for exact matches
|
| 479 |
+
|
| 480 |
+
return results.sort_values('search_score', ascending=False) # Return sorted by relevance
|
| 481 |
+
|
| 482 |
+
return pd.DataFrame() # Return empty DataFrame if no valid results
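# Illustrative usage sketch, never called by the app: index a few made-up rows and run
# one query. Relevance is 0.7 * similarity(original query) + 0.3 * similarity(expanded
# query), with a 1.5x boost when the literal query string appears in the text.
def _example_search():
    demo = pd.DataFrame({
        'unique_id': [1, 2, 3],
        'combined_text': [
            'Delivery was slow and the box arrived damaged.',
            'Fast shipping and great build quality.',
            'The price is too high for what you get.',
        ],
    })
    engine = TextSearchEngine()
    engine.build_index(demo, 'combined_text')
    # 'slow' expands to terms like 'delayed' and 'delivery' to terms like 'shipping',
    # so the second row can score even though it never uses the word 'delivery'.
    results = engine.search('slow delivery', top_k=3)
    return results  # DataFrame sorted by the added 'search_score' column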
|
| 483 |
+
|
| 484 |
+
# ===== AI MODEL MANAGER CLASS =====
|
| 485 |
+
class AIModelManager:
|
| 486 |
+
"""
|
| 487 |
+
Manages multiple AI model APIs and provides unified interface
|
| 488 |
+
Supports OpenAI, Anthropic, Deepseek, Groq, and Google Gemini
|
| 489 |
+
"""
|
| 490 |
+
|
| 491 |
+
def __init__(self):
|
| 492 |
+
"""Initialize the model manager and set up all available AI APIs"""
|
| 493 |
+
self.available_models = {} # Dictionary to store available models
|
| 494 |
+
self.clients = {} # Dictionary to store API clients
|
| 495 |
+
self.current_model = None # Currently selected model
|
| 496 |
+
self.initialize_apis() # Set up all APIs
|
| 497 |
+
|
| 498 |
+
def initialize_apis(self):
|
| 499 |
+
"""Initialize all available AI APIs based on environment variables"""
|
| 500 |
+
|
| 501 |
+
# Anthropic Claude API setup
|
| 502 |
+
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") # Get API key from environment
|
| 503 |
+
if ANTHROPIC_API_KEY: # If API key exists
|
| 504 |
+
try:
|
| 505 |
+
self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY) # Create client
|
| 506 |
+
# Add Claude model to available models
|
| 507 |
+
self.available_models['Claude 3 Haiku'] = {
|
| 508 |
+
'provider': 'anthropic',
|
| 509 |
+
'model': 'claude-3-haiku-20240307'
|
| 510 |
+
}
|
| 511 |
+
print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}") # Confirm setup
|
| 512 |
+
except Exception as e:
|
| 513 |
+
print(f"Error initializing Anthropic: {e}")
|
| 514 |
+
else:
|
| 515 |
+
print("Anthropic API Key not set")
|
| 516 |
+
|
| 517 |
+
# OpenAI API setup
|
| 518 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 519 |
+
if OPENAI_API_KEY and OpenAI: # Check both API key and library availability
|
| 520 |
+
try:
|
| 521 |
+
self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
|
| 522 |
+
# Add multiple OpenAI models
|
| 523 |
+
self.available_models['GPT-4o-mini'] = {
|
| 524 |
+
'provider': 'openai',
|
| 525 |
+
'model': 'gpt-4o-mini'
|
| 526 |
+
}
|
| 527 |
+
self.available_models['GPT-3.5 Turbo'] = {
|
| 528 |
+
'provider': 'openai',
|
| 529 |
+
'model': 'gpt-3.5-turbo'
|
| 530 |
+
}
|
| 531 |
+
print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
|
| 532 |
+
except Exception as e:
|
| 533 |
+
print(f"Error initializing OpenAI: {e}")
|
| 534 |
+
else:
|
| 535 |
+
print("OpenAI API Key not set or library not installed")
|
| 536 |
+
|
| 537 |
+
# Deepseek API setup (uses OpenAI-compatible API)
|
| 538 |
+
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
|
| 539 |
+
if DEEPSEEK_API_KEY and OpenAI:
|
| 540 |
+
try:
|
| 541 |
+
# Deepseek uses OpenAI client with different base URL
|
| 542 |
+
self.clients['deepseek'] = OpenAI(
|
| 543 |
+
api_key=DEEPSEEK_API_KEY,
|
| 544 |
+
base_url="https://api.deepseek.com" # Deepseek's API endpoint
|
| 545 |
+
)
|
| 546 |
+
self.available_models['Deepseek Chat'] = {
|
| 547 |
+
'provider': 'deepseek',
|
| 548 |
+
'model': 'deepseek-chat'
|
| 549 |
+
}
|
| 550 |
+
print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
|
| 551 |
+
except Exception as e:
|
| 552 |
+
print(f"Error initializing Deepseek: {e}")
|
| 553 |
+
else:
|
| 554 |
+
print("Deepseek API Key not set or OpenAI library not installed")
|
| 555 |
+
|
| 556 |
+
# Groq API setup
|
| 557 |
+
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
| 558 |
+
if GROQ_API_KEY and Groq:
|
| 559 |
+
try:
|
| 560 |
+
self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
|
| 561 |
+
# Add multiple Groq models
|
| 562 |
+
self.available_models['Llama 3.3 70B'] = {
|
| 563 |
+
'provider': 'groq',
|
| 564 |
+
'model': 'llama-3.3-70b-versatile'
|
| 565 |
+
}
|
| 566 |
+
self.available_models['Mixtral 8x7B'] = {
|
| 567 |
+
'provider': 'groq',
|
| 568 |
+
'model': 'mixtral-8x7b-32768'
|
| 569 |
+
}
|
| 570 |
+
print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
|
| 571 |
+
except Exception as e:
|
| 572 |
+
print(f"Error initializing Groq: {e}")
|
| 573 |
+
else:
|
| 574 |
+
print("Groq API Key not set or library not installed")
|
| 575 |
+
|
| 576 |
+
# Google Gemini API setup
|
| 577 |
+
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 578 |
+
if GOOGLE_API_KEY and genai:
|
| 579 |
+
try:
|
| 580 |
+
genai.configure(api_key=GOOGLE_API_KEY) # Configure Google AI
|
| 581 |
+
self.clients['google'] = genai # Store the configured module
|
| 582 |
+
# Add Google models
|
| 583 |
+
self.available_models['Gemini 1.5 Flash'] = {
|
| 584 |
+
'provider': 'google',
|
| 585 |
+
'model': 'gemini-1.5-flash'
|
| 586 |
+
}
|
| 587 |
+
self.available_models['Gemini 1.5 Pro'] = {
|
| 588 |
+
'provider': 'google',
|
| 589 |
+
'model': 'gemini-1.5-pro'
|
| 590 |
+
}
|
| 591 |
+
print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
|
| 592 |
+
except Exception as e:
|
| 593 |
+
print(f"Error initializing Google Gemini: {e}")
|
| 594 |
+
else:
|
| 595 |
+
print("Google API Key not set or library not installed")
|
| 596 |
+
|
| 597 |
+
# Set default model to first available model
|
| 598 |
+
if self.available_models:
|
| 599 |
+
self.current_model = list(self.available_models.keys())[0]
|
| 600 |
+
|
| 601 |
+
def get_available_models(self):
|
| 602 |
+
"""Return list of available model names"""
|
| 603 |
+
return list(self.available_models.keys())
|
| 604 |
+
|
| 605 |
+
def set_model(self, model_name):
|
| 606 |
+
"""Set the current model for text generation"""
|
| 607 |
+
if model_name in self.available_models:
|
| 608 |
+
self.current_model = model_name
|
| 609 |
+
return True # Success
|
| 610 |
+
return False # Model not available
|
| 611 |
+
|
| 612 |
+
def generate_text(self, prompt, max_tokens=1000):
|
| 613 |
+
"""
|
| 614 |
+
Generate text using the current model
|
| 615 |
+
Handles different API formats for each provider
|
| 616 |
+
"""
|
| 617 |
+
# Check if we have a valid current model
|
| 618 |
+
if not self.current_model or self.current_model not in self.available_models:
|
| 619 |
+
return None
|
| 620 |
+
|
| 621 |
+
model_info = self.available_models[self.current_model] # Get model configuration
|
| 622 |
+
provider = model_info['provider'] # Which API provider to use
|
| 623 |
+
model = model_info['model'] # Specific model name
|
| 624 |
+
|
| 625 |
+
try:
|
| 626 |
+
# Handle Anthropic API format
|
| 627 |
+
if provider == 'anthropic':
|
| 628 |
+
client = self.clients['anthropic']
|
| 629 |
+
response = client.messages.create(
|
| 630 |
+
model=model,
|
| 631 |
+
max_tokens=max_tokens,
|
| 632 |
+
messages=[{"role": "user", "content": prompt}]
|
| 633 |
+
)
|
| 634 |
+
return response.content[0].text # Extract text from response
|
| 635 |
+
|
| 636 |
+
# Handle OpenAI and Deepseek API format (both use OpenAI-compatible format)
|
| 637 |
+
elif provider in ['openai', 'deepseek']:
|
| 638 |
+
client = self.clients[provider]
|
| 639 |
+
response = client.chat.completions.create(
|
| 640 |
+
model=model,
|
| 641 |
+
messages=[{"role": "user", "content": prompt}],
|
| 642 |
+
max_tokens=max_tokens
|
| 643 |
+
)
|
| 644 |
+
return response.choices[0].message.content # Extract text from response
|
| 645 |
+
|
| 646 |
+
# Handle Groq API format (similar to OpenAI)
|
| 647 |
+
elif provider == 'groq':
|
| 648 |
+
client = self.clients['groq']
|
| 649 |
+
response = client.chat.completions.create(
|
| 650 |
+
model=model,
|
| 651 |
+
messages=[{"role": "user", "content": prompt}],
|
| 652 |
+
max_tokens=max_tokens
|
| 653 |
+
)
|
| 654 |
+
return response.choices[0].message.content
|
| 655 |
+
|
| 656 |
+
# Handle Google Gemini API format
|
| 657 |
+
elif provider == 'google':
|
| 658 |
+
model_obj = genai.GenerativeModel(model) # Create model object
|
| 659 |
+
response = model_obj.generate_content(prompt) # Generate response
|
| 660 |
+
return response.text # Extract text
|
| 661 |
+
|
| 662 |
+
except Exception as e:
|
| 663 |
+
print(f"Error generating text with {self.current_model}: {e}")
|
| 664 |
+
return None
|
| 665 |
+
|
| 666 |
+
# Initialize the model manager globally so it can be used throughout the application
|
| 667 |
+
model_manager = AIModelManager()
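# Illustrative usage sketch, never called by the app: pick one of the registered models
# and send a single prompt. The model name below is an assumption and only resolves if
# the matching API key was found when AIModelManager initialized.
def _example_generate_insight():
    if model_manager.set_model('Claude 3 Haiku'):  # set_model() returns False for unknown names
        return model_manager.generate_text(
            "Summarize the three most common complaints in two sentences.",
            max_tokens=200,
        )  # returns None if the API call fails
    return None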
|
| 668 |
+
|
| 669 |
+
# ===== ENHANCED TEXT ANALYZER CLASS =====
|
| 670 |
+
class EnhancedTextAnalyzer:
|
| 671 |
+
"""
|
| 672 |
+
Main analysis engine with all enhanced features and multi-model support
|
| 673 |
+
This is the core class that orchestrates all text analysis functionality
|
| 674 |
+
"""
|
| 675 |
+
|
| 676 |
+
def __init__(self, model_manager=None):
|
| 677 |
+
"""Initialize the analyzer with all component classes"""
|
| 678 |
+
self.model_manager = model_manager # AI model manager for generating insights
|
| 679 |
+
self.column_detector = SmartColumnDetector() # Smart column detection
|
| 680 |
+
self.text_processor = EnhancedTextProcessor() # Text processing and insights
|
| 681 |
+
self.search_engine = TextSearchEngine() # Text search functionality
|
| 682 |
+
self.original_df = None # Store original data
|
| 683 |
+
self.processed_df = None # Store processed data
|
| 684 |
+
self.results = {} # Store analysis results
|
| 685 |
+
self.visualizations = {} # Store generated visualizations
|
| 686 |
+
|
| 687 |
+
def load_file(self, file):
|
| 688 |
+
"""
|
| 689 |
+
Load data from various file formats (CSV, Excel, JSON)
|
| 690 |
+
Returns the loaded dataframe and a status message
|
| 691 |
+
"""
|
| 692 |
+
try:
|
| 693 |
+
# Determine file type based on extension and load accordingly
|
| 694 |
+
if file.name.endswith('.csv'):
|
| 695 |
+
df = pd.read_csv(file.name) # Load CSV file
|
| 696 |
+
elif file.name.endswith(('.xlsx', '.xls')):
|
| 697 |
+
df = pd.read_excel(file.name) # Load Excel file
|
| 698 |
+
elif file.name.endswith('.json'):
|
| 699 |
+
df = pd.read_json(file.name) # Load JSON file
|
| 700 |
+
else:
|
| 701 |
+
return None, "Unsupported file format" # Return error for unsupported formats
|
| 702 |
+
|
| 703 |
+
return df, f"File loaded: {len(df)} records" # Return success message with record count
|
| 704 |
+
except Exception as e:
|
| 705 |
+
return None, f"Error loading file: {str(e)}" # Return error message
|
| 706 |
+
|
| 707 |
+

    def process_data(self, df):
        """
        Process data with smart extraction and analysis.
        This is the main processing pipeline that analyzes the uploaded data.
        """
        # Step 1: Extract relevant columns using smart detection
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)

        # Step 2: Store processed data for later use
        self.processed_df = extracted_df

        # Step 3: Clean up memory by deleting the original large dataframe
        del df
        gc.collect()  # Force garbage collection to free memory

        # Step 4: Add analysis columns if we have text data to analyze
        if 'combined_text' in extracted_df.columns:
            # Lists to store analysis results for each row
            sentiments = []   # Positive/Negative/Neutral sentiment classification
            polarities = []   # Numerical sentiment scores (-1 to 1)
            topics_1 = []     # Primary topic for each text
            topics_2 = []     # Secondary topic for each text
            topics_3 = []     # Tertiary topic for each text
            insights = []     # Actionable insights for each text

            # Process each text entry
            for text in extracted_df['combined_text']:
                # Sentiment analysis using TextBlob
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity  # Numerical sentiment score

                # Convert the numerical score to a categorical sentiment
                if polarity > 0.1:        # Positive threshold
                    sentiment = 'Positive'
                elif polarity < -0.1:     # Negative threshold
                    sentiment = 'Negative'
                else:                     # Neutral range
                    sentiment = 'Neutral'

                sentiments.append(sentiment)  # Categorical sentiment
                polarities.append(polarity)   # Numerical score

                # Extract specific topics (3 separate topics per text)
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])  # Primary topic
                topics_2.append(specific_topics[1])  # Secondary topic
                topics_3.append(specific_topics[2])  # Tertiary topic

                # Extract actionable insights using dictionary matching
                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)

            # Add all analysis results as new columns to the dataframe
            extracted_df['sentiment'] = sentiments            # Categorical sentiment
            extracted_df['sentiment_score'] = polarities      # Numerical sentiment score
            extracted_df['topic_1'] = topics_1                # Primary topic
            extracted_df['topic_2'] = topics_2                # Secondary topic
            extracted_df['topic_3'] = topics_3                # Tertiary topic
            extracted_df['actionable_insights'] = insights    # Actionable insights

            # Build the search index with enhanced search capabilities
            self.search_engine.build_index(extracted_df, 'combined_text')

        # Step 5: Save processed data to an Excel file for download
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)

        # Return processed data, detected column info, and the output file path
        return extracted_df, detected_columns, output_file
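
    # --- Note on the sentiment thresholds above (added explanation) ----------
    # TextBlob polarity ranges from -1.0 to 1.0; values between -0.1 and 0.1
    # (inclusive) are treated as Neutral. The same rule as a one-off check:
    #
    #     from textblob import TextBlob
    #     p = TextBlob("Great product, slow delivery.").sentiment.polarity
    #     label = 'Positive' if p > 0.1 else 'Negative' if p < -0.1 else 'Neutral'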

    def generate_ai_insights(self, df, num_samples=5):
        """
        Generate AI-powered insights using the selected model.
        Takes sample texts and generates high-level insights using AI.
        """
        # Check whether an AI model is available
        if not self.model_manager or not self.model_manager.current_model:
            return "No AI model available for generating insights"

        # Check whether we have text data to analyze
        if 'combined_text' not in df.columns or df.empty:
            return "No text data available for AI analysis"

        # Sample some texts for analysis (to avoid sending too much data to the AI)
        sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
        if not sample_texts:
            return "No valid text samples found"

        # Create the prompt for AI analysis
        # This prompt asks the AI to analyze the customer feedback samples
        prompt = f"""Analyze the following customer feedback samples and provide key insights:

Samples:
{chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}

Please provide:
1. Main themes and patterns
2. Key sentiment indicators
3. Actionable recommendations
4. Areas of concern

Keep the response concise and focused on actionable insights."""

        # Generate insights using the selected model
        try:
            response = self.model_manager.generate_text(prompt, max_tokens=500)
            if response:
                return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
            else:
                return "Failed to generate AI insights. Please check your API configuration."
        except Exception as e:
            return f"Error generating AI insights: {str(e)}"
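
    # --- Note on the prompt construction above (added explanation) -----------
    # chr(10) is simply "\n"; it is used because backslash escapes are not
    # allowed inside f-string expressions on Python versions before 3.12.
    # An equivalent join, written outside the f-string:
    #
    #     numbered = "\n".join(
    #         f"{i+1}. {t[:200]}..." if len(t) > 200 else f"{i+1}. {t}"
    #         for i, t in enumerate(sample_texts)
    #     )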

    def generate_visualizations(self, df):
        """
        Generate various visualizations from the analyzed data.
        Creates interactive charts using Plotly for a better user experience.
        """
        visualizations = {}  # Dictionary to store all visualizations

        # Sentiment distribution pie chart
        if 'sentiment' in df.columns:
            sentiment_counts = df['sentiment'].value_counts()  # Count each sentiment category
            fig_sentiment = px.pie(
                values=sentiment_counts.values,   # Values for pie slices
                names=sentiment_counts.index,     # Labels for pie slices
                title="Sentiment Distribution",   # Chart title
                color_discrete_map={              # Custom colors for each sentiment
                    'Positive': '#27AE60',        # Green for positive
                    'Negative': '#E74C3C',        # Red for negative
                    'Neutral': '#95A5A6'          # Gray for neutral
                }
            )
            visualizations['Sentiment Distribution'] = fig_sentiment

        # Topic distribution bar chart
        if 'topic_1' in df.columns:
            # Combine all topics from the three topic columns
            all_topics = []
            for col in ['topic_1', 'topic_2', 'topic_3']:
                if col in df.columns:
                    topics = df[col].dropna().tolist()                  # Get non-null topics
                    all_topics.extend([t for t in topics if t != ''])   # Keep non-empty topics

            if all_topics:
                topic_counts = Counter(all_topics)                # Count topic frequencies
                top_topics = dict(topic_counts.most_common(15))   # Get top 15 topics

                fig_topics = px.bar(
                    x=list(top_topics.values()),          # Frequency values
                    y=list(top_topics.keys()),            # Topic names
                    orientation='h',                      # Horizontal bar chart
                    title="Top 15 Specific Topics",       # Chart title
                    labels={'x': 'Count', 'y': 'Topic'}   # Axis labels
                )
                visualizations['Topic Distribution'] = fig_topics

        # Sentiment-by-topic heatmap
        if 'sentiment' in df.columns and 'topic_1' in df.columns:
            df_temp = df[df['topic_1'] != ''].copy()  # Filter out empty topics
            if not df_temp.empty:
                # Keep the top 10 topics for a cleaner visualization
                top_topics = df_temp['topic_1'].value_counts().head(10).index
                df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]

                # Create a cross-tabulation of topics vs sentiments
                pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
                fig_heatmap = px.imshow(
                    pivot_table,                                                    # Data for heatmap
                    labels=dict(x="Sentiment", y="Primary Topic", color="Count"),   # Labels
                    title="Sentiment by Primary Topic Heatmap",                     # Title
                    color_continuous_scale="RdYlGn"                                 # Color scale (red to green)
                )
                visualizations['Sentiment by Topic'] = fig_heatmap

        # Sentiment timeline if date data is available
        if 'date' in df.columns and 'sentiment' in df.columns:
            df_time = df.copy()
            df_time['date'] = pd.to_datetime(df_time['date'])  # Ensure date format
            # Group by month and sentiment to show trends over time
            time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')

            fig_timeline = px.line(
                time_data,
                x='date',                             # X-axis: time
                y='count',                            # Y-axis: count
                color='sentiment',                    # One line per sentiment
                title="Sentiment Trends Over Time",   # Chart title
                color_discrete_map={                  # Custom colors
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Timeline'] = fig_timeline

        # Actionable insights bar chart
        if 'actionable_insights' in df.columns:
            all_insights = []  # List to store all individual insights
            for insight in df['actionable_insights']:
                if insight and insight != "":
                    # Split on commas since insights are stored comma-separated
                    all_insights.extend([i.strip() for i in insight.split(',')])

            if all_insights:
                insight_counts = Counter(all_insights)                # Count insight frequencies
                top_insights = dict(insight_counts.most_common(10))   # Get top 10 insights

                fig_insights = px.bar(
                    x=list(top_insights.values()),              # Frequency values
                    y=list(top_insights.keys()),                # Insight names
                    orientation='h',                            # Horizontal bar chart
                    title="Top 10 Actionable Insights",         # Chart title
                    labels={'x': 'Frequency', 'y': 'Insight'}   # Axis labels
                )
                visualizations['Top Insights'] = fig_insights

        return visualizations  # Return dictionary of all generated visualizations
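
# --- Illustrative sketch (not part of the original file) ---------------------
# generate_visualizations() returns a dict of Plotly Figure objects, so any of
# them can also be written out as a standalone HTML file; the target filename
# below is an example.
def _sketch_save_figure(visualizations, path="sentiment_distribution.html"):
    """Minimal sketch: persist the first generated chart, if any."""
    if visualizations:
        next(iter(visualizations.values())).write_html(path)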

# ===== GRADIO INTERFACE FUNCTIONS =====
# Global variables to maintain state across function calls
analyzer = None                  # Main analyzer instance
current_data = None              # Currently processed data
current_visualizations = None    # Currently generated visualizations

def update_model(model_name):
    """Update the selected AI model"""
    global model_manager

    if model_manager.set_model(model_name):  # Try to set the new model
        return f"✅ Model switched to: {model_name}"
    else:
        return f"❌ Failed to switch to: {model_name}"

def process_file(file, model_name):
    """
    Process the uploaded file with the selected model.
    This is the main function called when the user uploads a file.
    """
    global analyzer, current_data, current_visualizations, model_manager

    # Check whether a file was uploaded
    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])

    try:
        # Update the model if it changed
        if model_name and model_manager:
            model_manager.set_model(model_name)

        # Create a new analyzer instance
        analyzer = EnhancedTextAnalyzer(model_manager)

        # Load the uploaded file
        df, message = analyzer.load_file(file)
        if df is None:  # If file loading failed
            return message, None, None, None, None, None, gr.update(choices=[])

        # Process the loaded data
        processed_df, detected_cols, output_file = analyzer.process_data(df)
        current_data = processed_df  # Store for later use

        # Generate visualizations from the processed data
        visualizations = analyzer.generate_visualizations(processed_df)
        current_visualizations = visualizations  # Store for later use

        # Generate AI insights using the selected model
        ai_insights = analyzer.generate_ai_insights(processed_df)

        # Create a summary of the processing results
        # Safely handle detected columns (convert to lists and limit length)
        text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []
        id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []
        product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []

        summary = f"""
### ✅ File Processing Complete!

**Detected Columns:**
- Text Columns: {', '.join(text_cols) if text_cols else 'None'}
- ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
- Product Columns: {', '.join(product_cols) if product_cols else 'None'}

**Analysis Results:**
- Total Records: {len(processed_df)}
- Processed File Saved: {output_file}
- AI Model Used: {model_manager.current_model if model_manager else 'None'}
"""

        # Create a data preview (first 10 rows for display)
        preview = processed_df.head(10)

        # Get the first visualization for immediate display
        first_viz = list(visualizations.values())[0] if visualizations else None

        # Return all results for the Gradio interface
        return (
            summary,              # Processing status
            preview,              # Data preview
            output_file,          # Downloadable processed file
            ai_insights,          # AI-generated insights
            first_viz,            # First visualization
            "Ready for search",   # Search status
            gr.update(choices=list(visualizations.keys()))  # Update visualization dropdown
        )

    except Exception as e:
        # Return an error message if anything goes wrong
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])
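
# --- Note on the return values above (added explanation) ---------------------
# process_file() always returns seven values, both on success and on error;
# Gradio maps them positionally onto the outputs wired up in create_interface()
# (status_output, data_preview, processed_file, ai_insights, viz_plot,
# search_status, viz_selector), so the two orderings must stay in sync.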

def search_data(query):
    """
    Search through the data with enhanced semantic search.
    Uses the built search engine to find relevant text entries.
    """
    global analyzer, current_data

    # Check whether data has been processed
    if analyzer is None or current_data is None:
        return "Please process a file first", None, None

    # Check whether a search query was provided
    if not query:
        return "Please enter a search query", None, None

    try:
        # Perform the search using the search engine
        results = analyzer.search_engine.search(query, top_k=10)

        # Check whether any results were found
        if results.empty:
            return "No results found", None, None

        # Select relevant columns for display (including the new topic columns)
        display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        display_cols = [col for col in display_cols if col in results.columns]  # Only include existing columns

        results_display = results[display_cols]  # Create display dataframe

        # Save search results to a file for download
        search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        results_display.to_excel(search_output, index=False)

        # Return search results and status
        return f"Found {len(results)} results", results_display.head(10), search_output

    except Exception as e:
        return f"Search error: {str(e)}", None, None
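
# --- Illustrative sketch (not part of the original file) ---------------------
# Once a file has been processed, the underlying search engine can be queried
# directly without the UI; the query string here is only an example.
def _sketch_search(query="late delivery", top_k=5):
    if analyzer is None:
        return None
    hits = analyzer.search_engine.search(query, top_k=top_k)
    cols = [c for c in ['combined_text', 'search_score'] if c in hits.columns]
    return hits[cols] if not hits.empty else hits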

def update_visualization(viz_type):
    """
    Update the displayed visualization based on the user's selection.
    Called when the user selects a different visualization from the dropdown.
    """
    global current_visualizations

    # Check whether the visualization exists and return it
    if current_visualizations and viz_type in current_visualizations:
        return current_visualizations[viz_type]
    return None  # Return None if the visualization is not found

def export_results(format_type):
    """
    Export processed data in different formats (Excel or CSV).
    Allows users to download their analyzed data.
    """
    global current_data

    # Check whether there is data to export
    if current_data is None:
        return "No data to export", None

    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')  # Timestamp for a unique filename

        # Export based on the selected format
        if format_type == "Excel":
            output_file = f"analysis_results_{timestamp}.xlsx"
            current_data.to_excel(output_file, index=False)   # Save as Excel
        else:  # CSV
            output_file = f"analysis_results_{timestamp}.csv"
            current_data.to_csv(output_file, index=False)     # Save as CSV

        return f"Data exported to {output_file}", output_file

    except Exception as e:
        return f"Export error: {str(e)}", None

# ===== GRADIO INTERFACE CREATION =====
def create_interface():
    """
    Create the Gradio interface with model selection.
    This function builds the entire web interface using Gradio.
    """

    # Create the main Gradio application with the Soft theme
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # Main title and description
        gr.Markdown(
            """
            # 📊 Enhanced Text Analytics AI Agent
            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models

            **Features:**
            - 🤖 Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
            - 🔍 Automatic detection of text, ID, and product columns
            - 💾 Memory-efficient processing with automatic file cleanup
            - 📊 Sentiment analysis with scoring
            - 🎯 Topic/theme extraction
            - 💡 Actionable insights generation
            - 🔎 Advanced text search with similarity scoring
            - 📈 Multiple visualization options
            - 📥 Export results in Excel or CSV format
            """
        )

        # Tab 1: Upload & Process
        with gr.Tab("📤 Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):  # Left column for controls
                    # Model selection dropdown
                    model_dropdown = gr.Dropdown(
                        label="🤖 Select AI Model",
                        choices=model_manager.get_available_models(),  # Get available models
                        value=model_manager.current_model if model_manager.current_model else None,
                        interactive=True
                    )

                    # File upload component
                    file_upload = gr.File(
                        label="Upload Data File",
                        file_types=[".csv", ".xlsx", ".xls", ".json"]  # Supported file types
                    )

                    # Process button
                    process_btn = gr.Button("🚀 Process File", variant="primary")

                with gr.Column(scale=2):  # Right column for results
                    status_output = gr.Markdown(label="Processing Status")    # Processing status display
                    ai_insights = gr.Markdown(label="AI-Generated Insights")  # AI insights display

            # Data preview section
            with gr.Row():
                data_preview = gr.Dataframe(
                    label="Data Preview (First 10 rows)",
                    interactive=False  # Read-only display
                )

            # Processed file download
            processed_file = gr.File(
                label="📁 Processed Data File",
                interactive=False  # Read-only, for download only
            )

        # Tab 2: Search
        with gr.Tab("🔍 Search"):
            gr.Markdown("### Search through your text data")

            with gr.Row():
                # Search input box
                search_input = gr.Textbox(
                    label="Enter search query",
                    placeholder="Type keywords to search..."
                )
                # Search button
                search_btn = gr.Button("🔍 Search", variant="primary")

            # Search results display
            search_status = gr.Markdown(label="Search Status")  # Search status
            search_results = gr.Dataframe(                      # Search results table
                label="Search Results",
                interactive=False
            )
            search_file = gr.File(                              # Download search results
                label="📥 Download Search Results",
                interactive=False
            )

        # Tab 3: Visualizations
        with gr.Tab("📊 Visualizations"):
            with gr.Row():
                # Visualization selector dropdown
                viz_selector = gr.Dropdown(
                    label="Select Visualization",
                    choices=[],  # Will be populated after processing
                    interactive=True
                )

            # Visualization display area
            viz_plot = gr.Plot(label="Visualization")

        # Tab 4: Export
        with gr.Tab("📥 Export"):
            gr.Markdown("### Export your analyzed data")

            with gr.Row():
                # Export format selection
                export_format = gr.Radio(
                    choices=["Excel", "CSV"],
                    value="Excel",
                    label="Export Format"
                )
                # Export button
                export_btn = gr.Button("📥 Export Data", variant="primary")

            # Export results display
            export_status = gr.Markdown(label="Export Status")  # Export status
            export_file = gr.File(                               # Download exported file
                label="📁 Download Exported File",
                interactive=False
            )

        # ===== EVENT HANDLERS =====
        # These connect user interactions to the backend functions

        # Model selection change handler
        model_dropdown.change(
            fn=update_model,          # Function to call
            inputs=[model_dropdown],  # Input components
            outputs=[status_output]   # Output components
        )

        # File processing button click handler
        process_btn.click(
            fn=process_file,                       # Function to call
            inputs=[file_upload, model_dropdown],  # Input components
            outputs=[                              # Output components
                status_output,
                data_preview,
                processed_file,
                ai_insights,
                viz_plot,
                search_status,
                viz_selector
            ]
        )

        # Search button click handler
        search_btn.click(
            fn=search_data,                                       # Function to call
            inputs=[search_input],                                # Input components
            outputs=[search_status, search_results, search_file]  # Output components
        )

        # Visualization selector change handler
        viz_selector.change(
            fn=update_visualization,  # Function to call
            inputs=[viz_selector],    # Input components
            outputs=[viz_plot]        # Output components
        )

        # Export button click handler
        export_btn.click(
            fn=export_results,                    # Function to call
            inputs=[export_format],               # Input components
            outputs=[export_status, export_file]  # Output components
        )

    return app  # Return the complete Gradio application

# ===== APPLICATION LAUNCH =====
# Launch the application when the script is run directly
if __name__ == "__main__":
    app = create_interface()            # Create the Gradio interface
    app.launch(share=True, debug=True)  # Launch with public sharing and debug mode
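
# --- Deployment note (assumption, not part of the original file) -------------
# share=True opens a temporary public Gradio tunnel, which is handy for local
# testing; on a hosted platform such as Hugging Face Spaces the app is already
# exposed, so a plain app.launch() (optionally with server_name="0.0.0.0" and
# server_port=7860) is usually sufficient.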
requirements.txt
ADDED
@@ -0,0 +1,34 @@
# Core data processing and analysis
pandas>=2.0.0
numpy>=1.24.0

# Environment and configuration
python-dotenv>=1.0.0

# AI/ML APIs
anthropic>=0.25.0
openai>=1.30.0
groq>=0.8.0
google-generativeai>=0.5.0

# Natural Language Processing
nltk>=3.8.0
textblob>=0.17.1

# Machine Learning
scikit-learn>=1.3.0

# Visualization
plotly>=5.15.0
matplotlib>=3.7.0
seaborn>=0.12.0

# Web Interface
gradio>=4.25.0

# File handling (additional support)
openpyxl>=3.1.0
xlrd>=2.0.0

# Optional: For better performance
numba>=0.57.0
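# Installation note (added): the minimum versions above can be installed with "pip install -r requirements.txt".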