Fola-AI committed on
Commit
928639d
·
verified ·
1 Parent(s): 3a84dee

Upload 3 files

Browse files
Files changed (3) hide show
  1. Multimodal_Text_Analytics.ipynb +1346 -0
  2. app.py +1271 -0
  3. requirements.txt +34 -0
Multimodal_Text_Analytics.ipynb ADDED
@@ -0,0 +1,1346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "3baa95af-73a1-4d3c-a562-f90777f1f0c0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Text Data Analysis AI Assistant with Gradio\n",
9
+ " - Intelligent Customer Feedback Analysis System with Multiple AI APIs"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "id": "31a6bbea-df57-40ed-afd3-4df75cc86d0a",
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "name": "stderr",
20
+ "output_type": "stream",
21
+ "text": [
22
+ "[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...\n",
23
+ "[nltk_data] Package brown is already up-to-date!\n",
24
+ "[nltk_data] Downloading package punkt_tab to /Users/fola-\n",
25
+ "[nltk_data] ai/nltk_data...\n",
26
+ "[nltk_data] Package punkt_tab is already up-to-date!\n",
27
+ "[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...\n",
28
+ "[nltk_data] Package wordnet is already up-to-date!\n",
29
+ "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
30
+ "[nltk_data] /Users/fola-ai/nltk_data...\n",
31
+ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger_eng.zip.\n",
32
+ "[nltk_data] Downloading package conll2000 to /Users/fola-\n",
33
+ "[nltk_data] ai/nltk_data...\n",
34
+ "[nltk_data] Unzipping corpora/conll2000.zip.\n",
35
+ "[nltk_data] Downloading package movie_reviews to /Users/fola-\n",
36
+ "[nltk_data] ai/nltk_data...\n",
37
+ "[nltk_data] Unzipping corpora/movie_reviews.zip.\n"
38
+ ]
39
+ },
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Finished.\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "# ===== IMPORTS SECTION =====\n",
50
+ "# Core libraries\n",
51
+ "import os\n",
52
+ "import warnings\n",
53
+ "warnings.filterwarnings('ignore')\n",
54
+ "\n",
55
+ "# Environment and API\n",
56
+ "from dotenv import load_dotenv\n",
57
+ "from anthropic import Anthropic\n",
58
+ "\n",
59
+ "# Additional AI APIs\n",
60
+ "try:\n",
61
+ " from openai import OpenAI\n",
62
+ "except ImportError:\n",
63
+ " OpenAI = None\n",
64
+ " \n",
65
+ "try:\n",
66
+ " from groq import Groq\n",
67
+ "except ImportError:\n",
68
+ " Groq = None\n",
69
+ " \n",
70
+ "try:\n",
71
+ " import google.generativeai as genai\n",
72
+ "except ImportError:\n",
73
+ " genai = None\n",
74
+ "\n",
75
+ "# Data processing\n",
76
+ "import pandas as pd\n",
77
+ "import numpy as np\n",
78
+ "from datetime import datetime, timedelta\n",
79
+ "import json\n",
80
+ "import gc # For garbage collection\n",
81
+ "\n",
82
+ "# Natural Language Processing\n",
83
+ "import nltk\n",
84
+ "from nltk.corpus import stopwords\n",
85
+ "from nltk.tokenize import word_tokenize\n",
86
+ "from nltk.stem import WordNetLemmatizer\n",
87
+ "from textblob import TextBlob\n",
88
+ "import re\n",
89
+ "from collections import Counter\n",
90
+ "\n",
91
+ "# Machine Learning\n",
92
+ "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
93
+ "from sklearn.decomposition import LatentDirichletAllocation\n",
94
+ "from sklearn.cluster import KMeans\n",
95
+ "from sklearn.preprocessing import StandardScaler\n",
96
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
97
+ "\n",
98
+ "# Visualization\n",
99
+ "import plotly.express as px\n",
100
+ "import plotly.graph_objects as go\n",
101
+ "from plotly.subplots import make_subplots\n",
102
+ "import matplotlib.pyplot as plt\n",
103
+ "import seaborn as sns\n",
104
+ "\n",
105
+ "# Web interface\n",
106
+ "import gradio as gr\n",
107
+ "\n",
108
+ "# Download required NLTK data\n",
109
+ "nltk.download('punkt', quiet=True)\n",
110
+ "nltk.download('punkt_tab', quiet=True) # New tokenizer format\n",
111
+ "nltk.download('stopwords', quiet=True)\n",
112
+ "nltk.download('wordnet', quiet=True)\n",
113
+ "nltk.download('averaged_perceptron_tagger', quiet=True)\n",
114
+ "nltk.download('omw-1.4', quiet=True) # For WordNet lemmatizer\n",
115
+ "nltk.download('brown', quiet=True) # Required for TextBlob\n",
116
+ "\n",
117
+ "# Download TextBlob corpora\n",
118
+ "try:\n",
119
+ " from textblob import download_corpora\n",
120
+ " download_corpora.main()\n",
121
+ "except:\n",
122
+ " # Alternative method if the above doesn't work\n",
123
+ " import subprocess\n",
124
+ " import sys\n",
125
+ " try:\n",
126
+ " subprocess.run([sys.executable, \"-m\", \"textblob.download_corpora\"], \n",
127
+ " capture_output=True, text=True, timeout=30)\n",
128
+ " except:\n",
129
+ " print(\"Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.\")\n",
130
+ " print(\"Please run: python -m textblob.download_corpora\")"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 2,
136
+ "id": "db7c1e72-7960-4968-9a72-0f62ca7140d9",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/plain": [
142
+ "True"
143
+ ]
144
+ },
145
+ "execution_count": 2,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "load_dotenv(override=True)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 3,
157
+ "id": "bded62da-82ab-4e17-bbf5-3edfe1b39398",
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "# ===== SMART COLUMN DETECTOR =====\n",
162
+ "class SmartColumnDetector:\n",
163
+ " \"\"\"Intelligently detect and extract relevant columns from uploaded data\"\"\"\n",
164
+ " \n",
165
+ " def __init__(self):\n",
166
+ " # Keywords for detecting different column types\n",
167
+ " self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text', \n",
168
+ " 'response', 'opinion', 'message', 'notes', 'remarks']\n",
169
+ " self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref', \n",
170
+ " 'reference', 'index', 'uuid']\n",
171
+ " self.product_keywords = ['product', 'item', 'model', 'variant', 'type', \n",
172
+ " 'category', 'brand', 'name', 'sku']\n",
173
+ " self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']\n",
174
+ " \n",
175
+ " def detect_column_types(self, df):\n",
176
+ " \"\"\"Detect column types based on column names and content\"\"\"\n",
177
+ " detected = {\n",
178
+ " 'text_columns': [],\n",
179
+ " 'id_columns': [],\n",
180
+ " 'product_columns': [],\n",
181
+ " 'date_columns': [],\n",
182
+ " 'other_columns': []\n",
183
+ " }\n",
184
+ " \n",
185
+ " for col in df.columns:\n",
186
+ " col_lower = col.lower()\n",
187
+ " \n",
188
+ " # Check for text columns\n",
189
+ " if any(keyword in col_lower for keyword in self.text_keywords):\n",
190
+ " detected['text_columns'].append(col)\n",
191
+ " # Check for ID columns\n",
192
+ " elif any(keyword in col_lower for keyword in self.id_keywords):\n",
193
+ " detected['id_columns'].append(col)\n",
194
+ " # Check for product columns\n",
195
+ " elif any(keyword in col_lower for keyword in self.product_keywords):\n",
196
+ " detected['product_columns'].append(col)\n",
197
+ " # Check for date columns\n",
198
+ " elif any(keyword in col_lower for keyword in self.date_keywords):\n",
199
+ " detected['date_columns'].append(col)\n",
200
+ " else:\n",
201
+ " # Analyze content to determine type\n",
202
+ " sample = df[col].dropna().head(100)\n",
203
+ " if len(sample) > 0:\n",
204
+ " # Check if mostly text\n",
205
+ " if df[col].dtype == 'object':\n",
206
+ " avg_length = sample.astype(str).str.len().mean()\n",
207
+ " if avg_length > 50: # Likely text content\n",
208
+ " detected['text_columns'].append(col)\n",
209
+ " elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:\n",
210
+ " detected['id_columns'].append(col)\n",
211
+ " else:\n",
212
+ " detected['product_columns'].append(col)\n",
213
+ " else:\n",
214
+ " detected['other_columns'].append(col)\n",
215
+ " \n",
216
+ " return detected\n",
217
+ " \n",
218
+ " def extract_relevant_data(self, df):\n",
219
+ " \"\"\"Extract only relevant columns and create optimized dataset\"\"\"\n",
220
+ " detected = self.detect_column_types(df)\n",
221
+ " \n",
222
+ " # Create new dataframe with relevant columns\n",
223
+ " extracted_data = pd.DataFrame()\n",
224
+ " \n",
225
+ " # Add unique identifier\n",
226
+ " if detected['id_columns'] and len(detected['id_columns']) > 0:\n",
227
+ " extracted_data['unique_id'] = df[detected['id_columns'][0]]\n",
228
+ " else:\n",
229
+ " extracted_data['unique_id'] = range(1, len(df) + 1)\n",
230
+ " \n",
231
+ " # Add product information\n",
232
+ " if detected['product_columns'] and len(detected['product_columns']) > 0:\n",
233
+ " # Convert to list if needed and limit to 2 product columns\n",
234
+ " product_cols = list(detected['product_columns'])[:2]\n",
235
+ " for col in product_cols:\n",
236
+ " extracted_data[f'product_{col}'] = df[col]\n",
237
+ " \n",
238
+ " # Combine text columns\n",
239
+ " if detected['text_columns'] and len(detected['text_columns']) > 0:\n",
240
+ " text_cols = list(detected['text_columns']) # Ensure it's a list\n",
241
+ " text_data = []\n",
242
+ " for idx in df.index:\n",
243
+ " combined_text = ' '.join([\n",
244
+ " str(df.loc[idx, col]) \n",
245
+ " for col in text_cols \n",
246
+ " if col in df.columns and pd.notna(df.loc[idx, col])\n",
247
+ " ])\n",
248
+ " text_data.append(combined_text)\n",
249
+ " extracted_data['combined_text'] = text_data\n",
250
+ " else:\n",
251
+ " # If no text columns detected, create empty combined_text\n",
252
+ " extracted_data['combined_text'] = [''] * len(df)\n",
253
+ " \n",
254
+ " # Add date columns\n",
255
+ " if detected['date_columns'] and len(detected['date_columns']) > 0:\n",
256
+ " extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')\n",
257
+ " \n",
258
+ " return extracted_data, detected"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 4,
264
+ "id": "626af7bf-b4cf-4259-b409-18e5225555aa",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "# ===== ENHANCED TEXT PROCESSOR =====\n",
269
+ "class EnhancedTextProcessor:\n",
270
+ " \"\"\"Enhanced text preprocessing with actionable insights extraction\"\"\"\n",
271
+ "\n",
272
+ " def __init__(self):\n",
273
+ " self.lemmatizer = WordNetLemmatizer()\n",
274
+ " self.stop_words = set(stopwords.words('english'))\n",
275
+ " \n",
276
+ " # Initialize actionable insights dictionary with common customer feedback phrases\n",
277
+ " self.actionable_dictionary = {\n",
278
+ " 'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],\n",
279
+ " 'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],\n",
280
+ " 'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],\n",
281
+ " 'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],\n",
282
+ " 'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],\n",
283
+ " 'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],\n",
284
+ " 'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],\n",
285
+ " 'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],\n",
286
+ " 'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],\n",
287
+ " 'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],\n",
288
+ " 'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],\n",
289
+ " 'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],\n",
290
+ " 'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],\n",
291
+ " 'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],\n",
292
+ " 'more options': ['limited options', 'no variety', 'need more choices', 'only one option']\n",
293
+ " }\n",
294
+ "\n",
295
+ " def clean_text(self, text):\n",
296
+ " \"\"\"Clean and normalize text\"\"\"\n",
297
+ " if pd.isna(text) or text == '':\n",
298
+ " return \"\"\n",
299
+ "\n",
300
+ " text = str(text).lower()\n",
301
+ " text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n",
302
+ " text = ' '.join(text.split())\n",
303
+ " return text\n",
304
+ "\n",
305
+ " def extract_actionable_insights(self, text):\n",
306
+ " \"\"\"Extract actionable insights using dictionary matching\"\"\"\n",
307
+ " if pd.isna(text) or text == '':\n",
308
+ " return \"\"\n",
309
+ " \n",
310
+ " text_lower = text.lower()\n",
311
+ " found_insights = []\n",
312
+ " \n",
313
+ " # Check each actionable item against the text\n",
314
+ " for action, keywords in self.actionable_dictionary.items():\n",
315
+ " for keyword in keywords:\n",
316
+ " if keyword in text_lower:\n",
317
+ " found_insights.append(action)\n",
318
+ " break # Only add each action once\n",
319
+ " \n",
320
+ " # Return top 3 most relevant insights\n",
321
+ " if found_insights:\n",
322
+ " return ', '.join(found_insights[:3])\n",
323
+ " return \"\"\n",
324
+ "\n",
325
+ " def extract_specific_topics(self, text):\n",
326
+ " \"\"\"Extract specific topics from text using keyword extraction\"\"\"\n",
327
+ " if pd.isna(text) or text == '' or len(text) < 10:\n",
328
+ " return ['', '', '']\n",
329
+ " \n",
330
+ " # Clean text first\n",
331
+ " text_lower = text.lower()\n",
332
+ " \n",
333
+ " # Remove stopwords for better topic extraction\n",
334
+ " words = word_tokenize(text_lower)\n",
335
+ " filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]\n",
336
+ " \n",
337
+ " # Extract noun phrases and important terms\n",
338
+ " blob = TextBlob(text)\n",
339
+ " noun_phrases = blob.noun_phrases\n",
340
+ " \n",
341
+ " # Combine noun phrases with high-frequency meaningful words\n",
342
+ " topics = []\n",
343
+ " \n",
344
+ " # Add noun phrases (these are usually good topics)\n",
345
+ " for phrase in noun_phrases[:5]: # Limit to top 5 noun phrases\n",
346
+ " if len(phrase.split()) <= 3: # Only short phrases\n",
347
+ " topics.append(phrase)\n",
348
+ " \n",
349
+ " # Add frequent meaningful words if we don't have enough topics\n",
350
+ " if len(topics) < 3:\n",
351
+ " word_freq = Counter(filtered_words)\n",
352
+ " for word, _ in word_freq.most_common(5):\n",
353
+ " if word not in str(topics): # Avoid duplicates\n",
354
+ " topics.append(word)\n",
355
+ " if len(topics) >= 3:\n",
356
+ " break\n",
357
+ " \n",
358
+ " # Ensure we always return 3 items (empty string if not enough topics)\n",
359
+ " topics = topics[:3]\n",
360
+ " while len(topics) < 3:\n",
361
+ " topics.append('')\n",
362
+ " \n",
363
+ " return topics\n",
364
+ "\n",
365
+ " def determine_topic(self, text):\n",
366
+ " \"\"\"Legacy method kept for compatibility - returns first specific topic\"\"\"\n",
367
+ " topics = self.extract_specific_topics(text)\n",
368
+ " return topics[0] if topics[0] else 'General'"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 5,
374
+ "id": "b2eb5f17-7400-4591-8c0e-de7645b87c72",
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "# ===== SEARCH ENGINE =====\n",
379
+ "class TextSearchEngine:\n",
380
+ " \"\"\"Advanced search functionality for text data with semantic capabilities\"\"\"\n",
381
+ " \n",
382
+ " def __init__(self):\n",
383
+ " self.vectorizer = TfidfVectorizer(\n",
384
+ " max_features=1000,\n",
385
+ " ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams for better matching\n",
386
+ " stop_words='english',\n",
387
+ " use_idf=True,\n",
388
+ " smooth_idf=True,\n",
389
+ " sublinear_tf=True # Apply sublinear tf scaling\n",
390
+ " )\n",
391
+ " self.tfidf_matrix = None\n",
392
+ " self.data = None\n",
393
+ " \n",
394
+ " # Synonym dictionary for semantic search\n",
395
+ " self.synonyms = {\n",
396
+ " 'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],\n",
397
+ " 'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],\n",
398
+ " 'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],\n",
399
+ " 'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],\n",
400
+ " 'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],\n",
401
+ " 'help': ['support', 'assistance', 'aid', 'service'],\n",
402
+ " 'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],\n",
403
+ " 'quality': ['standard', 'grade', 'condition', 'caliber'],\n",
404
+ " 'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],\n",
405
+ " 'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],\n",
406
+ " 'hard': ['difficult', 'complex', 'complicated', 'challenging'],\n",
407
+ " 'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],\n",
408
+ " 'love': ['like', 'enjoy', 'appreciate', 'adore'],\n",
409
+ " 'hate': ['dislike', 'despise', 'detest'],\n",
410
+ " 'feature': ['function', 'capability', 'option', 'characteristic'],\n",
411
+ " 'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']\n",
412
+ " }\n",
413
+ " \n",
414
+ " def expand_query_with_synonyms(self, query):\n",
415
+ " \"\"\"Expand search query with synonyms for better semantic matching\"\"\"\n",
416
+ " query_words = query.lower().split()\n",
417
+ " expanded_terms = []\n",
418
+ " \n",
419
+ " for word in query_words:\n",
420
+ " # Add the original word\n",
421
+ " expanded_terms.append(word)\n",
422
+ " \n",
423
+ " # Add synonyms if available\n",
424
+ " if word in self.synonyms:\n",
425
+ " expanded_terms.extend(self.synonyms[word])\n",
426
+ " \n",
427
+ " # Check if word is a synonym of something else\n",
428
+ " for key, syns in self.synonyms.items():\n",
429
+ " if word in syns:\n",
430
+ " expanded_terms.append(key)\n",
431
+ " expanded_terms.extend([s for s in syns if s != word])\n",
432
+ " \n",
433
+ " # Remove duplicates while preserving order\n",
434
+ " seen = set()\n",
435
+ " unique_terms = []\n",
436
+ " for term in expanded_terms:\n",
437
+ " if term not in seen:\n",
438
+ " unique_terms.append(term)\n",
439
+ " seen.add(term)\n",
440
+ " \n",
441
+ " return ' '.join(unique_terms)\n",
442
+ " \n",
443
+ " def build_index(self, df, text_column):\n",
444
+ " \"\"\"Build search index from text data\"\"\"\n",
445
+ " self.data = df.copy()\n",
446
+ " texts = df[text_column].fillna('').tolist()\n",
447
+ " \n",
448
+ " # Add other searchable columns to improve search\n",
449
+ " if 'topic_1' in df.columns:\n",
450
+ " texts = [f\"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}\" \n",
451
+ " for i, text in enumerate(texts)]\n",
452
+ " if 'actionable_insights' in df.columns:\n",
453
+ " texts = [f\"{texts[i]} {df.iloc[i]['actionable_insights']}\" \n",
454
+ " for i in range(len(texts))]\n",
455
+ " \n",
456
+ " self.tfidf_matrix = self.vectorizer.fit_transform(texts)\n",
457
+ " \n",
458
+ " def search(self, query, top_k=10):\n",
459
+ " \"\"\"Enhanced search with semantic understanding\"\"\"\n",
460
+ " if self.tfidf_matrix is None:\n",
461
+ " return pd.DataFrame()\n",
462
+ " \n",
463
+ " # Expand query with synonyms\n",
464
+ " expanded_query = self.expand_query_with_synonyms(query)\n",
465
+ " \n",
466
+ " # Vectorize both original and expanded queries\n",
467
+ " query_vector = self.vectorizer.transform([query])\n",
468
+ " expanded_vector = self.vectorizer.transform([expanded_query])\n",
469
+ " \n",
470
+ " # Calculate similarities for both\n",
471
+ " similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()\n",
472
+ " similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()\n",
473
+ " \n",
474
+ " # Combine scores (weighted average - original query gets more weight)\n",
475
+ " combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)\n",
476
+ " \n",
477
+ " # Get top results\n",
478
+ " top_indices = combined_similarities.argsort()[-top_k:][::-1]\n",
479
+ " top_scores = combined_similarities[top_indices]\n",
480
+ " \n",
481
+ " # Filter results with score > 0.05 (lower threshold for better recall)\n",
482
+ " valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]\n",
483
+ " \n",
484
+ " if valid_indices:\n",
485
+ " results = self.data.iloc[valid_indices].copy()\n",
486
+ " results['search_score'] = [combined_similarities[idx] for idx in valid_indices]\n",
487
+ " \n",
488
+ " # Boost results that have exact matches\n",
489
+ " query_lower = query.lower()\n",
490
+ " for idx in results.index:\n",
491
+ " if 'combined_text' in results.columns:\n",
492
+ " if query_lower in str(results.at[idx, 'combined_text']).lower():\n",
493
+ " results.at[idx, 'search_score'] *= 1.5 # Boost exact matches\n",
494
+ " \n",
495
+ " return results.sort_values('search_score', ascending=False)\n",
496
+ " \n",
497
+ " return pd.DataFrame()\n"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "code",
502
+ "execution_count": 6,
503
+ "id": "e8b88155-971f-4dd5-b26c-104a737bc426",
504
+ "metadata": {},
505
+ "outputs": [],
506
+ "source": [
507
+ "# ===== API CONFIGURATION =====\n",
508
+ "class AIModelManager:\n",
509
+ " \"\"\"Manages multiple AI model APIs and provides unified interface\"\"\"\n",
510
+ " \n",
511
+ " def __init__(self):\n",
512
+ " self.available_models = {}\n",
513
+ " self.clients = {}\n",
514
+ " self.current_model = None\n",
515
+ " self.initialize_apis()\n",
516
+ " \n",
517
+ " def initialize_apis(self):\n",
518
+ " \"\"\"Initialize all available AI APIs\"\"\"\n",
519
+ " \n",
520
+ " # Anthropic\n",
521
+ " ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n",
522
+ " if ANTHROPIC_API_KEY:\n",
523
+ " try:\n",
524
+ " self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)\n",
525
+ " self.available_models['Claude 3 Haiku'] = {\n",
526
+ " 'provider': 'anthropic',\n",
527
+ " 'model': 'claude-3-haiku-20240307'\n",
528
+ " }\n",
529
+ " print(f\"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}\")\n",
530
+ " except Exception as e:\n",
531
+ " print(f\"Error initializing Anthropic: {e}\")\n",
532
+ " else:\n",
533
+ " print(\"Anthropic API Key not set\")\n",
534
+ " \n",
535
+ " # OpenAI\n",
536
+ " OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
537
+ " if OPENAI_API_KEY and OpenAI:\n",
538
+ " try:\n",
539
+ " self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)\n",
540
+ " self.available_models['GPT-4o-mini'] = {\n",
541
+ " 'provider': 'openai',\n",
542
+ " 'model': 'gpt-4o-mini'\n",
543
+ " }\n",
544
+ " self.available_models['GPT-3.5 Turbo'] = {\n",
545
+ " 'provider': 'openai',\n",
546
+ " 'model': 'gpt-3.5-turbo'\n",
547
+ " }\n",
548
+ " print(f\"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}\")\n",
549
+ " except Exception as e:\n",
550
+ " print(f\"Error initializing OpenAI: {e}\")\n",
551
+ " else:\n",
552
+ " print(\"OpenAI API Key not set or library not installed\")\n",
553
+ " \n",
554
+ " # Deepseek (uses OpenAI-compatible API)\n",
555
+ " DEEPSEEK_API_KEY = os.getenv(\"DEEPSEEK_API_KEY\")\n",
556
+ " if DEEPSEEK_API_KEY and OpenAI:\n",
557
+ " try:\n",
558
+ " self.clients['deepseek'] = OpenAI(\n",
559
+ " api_key=DEEPSEEK_API_KEY,\n",
560
+ " base_url=\"https://api.deepseek.com\"\n",
561
+ " )\n",
562
+ " self.available_models['Deepseek Chat'] = {\n",
563
+ " 'provider': 'deepseek',\n",
564
+ " 'model': 'deepseek-chat'\n",
565
+ " }\n",
566
+ " print(f\"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}\")\n",
567
+ " except Exception as e:\n",
568
+ " print(f\"Error initializing Deepseek: {e}\")\n",
569
+ " else:\n",
570
+ " print(\"Deepseek API Key not set or OpenAI library not installed\")\n",
571
+ " \n",
572
+ " # Groq\n",
573
+ " GROQ_API_KEY = os.getenv(\"GROQ_API_KEY\")\n",
574
+ " if GROQ_API_KEY and Groq:\n",
575
+ " try:\n",
576
+ " self.clients['groq'] = Groq(api_key=GROQ_API_KEY)\n",
577
+ " self.available_models['Llama 3.3 70B'] = {\n",
578
+ " 'provider': 'groq',\n",
579
+ " 'model': 'llama-3.3-70b-versatile'\n",
580
+ " }\n",
581
+ " self.available_models['Mixtral 8x7B'] = {\n",
582
+ " 'provider': 'groq',\n",
583
+ " 'model': 'mixtral-8x7b-32768'\n",
584
+ " }\n",
585
+ " print(f\"Groq API Key exists and begins {GROQ_API_KEY[:4]}\")\n",
586
+ " except Exception as e:\n",
587
+ " print(f\"Error initializing Groq: {e}\")\n",
588
+ " else:\n",
589
+ " print(\"Groq API Key not set or library not installed\")\n",
590
+ " \n",
591
+ " # Google Gemini\n",
592
+ " GOOGLE_API_KEY = os.getenv(\"GOOGLE_API_KEY\")\n",
593
+ " if GOOGLE_API_KEY and genai:\n",
594
+ " try:\n",
595
+ " genai.configure(api_key=GOOGLE_API_KEY)\n",
596
+ " self.clients['google'] = genai\n",
597
+ " self.available_models['Gemini 1.5 Flash'] = {\n",
598
+ " 'provider': 'google',\n",
599
+ " 'model': 'gemini-1.5-flash'\n",
600
+ " }\n",
601
+ " self.available_models['Gemini 1.5 Pro'] = {\n",
602
+ " 'provider': 'google',\n",
603
+ " 'model': 'gemini-1.5-pro'\n",
604
+ " }\n",
605
+ " print(f\"Google API Key exists and begins {GOOGLE_API_KEY[:2]}\")\n",
606
+ " except Exception as e:\n",
607
+ " print(f\"Error initializing Google Gemini: {e}\")\n",
608
+ " else:\n",
609
+ " print(\"Google API Key not set or library not installed\")\n",
610
+ " \n",
611
+ " # Set default model\n",
612
+ " if self.available_models:\n",
613
+ " self.current_model = list(self.available_models.keys())[0]\n",
614
+ " \n",
615
+ " def get_available_models(self):\n",
616
+ " \"\"\"Return list of available model names\"\"\"\n",
617
+ " return list(self.available_models.keys())\n",
618
+ " \n",
619
+ " def set_model(self, model_name):\n",
620
+ " \"\"\"Set the current model\"\"\"\n",
621
+ " if model_name in self.available_models:\n",
622
+ " self.current_model = model_name\n",
623
+ " return True\n",
624
+ " return False\n",
625
+ " \n",
626
+ " def generate_text(self, prompt, max_tokens=1000):\n",
627
+ " \"\"\"Generate text using the current model\"\"\"\n",
628
+ " if not self.current_model or self.current_model not in self.available_models:\n",
629
+ " return None\n",
630
+ " \n",
631
+ " model_info = self.available_models[self.current_model]\n",
632
+ " provider = model_info['provider']\n",
633
+ " model = model_info['model']\n",
634
+ " \n",
635
+ " try:\n",
636
+ " if provider == 'anthropic':\n",
637
+ " client = self.clients['anthropic']\n",
638
+ " response = client.messages.create(\n",
639
+ " model=model,\n",
640
+ " max_tokens=max_tokens,\n",
641
+ " messages=[{\"role\": \"user\", \"content\": prompt}]\n",
642
+ " )\n",
643
+ " return response.content[0].text\n",
644
+ " \n",
645
+ " elif provider in ['openai', 'deepseek']:\n",
646
+ " client = self.clients[provider]\n",
647
+ " response = client.chat.completions.create(\n",
648
+ " model=model,\n",
649
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
650
+ " max_tokens=max_tokens\n",
651
+ " )\n",
652
+ " return response.choices[0].message.content\n",
653
+ " \n",
654
+ " elif provider == 'groq':\n",
655
+ " client = self.clients['groq']\n",
656
+ " response = client.chat.completions.create(\n",
657
+ " model=model,\n",
658
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
659
+ " max_tokens=max_tokens\n",
660
+ " )\n",
661
+ " return response.choices[0].message.content\n",
662
+ " \n",
663
+ " elif provider == 'google':\n",
664
+ " model_obj = genai.GenerativeModel(model)\n",
665
+ " response = model_obj.generate_content(prompt)\n",
666
+ " return response.text\n",
667
+ " \n",
668
+ " except Exception as e:\n",
669
+ " print(f\"Error generating text with {self.current_model}: {e}\")\n",
670
+ " return None"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "execution_count": 7,
676
+ "id": "809f4c47-6ea8-4eaa-bac1-5ca83daac733",
677
+ "metadata": {},
678
+ "outputs": [
679
+ {
680
+ "name": "stdout",
681
+ "output_type": "stream",
682
+ "text": [
683
+ "Anthropic API Key exists and begins sk-a\n",
684
+ "OpenAI API Key exists and begins sk-proj\n",
685
+ "Deepseek API Key exists and begins sk-1099\n",
686
+ "Groq API Key exists and begins gsk_\n",
687
+ "Google API Key exists and begins AI\n"
688
+ ]
689
+ }
690
+ ],
691
+ "source": [
692
+ "# Initialize the model manager globally\n",
693
+ "model_manager = AIModelManager()"
694
+ ]
695
+ },
696
+ {
697
+ "cell_type": "code",
698
+ "execution_count": 8,
699
+ "id": "ad5f99f2-efd9-4759-88dc-df7f2f5359fb",
700
+ "metadata": {},
701
+ "outputs": [],
702
+ "source": [
703
+ "# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====\n",
704
+ "\n",
705
+ "class EnhancedTextAnalyzer:\n",
706
+ " \"\"\"Main analysis engine with all enhanced features and multi-model support\"\"\"\n",
707
+ " \n",
708
+ " def __init__(self, model_manager=None):\n",
709
+ " self.model_manager = model_manager\n",
710
+ " self.column_detector = SmartColumnDetector()\n",
711
+ " self.text_processor = EnhancedTextProcessor()\n",
712
+ " self.search_engine = TextSearchEngine()\n",
713
+ " self.original_df = None\n",
714
+ " self.processed_df = None\n",
715
+ " self.results = {}\n",
716
+ " self.visualizations = {}\n",
717
+ " \n",
718
+ " def load_file(self, file):\n",
719
+ " \"\"\"Load data from various file formats\"\"\"\n",
720
+ " try:\n",
721
+ " if file.name.endswith('.csv'):\n",
722
+ " df = pd.read_csv(file.name)\n",
723
+ " elif file.name.endswith(('.xlsx', '.xls')):\n",
724
+ " df = pd.read_excel(file.name)\n",
725
+ " elif file.name.endswith('.json'):\n",
726
+ " df = pd.read_json(file.name)\n",
727
+ " else:\n",
728
+ " return None, \"Unsupported file format\"\n",
729
+ " \n",
730
+ " return df, f\"File loaded: {len(df)} records\"\n",
731
+ " except Exception as e:\n",
732
+ " return None, f\"Error loading file: {str(e)}\"\n",
733
+ " \n",
734
+ " def process_data(self, df):\n",
735
+ " \"\"\"Process data with smart extraction and analysis\"\"\"\n",
736
+ " # Extract relevant columns\n",
737
+ " extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)\n",
738
+ " \n",
739
+ " # Store for reference\n",
740
+ " self.processed_df = extracted_df\n",
741
+ " \n",
742
+ " # Clear original from memory\n",
743
+ " del df\n",
744
+ " gc.collect()\n",
745
+ " \n",
746
+ " # Add analysis columns\n",
747
+ " if 'combined_text' in extracted_df.columns:\n",
748
+ " # Sentiment analysis\n",
749
+ " sentiments = []\n",
750
+ " polarities = []\n",
751
+ " topics_1 = []\n",
752
+ " topics_2 = []\n",
753
+ " topics_3 = []\n",
754
+ " insights = []\n",
755
+ " \n",
756
+ " for text in extracted_df['combined_text']:\n",
757
+ " # Sentiment\n",
758
+ " blob = TextBlob(text)\n",
759
+ " polarity = blob.sentiment.polarity\n",
760
+ " if polarity > 0.1:\n",
761
+ " sentiment = 'Positive'\n",
762
+ " elif polarity < -0.1:\n",
763
+ " sentiment = 'Negative'\n",
764
+ " else:\n",
765
+ " sentiment = 'Neutral'\n",
766
+ " \n",
767
+ " sentiments.append(sentiment)\n",
768
+ " polarities.append(polarity)\n",
769
+ " \n",
770
+ " # Extract specific topics (3 separate topics)\n",
771
+ " specific_topics = self.text_processor.extract_specific_topics(text)\n",
772
+ " topics_1.append(specific_topics[0])\n",
773
+ " topics_2.append(specific_topics[1])\n",
774
+ " topics_3.append(specific_topics[2])\n",
775
+ " \n",
776
+ " # Actionable insights using dictionary matching\n",
777
+ " insight = self.text_processor.extract_actionable_insights(text)\n",
778
+ " insights.append(insight)\n",
779
+ " \n",
780
+ " extracted_df['sentiment'] = sentiments\n",
781
+ " extracted_df['sentiment_score'] = polarities\n",
782
+ " extracted_df['topic_1'] = topics_1\n",
783
+ " extracted_df['topic_2'] = topics_2\n",
784
+ " extracted_df['topic_3'] = topics_3\n",
785
+ " extracted_df['actionable_insights'] = insights\n",
786
+ " \n",
787
+ " # Build search index with enhanced search capabilities\n",
788
+ " self.search_engine.build_index(extracted_df, 'combined_text')\n",
789
+ " \n",
790
+ " # Save processed data\n",
791
+ " output_file = 'processed_data.xlsx'\n",
792
+ " extracted_df.to_excel(output_file, index=False)\n",
793
+ " \n",
794
+ " return extracted_df, detected_columns, output_file\n",
795
+ " \n",
796
+ " def generate_ai_insights(self, df, num_samples=5):\n",
797
+ " \"\"\"Generate AI-powered insights using selected model\"\"\"\n",
798
+ " if not self.model_manager or not self.model_manager.current_model:\n",
799
+ " return \"No AI model available for generating insights\"\n",
800
+ " \n",
801
+ " if 'combined_text' not in df.columns or df.empty:\n",
802
+ " return \"No text data available for AI analysis\"\n",
803
+ " \n",
804
+ " # Sample some texts for analysis\n",
805
+ " sample_texts = df['combined_text'].dropna().head(num_samples).tolist()\n",
806
+ " if not sample_texts:\n",
807
+ " return \"No valid text samples found\"\n",
808
+ " \n",
809
+ " # Create prompt for AI analysis\n",
810
+ " prompt = f\"\"\"Analyze the following customer feedback samples and provide key insights:\n",
811
+ "\n",
812
+ "Samples:\n",
813
+ "{chr(10).join([f\"{i+1}. {text[:200]}...\" if len(text) > 200 else f\"{i+1}. {text}\" for i, text in enumerate(sample_texts)])}\n",
814
+ "\n",
815
+ "Please provide:\n",
816
+ "1. Main themes and patterns\n",
817
+ "2. Key sentiment indicators\n",
818
+ "3. Actionable recommendations\n",
819
+ "4. Areas of concern\n",
820
+ "\n",
821
+ "Keep the response concise and focused on actionable insights.\"\"\"\n",
822
+ "\n",
823
+ " # Generate insights using selected model\n",
824
+ " try:\n",
825
+ " response = self.model_manager.generate_text(prompt, max_tokens=500)\n",
826
+ " if response:\n",
827
+ " return f\"**AI Insights (using {self.model_manager.current_model}):**\\n\\n{response}\"\n",
828
+ " else:\n",
829
+ " return \"Failed to generate AI insights. Please check your API configuration.\"\n",
830
+ " except Exception as e:\n",
831
+ " return f\"Error generating AI insights: {str(e)}\"\n",
832
+ " \n",
833
+ " def generate_visualizations(self, df):\n",
834
+ " \"\"\"Generate various visualizations\"\"\"\n",
835
+ " visualizations = {}\n",
836
+ " \n",
837
+ " if 'sentiment' in df.columns:\n",
838
+ " # Sentiment distribution\n",
839
+ " sentiment_counts = df['sentiment'].value_counts()\n",
840
+ " fig_sentiment = px.pie(\n",
841
+ " values=sentiment_counts.values,\n",
842
+ " names=sentiment_counts.index,\n",
843
+ " title=\"Sentiment Distribution\",\n",
844
+ " color_discrete_map={\n",
845
+ " 'Positive': '#27AE60',\n",
846
+ " 'Negative': '#E74C3C',\n",
847
+ " 'Neutral': '#95A5A6'\n",
848
+ " }\n",
849
+ " )\n",
850
+ " visualizations['Sentiment Distribution'] = fig_sentiment\n",
851
+ " \n",
852
+ " if 'topic_1' in df.columns:\n",
853
+ " # Combine all topics for overall topic distribution\n",
854
+ " all_topics = []\n",
855
+ " for col in ['topic_1', 'topic_2', 'topic_3']:\n",
856
+ " if col in df.columns:\n",
857
+ " topics = df[col].dropna().tolist()\n",
858
+ " all_topics.extend([t for t in topics if t != ''])\n",
859
+ " \n",
860
+ " if all_topics:\n",
861
+ " topic_counts = Counter(all_topics)\n",
862
+ " top_topics = dict(topic_counts.most_common(15))\n",
863
+ " \n",
864
+ " fig_topics = px.bar(\n",
865
+ " x=list(top_topics.values()),\n",
866
+ " y=list(top_topics.keys()),\n",
867
+ " orientation='h',\n",
868
+ " title=\"Top 15 Specific Topics\",\n",
869
+ " labels={'x': 'Count', 'y': 'Topic'}\n",
870
+ " )\n",
871
+ " visualizations['Topic Distribution'] = fig_topics\n",
872
+ " \n",
873
+ " if 'sentiment' in df.columns and 'topic_1' in df.columns:\n",
874
+ " # Sentiment by primary topic (topic_1)\n",
875
+ " df_temp = df[df['topic_1'] != ''].copy()\n",
876
+ " if not df_temp.empty:\n",
877
+ " # Get top 10 topics for cleaner visualization\n",
878
+ " top_topics = df_temp['topic_1'].value_counts().head(10).index\n",
879
+ " df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]\n",
880
+ " \n",
881
+ " pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])\n",
882
+ " fig_heatmap = px.imshow(\n",
883
+ " pivot_table,\n",
884
+ " labels=dict(x=\"Sentiment\", y=\"Primary Topic\", color=\"Count\"),\n",
885
+ " title=\"Sentiment by Primary Topic Heatmap\",\n",
886
+ " color_continuous_scale=\"RdYlGn\"\n",
887
+ " )\n",
888
+ " visualizations['Sentiment by Topic'] = fig_heatmap\n",
889
+ " \n",
890
+ " if 'date' in df.columns and 'sentiment' in df.columns:\n",
891
+ " # Sentiment over time\n",
892
+ " df_time = df.copy()\n",
893
+ " df_time['date'] = pd.to_datetime(df_time['date'])\n",
894
+ " time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')\n",
895
+ " \n",
896
+ " fig_timeline = px.line(\n",
897
+ " time_data,\n",
898
+ " x='date',\n",
899
+ " y='count',\n",
900
+ " color='sentiment',\n",
901
+ " title=\"Sentiment Trends Over Time\",\n",
902
+ " color_discrete_map={\n",
903
+ " 'Positive': '#27AE60',\n",
904
+ " 'Negative': '#E74C3C',\n",
905
+ " 'Neutral': '#95A5A6'\n",
906
+ " }\n",
907
+ " )\n",
908
+ " visualizations['Sentiment Timeline'] = fig_timeline\n",
909
+ " \n",
910
+ " if 'actionable_insights' in df.columns:\n",
911
+ " # Top actionable insights\n",
912
+ " all_insights = []\n",
913
+ " for insight in df['actionable_insights']:\n",
914
+ " if insight and insight != \"\":\n",
915
+ " # Split by comma as we're now using comma-separated insights\n",
916
+ " all_insights.extend([i.strip() for i in insight.split(',')])\n",
917
+ " \n",
918
+ " if all_insights:\n",
919
+ " insight_counts = Counter(all_insights)\n",
920
+ " top_insights = dict(insight_counts.most_common(10))\n",
921
+ " \n",
922
+ " fig_insights = px.bar(\n",
923
+ " x=list(top_insights.values()),\n",
924
+ " y=list(top_insights.keys()),\n",
925
+ " orientation='h',\n",
926
+ " title=\"Top 10 Actionable Insights\",\n",
927
+ " labels={'x': 'Frequency', 'y': 'Insight'}\n",
928
+ " )\n",
929
+ " visualizations['Top Insights'] = fig_insights\n",
930
+ " \n",
931
+ " return visualizations"
932
+ ]
933
+ },
934
+ {
935
+ "cell_type": "code",
936
+ "execution_count": 9,
937
+ "id": "5ee86a52-b195-4010-a2b7-3abf57bf9949",
938
+ "metadata": {},
939
+ "outputs": [],
940
+ "source": [
941
+ "# ===== GRADIO INTERFACE =====\n",
942
+ "# Global variables\n",
943
+ "analyzer = None\n",
944
+ "current_data = None\n",
945
+ "current_visualizations = None\n",
946
+ "\n",
947
+ "def update_model(model_name):\n",
948
+ " \"\"\"Update the selected AI model\"\"\"\n",
949
+ " global model_manager\n",
950
+ " \n",
951
+ " if model_manager.set_model(model_name):\n",
952
+ " return f\"βœ… Model switched to: {model_name}\"\n",
953
+ " else:\n",
954
+ " return f\"❌ Failed to switch to: {model_name}\"\n",
955
+ "\n",
956
+ "def process_file(file, model_name):\n",
957
+ " \"\"\"Process uploaded file with selected model\"\"\"\n",
958
+ " global analyzer, current_data, current_visualizations, model_manager\n",
959
+ " \n",
960
+ " if file is None:\n",
961
+ " return \"Please upload a file\", None, None, None, None, None, gr.update(choices=[])\n",
962
+ " \n",
963
+ " try:\n",
964
+ " # Update model if changed\n",
965
+ " if model_name and model_manager:\n",
966
+ " model_manager.set_model(model_name)\n",
967
+ " \n",
968
+ " analyzer = EnhancedTextAnalyzer(model_manager)\n",
969
+ " \n",
970
+ " # Load file\n",
971
+ " df, message = analyzer.load_file(file)\n",
972
+ " if df is None:\n",
973
+ " return message, None, None, None, None, None, gr.update(choices=[])\n",
974
+ " \n",
975
+ " # Process data\n",
976
+ " processed_df, detected_cols, output_file = analyzer.process_data(df)\n",
977
+ " current_data = processed_df\n",
978
+ " \n",
979
+ " # Generate visualizations\n",
980
+ " visualizations = analyzer.generate_visualizations(processed_df)\n",
981
+ " current_visualizations = visualizations\n",
982
+ " \n",
983
+ " # Generate AI insights\n",
984
+ " ai_insights = analyzer.generate_ai_insights(processed_df)\n",
985
+ " \n",
986
+ " # Create summary - safely handle detected columns\n",
987
+ " text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []\n",
988
+ " id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []\n",
989
+ " product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []\n",
990
+ " \n",
991
+ " summary = f\"\"\"\n",
992
+ " ### βœ… File Processing Complete!\n",
993
+ " \n",
994
+ " **Detected Columns:**\n",
995
+ " - Text Columns: {', '.join(text_cols) if text_cols else 'None'}\n",
996
+ " - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}\n",
997
+ " - Product Columns: {', '.join(product_cols) if product_cols else 'None'}\n",
998
+ " \n",
999
+ " **Analysis Results:**\n",
1000
+ " - Total Records: {len(processed_df)}\n",
1001
+ " - Processed File Saved: {output_file}\n",
1002
+ " - AI Model Used: {model_manager.current_model if model_manager else 'None'}\n",
1003
+ " \"\"\"\n",
1004
+ " \n",
1005
+ " # Data preview\n",
1006
+ " preview = processed_df.head(10)\n",
1007
+ " \n",
1008
+ " # Get first visualization\n",
1009
+ " first_viz = list(visualizations.values())[0] if visualizations else None\n",
1010
+ " \n",
1011
+ " return (\n",
1012
+ " summary,\n",
1013
+ " preview,\n",
1014
+ " output_file,\n",
1015
+ " ai_insights,\n",
1016
+ " first_viz,\n",
1017
+ " \"Ready for search\",\n",
1018
+ " gr.update(choices=list(visualizations.keys()))\n",
1019
+ " )\n",
1020
+ " \n",
1021
+ " except Exception as e:\n",
1022
+ " return f\"Error: {str(e)}\", None, None, None, None, None, gr.update(choices=[])\n",
1023
+ "\n",
1024
+ "def search_data(query):\n",
1025
+ " \"\"\"Search through the data with enhanced semantic search\"\"\"\n",
1026
+ " global analyzer, current_data\n",
1027
+ " \n",
1028
+ " if analyzer is None or current_data is None:\n",
1029
+ " return \"Please process a file first\", None, None\n",
1030
+ " \n",
1031
+ " if not query:\n",
1032
+ " return \"Please enter a search query\", None, None\n",
1033
+ " \n",
1034
+ " try:\n",
1035
+ " results = analyzer.search_engine.search(query, top_k=10)\n",
1036
+ " \n",
1037
+ " if results.empty:\n",
1038
+ " return \"No results found\", None, None\n",
1039
+ " \n",
1040
+ " # Select relevant columns for display (updated to include new topic columns)\n",
1041
+ " display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']\n",
1042
+ " display_cols = [col for col in display_cols if col in results.columns]\n",
1043
+ " \n",
1044
+ " results_display = results[display_cols]\n",
1045
+ " \n",
1046
+ " # Save search results\n",
1047
+ " search_output = f\"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx\"\n",
1048
+ " results_display.to_excel(search_output, index=False)\n",
1049
+ " \n",
1050
+ " return f\"Found {len(results)} results\", results_display.head(10), search_output\n",
1051
+ " \n",
1052
+ " except Exception as e:\n",
1053
+ " return f\"Search error: {str(e)}\", None, None\n",
1054
+ "\n",
1055
+ "def update_visualization(viz_type):\n",
1056
+ " \"\"\"Update displayed visualization\"\"\"\n",
1057
+ " global current_visualizations\n",
1058
+ " \n",
1059
+ " if current_visualizations and viz_type in current_visualizations:\n",
1060
+ " return current_visualizations[viz_type]\n",
1061
+ " return None\n",
1062
+ "\n",
1063
+ "def export_results(format_type):\n",
1064
+ " \"\"\"Export processed data in different formats\"\"\"\n",
1065
+ " global current_data\n",
1066
+ " \n",
1067
+ " if current_data is None:\n",
1068
+ " return \"No data to export\", None\n",
1069
+ " \n",
1070
+ " try:\n",
1071
+ " timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')\n",
1072
+ " \n",
1073
+ " if format_type == \"Excel\":\n",
1074
+ " output_file = f\"analysis_results_{timestamp}.xlsx\"\n",
1075
+ " current_data.to_excel(output_file, index=False)\n",
1076
+ " else: # CSV\n",
1077
+ " output_file = f\"analysis_results_{timestamp}.csv\"\n",
1078
+ " current_data.to_csv(output_file, index=False)\n",
1079
+ " \n",
1080
+ " return f\"Data exported to {output_file}\", output_file\n",
1081
+ " \n",
1082
+ " except Exception as e:\n",
1083
+ " return f\"Export error: {str(e)}\", None"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "code",
1088
+ "execution_count": 10,
1089
+ "id": "38bf0375-9ef8-488c-821f-288c4f59ff5d",
1090
+ "metadata": {},
1091
+ "outputs": [],
1092
+ "source": [
1093
+ "# Create Gradio interface\n",
1094
+ "def create_interface():\n",
1095
+ " \"\"\"Create the Gradio interface with model selection\"\"\"\n",
1096
+ " \n",
1097
+ " with gr.Blocks(theme=gr.themes.Soft()) as app:\n",
1098
+ " gr.Markdown(\n",
1099
+ " \"\"\"\n",
1100
+ " # πŸ“Š Enhanced Text Analytics AI Agent\n",
1101
+ " ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models\n",
1102
+ " \n",
1103
+ " **Features:**\n",
1104
+ " - πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)\n",
1105
+ " - πŸ” Automatic detection of text, ID, and product columns\n",
1106
+ " - πŸ’Ύ Memory-efficient processing with automatic file cleanup\n",
1107
+ " - 😊 Sentiment analysis with scoring\n",
1108
+ " - 🎯 Topic/theme extraction\n",
1109
+ " - πŸ’‘ Actionable insights generation\n",
1110
+ " - πŸ”Ž Advanced text search with similarity scoring\n",
1111
+ " - πŸ“ˆ Multiple visualization options\n",
1112
+ " - πŸ“₯ Export results in Excel or CSV format\n",
1113
+ " \"\"\"\n",
1114
+ " )\n",
1115
+ " \n",
1116
+ " with gr.Tab(\"πŸ“€ Upload & Process\"):\n",
1117
+ " with gr.Row():\n",
1118
+ " with gr.Column(scale=1):\n",
1119
+ " # Model selection dropdown\n",
1120
+ " model_dropdown = gr.Dropdown(\n",
1121
+ " label=\"πŸ€– Select AI Model\",\n",
1122
+ " choices=model_manager.get_available_models(),\n",
1123
+ " value=model_manager.current_model if model_manager.current_model else None,\n",
1124
+ " interactive=True\n",
1125
+ " )\n",
1126
+ " \n",
1127
+ " file_upload = gr.File(\n",
1128
+ " label=\"Upload Data File\",\n",
1129
+ " file_types=[\".csv\", \".xlsx\", \".xls\", \".json\"]\n",
1130
+ " )\n",
1131
+ " process_btn = gr.Button(\"πŸš€ Process File\", variant=\"primary\")\n",
1132
+ " \n",
1133
+ " with gr.Column(scale=2):\n",
1134
+ " status_output = gr.Markdown(label=\"Processing Status\")\n",
1135
+ " ai_insights = gr.Markdown(label=\"AI-Generated Insights\")\n",
1136
+ " \n",
1137
+ " with gr.Row():\n",
1138
+ " data_preview = gr.Dataframe(\n",
1139
+ " label=\"Data Preview (First 10 rows)\",\n",
1140
+ " interactive=False\n",
1141
+ " )\n",
1142
+ " \n",
1143
+ " processed_file = gr.File(\n",
1144
+ " label=\"πŸ“ Processed Data File\",\n",
1145
+ " interactive=False\n",
1146
+ " )\n",
1147
+ " \n",
1148
+ " with gr.Tab(\"πŸ” Search\"):\n",
1149
+ " gr.Markdown(\"### Search through your text data\")\n",
1150
+ " \n",
1151
+ " with gr.Row():\n",
1152
+ " search_input = gr.Textbox(\n",
1153
+ " label=\"Enter search query\",\n",
1154
+ " placeholder=\"Type keywords to search...\"\n",
1155
+ " )\n",
1156
+ " search_btn = gr.Button(\"πŸ”Ž Search\", variant=\"primary\")\n",
1157
+ " \n",
1158
+ " search_status = gr.Markdown(label=\"Search Status\")\n",
1159
+ " search_results = gr.Dataframe(\n",
1160
+ " label=\"Search Results\",\n",
1161
+ " interactive=False\n",
1162
+ " )\n",
1163
+ " search_file = gr.File(\n",
1164
+ " label=\"πŸ“₯ Download Search Results\",\n",
1165
+ " interactive=False\n",
1166
+ " )\n",
1167
+ " \n",
1168
+ " with gr.Tab(\"πŸ“ˆ Visualizations\"):\n",
1169
+ " with gr.Row():\n",
1170
+ " viz_selector = gr.Dropdown(\n",
1171
+ " label=\"Select Visualization\",\n",
1172
+ " choices=[],\n",
1173
+ " interactive=True\n",
1174
+ " )\n",
1175
+ " \n",
1176
+ " viz_plot = gr.Plot(label=\"Visualization\")\n",
1177
+ " \n",
1178
+ " with gr.Tab(\"πŸ“₯ Export\"):\n",
1179
+ " gr.Markdown(\"### Export your analyzed data\")\n",
1180
+ " \n",
1181
+ " with gr.Row():\n",
1182
+ " export_format = gr.Radio(\n",
1183
+ " choices=[\"Excel\", \"CSV\"],\n",
1184
+ " value=\"Excel\",\n",
1185
+ " label=\"Export Format\"\n",
1186
+ " )\n",
1187
+ " export_btn = gr.Button(\"πŸ“₯ Export Data\", variant=\"primary\")\n",
1188
+ " \n",
1189
+ " export_status = gr.Markdown(label=\"Export Status\")\n",
1190
+ " export_file = gr.File(\n",
1191
+ " label=\"πŸ“ Download Exported File\",\n",
1192
+ " interactive=False\n",
1193
+ " )\n",
1194
+ " \n",
1195
+ " # Event handlers\n",
1196
+ " model_dropdown.change(\n",
1197
+ " fn=update_model,\n",
1198
+ " inputs=[model_dropdown],\n",
1199
+ " outputs=[status_output]\n",
1200
+ " )\n",
1201
+ " \n",
1202
+ " process_btn.click(\n",
1203
+ " fn=process_file,\n",
1204
+ " inputs=[file_upload, model_dropdown],\n",
1205
+ " outputs=[\n",
1206
+ " status_output,\n",
1207
+ " data_preview,\n",
1208
+ " processed_file,\n",
1209
+ " ai_insights,\n",
1210
+ " viz_plot,\n",
1211
+ " search_status,\n",
1212
+ " viz_selector\n",
1213
+ " ]\n",
1214
+ " )\n",
1215
+ " \n",
1216
+ " search_btn.click(\n",
1217
+ " fn=search_data,\n",
1218
+ " inputs=[search_input],\n",
1219
+ " outputs=[search_status, search_results, search_file]\n",
1220
+ " )\n",
1221
+ " \n",
1222
+ " viz_selector.change(\n",
1223
+ " fn=update_visualization,\n",
1224
+ " inputs=[viz_selector],\n",
1225
+ " outputs=[viz_plot]\n",
1226
+ " )\n",
1227
+ " \n",
1228
+ " export_btn.click(\n",
1229
+ " fn=export_results,\n",
1230
+ " inputs=[export_format],\n",
1231
+ " outputs=[export_status, export_file]\n",
1232
+ " )\n",
1233
+ " \n",
1234
+ " return app"
1235
+ ]
1236
+ },
1237
+ {
1238
+ "cell_type": "code",
1239
+ "execution_count": 11,
1240
+ "id": "6c5a0767-a788-43a8-911c-04e81814f4c4",
1241
+ "metadata": {},
1242
+ "outputs": [
1243
+ {
1244
+ "name": "stdout",
1245
+ "output_type": "stream",
1246
+ "text": [
1247
+ "* Running on local URL: http://127.0.0.1:7861\n",
1248
+ "* Running on public URL: https://8190830de481785995.gradio.live\n",
1249
+ "\n",
1250
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "data": {
1255
+ "text/html": [
1256
+ "<div><iframe src=\"https://8190830de481785995.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
1257
+ ],
1258
+ "text/plain": [
1259
+ "<IPython.core.display.HTML object>"
1260
+ ]
1261
+ },
1262
+ "metadata": {},
1263
+ "output_type": "display_data"
1264
+ },
1265
+ {
1266
+ "name": "stdout",
1267
+ "output_type": "stream",
1268
+ "text": [
1269
+ "Keyboard interruption in main thread... closing server.\n",
1270
+ "Killing tunnel 127.0.0.1:7861 <> https://8190830de481785995.gradio.live\n"
1271
+ ]
1272
+ }
1273
+ ],
1274
+ "source": [
1275
+ "# Launch the application\n",
1276
+ "if __name__ == \"__main__\":\n",
1277
+ " app = create_interface()\n",
1278
+ " app.launch(share=True, debug=True)"
1279
+ ]
1280
+ },
1281
+ {
1282
+ "cell_type": "code",
1283
+ "execution_count": 12,
1284
+ "id": "4f382d04-cee3-40ea-9687-5f2dff2282f7",
1285
+ "metadata": {},
1286
+ "outputs": [
1287
+ {
1288
+ "ename": "SyntaxError",
1289
+ "evalue": "invalid syntax (2621292756.py, line 1)",
1290
+ "output_type": "error",
1291
+ "traceback": [
1292
+ "\u001b[0;36m Cell \u001b[0;32mIn[12], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m python -m textblob.download_corpora\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
1293
+ ]
1294
+ }
1295
+ ],
1296
+ "source": [
1297
+ "python -m textblob.download_corpora"
1298
+ ]
1299
+ },
1300
+ {
1301
+ "cell_type": "code",
1302
+ "execution_count": null,
1303
+ "id": "63afdaca-562b-4846-8fb2-c699f7ab6615",
1304
+ "metadata": {},
1305
+ "outputs": [],
1306
+ "source": []
1307
+ },
1308
+ {
1309
+ "cell_type": "code",
1310
+ "execution_count": null,
1311
+ "id": "d82bb0bb-053e-4c29-af8b-b732dfcb47ad",
1312
+ "metadata": {},
1313
+ "outputs": [],
1314
+ "source": []
1315
+ },
1316
+ {
1317
+ "cell_type": "code",
1318
+ "execution_count": null,
1319
+ "id": "12da3957-a063-48f8-8916-e552cc317280",
1320
+ "metadata": {},
1321
+ "outputs": [],
1322
+ "source": []
1323
+ }
1324
+ ],
1325
+ "metadata": {
1326
+ "kernelspec": {
1327
+ "display_name": "Python 3 (ipykernel)",
1328
+ "language": "python",
1329
+ "name": "python3"
1330
+ },
1331
+ "language_info": {
1332
+ "codemirror_mode": {
1333
+ "name": "ipython",
1334
+ "version": 3
1335
+ },
1336
+ "file_extension": ".py",
1337
+ "mimetype": "text/x-python",
1338
+ "name": "python",
1339
+ "nbconvert_exporter": "python",
1340
+ "pygments_lexer": "ipython3",
1341
+ "version": "3.13.5"
1342
+ }
1343
+ },
1344
+ "nbformat": 4,
1345
+ "nbformat_minor": 5
1346
+ }
app.py ADDED
@@ -0,0 +1,1271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===== MULTIMODAL TEXT ANALYTICS AI ASSISTANT =====
2
+ # This is a comprehensive text analytics system with multiple AI API integrations
3
+ # and smart column detection capabilities for customer feedback analysis
4
+
5
+ # ===== IMPORTS SECTION =====
6
+ # Core Python libraries for basic functionality
7
+ import os # Operating system interface for environment variables and file operations
8
+ import warnings # Python warnings control to suppress unnecessary warnings
9
+ warnings.filterwarnings('ignore') # Suppress all warnings to keep output clean
10
+
11
+ # Environment and API management
12
+ from dotenv import load_dotenv # Load environment variables from .env file for API keys
13
+ from anthropic import Anthropic # Anthropic's Claude AI API client
14
+
15
+ # Additional AI APIs - using try/except to handle missing dependencies gracefully
16
+ try:
17
+ from openai import OpenAI # OpenAI's GPT API client
18
+ except ImportError:
19
+ OpenAI = None # Set to None if not installed, will be checked later
20
+
21
+ try:
22
+ from groq import Groq # Groq's fast inference API client
23
+ except ImportError:
24
+ Groq = None # Set to None if not installed
25
+
26
+ try:
27
+ import google.generativeai as genai # Google's Gemini API client
28
+ except ImportError:
29
+ genai = None # Set to None if not installed
30
+
31
+ # Data processing and manipulation libraries
32
+ import pandas as pd # Primary data manipulation library for DataFrames
33
+ import numpy as np # Numerical computing library for array operations
34
+ from datetime import datetime, timedelta # Date and time handling utilities
35
+ import json # JSON data format handling
36
+ import gc # Garbage collection for memory management - important for large datasets
37
+
38
+ # Natural Language Processing libraries
39
+ import nltk # Natural Language Toolkit - comprehensive NLP library
40
+ from nltk.corpus import stopwords # Common words to filter out (the, and, or, etc.)
41
+ from nltk.tokenize import word_tokenize # Split text into individual words/tokens
42
+ from nltk.stem import WordNetLemmatizer # Reduce words to their root form (running -> run)
43
+ from textblob import TextBlob # Simple API for diving into common NLP tasks
44
+ import re # Regular expressions for text pattern matching and cleaning
45
+ from collections import Counter # Efficient counting of hashable objects
46
+
47
+ # Machine Learning libraries for text analysis
48
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # Convert text to numerical features
49
+ from sklearn.decomposition import LatentDirichletAllocation # Topic modeling algorithm
50
+ from sklearn.cluster import KMeans # Clustering algorithm for grouping similar texts
51
+ from sklearn.preprocessing import StandardScaler # Normalize numerical features
52
+ from sklearn.metrics.pairwise import cosine_similarity # Measure similarity between text vectors
53
+
54
+ # Visualization libraries for creating charts and graphs
55
+ import plotly.express as px # High-level plotting interface
56
+ import plotly.graph_objects as go # Low-level plotting interface for custom charts
57
+ from plotly.subplots import make_subplots # Create multiple charts in one figure
58
+ import matplotlib.pyplot as plt # Traditional plotting library
59
+ import seaborn as sns # Statistical data visualization built on matplotlib
60
+
61
+ # Web interface framework
62
+ import gradio as gr # Create web interfaces for machine learning models
63
+
64
+ # Download required NLTK data packages - these contain language models and corpora
65
+ nltk.download('punkt', quiet=True) # Sentence tokenizer models
66
+ nltk.download('punkt_tab', quiet=True) # New tokenizer format for latest NLTK versions
67
+ nltk.download('stopwords', quiet=True) # Lists of common words to filter out
68
+ nltk.download('wordnet', quiet=True) # Lexical database for lemmatization
69
+ nltk.download('averaged_perceptron_tagger', quiet=True) # Part-of-speech tagger
70
+ nltk.download('omw-1.4', quiet=True) # Open Multilingual Wordnet for lemmatizer
71
+ nltk.download('brown', quiet=True) # Brown corpus required for TextBlob
72
+
73
+ # Download TextBlob corpora for sentiment analysis
74
+ try:
75
+ from textblob import download_corpora # Import corpora downloader
76
+ download_corpora.main() # Download all required corpora
77
+ except:
78
+ # Alternative method if the above doesn't work - use subprocess
79
+ import subprocess # Execute shell commands from Python
80
+ import sys # System-specific parameters and functions
81
+ try:
82
+ # Run TextBlob download command as subprocess with timeout
83
+ subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
84
+ capture_output=True, text=True, timeout=30)
85
+ except:
86
+ # If download fails, print warning but continue execution
87
+ print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
88
+ print("Please run: python -m textblob.download_corpora")
89
+
90
+ # Load environment variables from .env file, override existing ones
91
+ load_dotenv(override=True)
92
+
93
+ # ===== SMART COLUMN DETECTOR CLASS =====
94
class SmartColumnDetector:
    """
    Heuristically categorize and extract the useful columns from an
    uploaded dataframe (feedback text, identifiers, product info, dates).

    Detection is name-based first (case-insensitive keyword substring match)
    with a content-analysis fallback for unmatched text columns.
    """

    def __init__(self):
        """Set up the keyword lists used for name-based column detection."""
        # Keywords for detecting text/feedback columns - the main content
        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
                              'response', 'opinion', 'message', 'notes', 'remarks']

        # Keywords for detecting ID/identifier columns - uniquely identify records
        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
                            'reference', 'index', 'uuid']

        # Keywords for detecting product/category columns - what's being reviewed
        self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
                                 'category', 'brand', 'name', 'sku']

        # Keywords for detecting date/time columns - when feedback was given
        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']

    def detect_column_types(self, df):
        """
        Classify every column of *df* into one of five buckets.

        Args:
            df: source pandas DataFrame.

        Returns:
            dict with keys 'text_columns', 'id_columns', 'product_columns',
            'date_columns', 'other_columns', each mapping to a list of
            column names. Columns that match no keyword and contain only
            nulls are left out of every bucket (same as before).
        """
        detected = {
            'text_columns': [],     # Columns containing feedback/comments
            'id_columns': [],       # Columns containing unique identifiers
            'product_columns': [],  # Columns describing products/categories
            'date_columns': [],     # Columns containing dates/timestamps
            'other_columns': []     # Everything else
        }

        for col in df.columns:
            col_lower = col.lower()  # case-insensitive keyword matching

            # Name-based detection, in priority order: text > id > product > date.
            if any(keyword in col_lower for keyword in self.text_keywords):
                detected['text_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.id_keywords):
                detected['id_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.product_keywords):
                detected['product_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.date_keywords):
                detected['date_columns'].append(col)
            else:
                # No keyword matched: inspect the actual values instead.
                sample = df[col].dropna().head(100)  # first 100 non-null values
                if len(sample) > 0:
                    if df[col].dtype == 'object':
                        avg_length = sample.astype(str).str.len().mean()
                        if avg_length > 50:
                            # Long strings are most likely free-text feedback.
                            detected['text_columns'].append(col)
                        elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
                            # Short, mostly-unique values look like identifiers.
                            detected['id_columns'].append(col)
                        else:
                            # Short repetitive text looks like a category/product.
                            detected['product_columns'].append(col)
                    else:
                        # Numeric/datetime/etc. with no keyword match.
                        detected['other_columns'].append(col)

        return detected

    def extract_relevant_data(self, df):
        """
        Build a slim analysis dataframe from *df*: a unique id, up to two
        product columns, a single combined text column, and an optional date.

        Args:
            df: source pandas DataFrame.

        Returns:
            (extracted_data, detected) — the reduced DataFrame and the
            column-classification dict from detect_column_types().
        """
        detected = self.detect_column_types(df)
        extracted_data = pd.DataFrame()

        # Unique identifier: reuse the first detected ID column, else 1..N.
        if detected['id_columns']:
            extracted_data['unique_id'] = df[detected['id_columns'][0]]
        else:
            extracted_data['unique_id'] = range(1, len(df) + 1)

        # Keep at most two product columns, prefixed so their purpose is clear.
        for col in list(detected['product_columns'])[:2]:
            extracted_data[f'product_{col}'] = df[col]

        # Merge every detected text column into one 'combined_text' field.
        text_cols = [c for c in detected['text_columns'] if c in df.columns]
        if text_cols and len(df) > 0:
            # Row-wise join of the non-null values. A single vectorized
            # apply() is faster than per-cell .loc lookups and, unlike
            # df.loc[idx, col], does not break when index labels repeat.
            extracted_data['combined_text'] = df[text_cols].apply(
                lambda row: ' '.join(str(v) for v in row if pd.notna(v)),
                axis=1
            ).tolist()
        else:
            # No text columns detected (or empty frame): keep an empty column.
            extracted_data['combined_text'] = [''] * len(df)

        # First detected date column, coerced to datetime (bad values -> NaT).
        if detected['date_columns']:
            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')

        return extracted_data, detected
221
+
222
+ # ===== ENHANCED TEXT PROCESSOR CLASS =====
223
class EnhancedTextProcessor:
    """
    Text cleaning plus rule-based extraction of topics and actionable
    improvement suggestions from customer feedback.
    """

    def __init__(self):
        """Prepare NLP helpers and the complaint-phrase -> action lookup table."""
        self.lemmatizer = WordNetLemmatizer()              # word root reduction
        self.stop_words = set(stopwords.words('english'))  # filler words to drop

        # Maps an improvement action to the complaint phrases that signal it.
        # Insertion order matters: matched actions are reported in this order.
        self.actionable_dictionary = {
            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
        }

    def clean_text(self, text):
        """Lowercase *text*, strip non-alphanumeric characters, collapse whitespace."""
        if pd.isna(text) or text == '':
            return ""

        lowered = str(text).lower()
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
        return ' '.join(alnum_only.split())

    def extract_actionable_insights(self, text):
        """Return up to three comma-separated improvement actions found in *text*."""
        if pd.isna(text) or text == '':
            return ""

        haystack = text.lower()
        # An action matches when any of its trigger phrases appears in the text.
        matched = [
            action
            for action, triggers in self.actionable_dictionary.items()
            if any(trigger in haystack for trigger in triggers)
        ]
        # Cap at three so the output stays readable.
        return ', '.join(matched[:3]) if matched else ""

    def extract_specific_topics(self, text):
        """
        Return exactly three topic strings for *text* (padded with '' when
        fewer are found), combining TextBlob noun phrases with frequent words.
        """
        if pd.isna(text) or text == '' or len(text) < 10:
            return ['', '', '']

        lowered = text.lower()
        tokens = word_tokenize(lowered)
        # Keep meaningful words only: not a stopword and longer than 3 chars.
        content_words = [t for t in tokens if t not in self.stop_words and len(t) > 3]

        # Short noun phrases (<= 3 words) make the best topic labels.
        phrases = TextBlob(text).noun_phrases
        topics = [p for p in phrases[:5] if len(p.split()) <= 3]

        # Top up with the most frequent content words when phrases fall short.
        if len(topics) < 3:
            for word, _ in Counter(content_words).most_common(5):
                if word not in str(topics):  # crude substring de-duplication
                    topics.append(word)
                    if len(topics) >= 3:
                        break

        # Normalize to exactly three entries.
        topics = topics[:3]
        topics += [''] * (3 - len(topics))
        return topics

    def determine_topic(self, text):
        """Backward-compatible wrapper: first specific topic, or 'General'."""
        primary = self.extract_specific_topics(text)[0]
        return primary or 'General'
342
+
343
+ # ===== SEARCH ENGINE CLASS =====
344
class TextSearchEngine:
    """
    TF-IDF based search over feedback text with lightweight synonym
    expansion for semantic matching.
    """

    def __init__(self):
        """Configure the vectorizer and synonym table; the index starts empty."""
        # TF-IDF vectorizer turning documents into sparse numeric vectors.
        self.vectorizer = TfidfVectorizer(
            max_features=1000,      # cap vocabulary at the 1000 strongest terms
            ngram_range=(1, 3),     # unigrams through trigrams
            stop_words='english',   # drop common English words
            use_idf=True,           # inverse-document-frequency weighting
            smooth_idf=True,        # smoothed IDF
            sublinear_tf=True       # sublinear term-frequency scaling
        )
        self.tfidf_matrix = None  # populated by build_index()
        self.data = None          # dataframe backing the index

        # Synonym table consulted in both directions by expand_query_with_synonyms().
        self.synonyms = {
            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
            'help': ['support', 'assistance', 'aid', 'service'],
            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
            'quality': ['standard', 'grade', 'condition', 'caliber'],
            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
            'hard': ['difficult', 'complex', 'complicated', 'challenging'],
            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
            'love': ['like', 'enjoy', 'appreciate', 'adore'],
            'hate': ['dislike', 'despise', 'detest'],
            'feature': ['function', 'capability', 'option', 'characteristic'],
            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
        }

    def expand_query_with_synonyms(self, query):
        """Return *query* augmented with synonyms (duplicates removed, order kept)."""
        expanded = []
        for token in query.lower().split():
            expanded.append(token)

            # Forward direction: the token is a head word with listed synonyms.
            if token in self.synonyms:
                expanded.extend(self.synonyms[token])

            # Reverse direction: the token appears as someone else's synonym.
            for head, alternatives in self.synonyms.items():
                if token in alternatives:
                    expanded.append(head)
                    expanded.extend(alt for alt in alternatives if alt != token)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return ' '.join(dict.fromkeys(expanded))

    def build_index(self, df, text_column):
        """Vectorize *text_column* (enriched with topic/insight words) into a TF-IDF index."""
        self.data = df.copy()
        documents = df[text_column].fillna('').tolist()

        # Fold topic labels into each document so topic words are searchable.
        if 'topic_1' in df.columns:
            documents = [
                f"{doc} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
                for i, doc in enumerate(documents)
            ]
        # Likewise for the extracted actionable insights.
        if 'actionable_insights' in df.columns:
            documents = [
                f"{doc} {df.iloc[i]['actionable_insights']}"
                for i, doc in enumerate(documents)
            ]

        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

    def search(self, query, top_k=10):
        """
        Return the *top_k* most relevant indexed rows for *query* with a
        'search_score' column, sorted by relevance; empty DataFrame when
        nothing matches or no index has been built.
        """
        if self.tfidf_matrix is None:
            return pd.DataFrame()

        expanded_query = self.expand_query_with_synonyms(query)

        # Score against both the literal and the synonym-expanded query,
        # weighting the literal query more heavily (70/30).
        literal_scores = cosine_similarity(
            self.vectorizer.transform([query]), self.tfidf_matrix).flatten()
        expanded_scores = cosine_similarity(
            self.vectorizer.transform([expanded_query]), self.tfidf_matrix).flatten()
        scores = 0.7 * literal_scores + 0.3 * expanded_scores

        # Best top_k candidates, highest score first.
        ranked = scores.argsort()[-top_k:][::-1]
        # Low acceptance threshold deliberately favours recall.
        keep = [i for i in ranked if scores[i] > 0.05]
        if not keep:
            return pd.DataFrame()

        hits = self.data.iloc[keep].copy()
        hits['search_score'] = [scores[i] for i in keep]

        # Exact-phrase appearances in the text earn a 50% score boost.
        needle = query.lower()
        if 'combined_text' in hits.columns:
            for row in hits.index:
                if needle in str(hits.at[row, 'combined_text']).lower():
                    hits.at[row, 'search_score'] *= 1.5

        return hits.sort_values('search_score', ascending=False)
483
+
484
+ # ===== AI MODEL MANAGER CLASS =====
485
class AIModelManager:
    """
    Unified front-end over multiple LLM provider APIs.

    Reads API keys from environment variables at construction time and
    registers one entry per usable model in ``available_models``; callers
    pick a model with set_model() and generate with generate_text().
    Providers wired in: Anthropic, OpenAI, Deepseek, Groq, Google Gemini.
    """

    def __init__(self):
        """Discover configured providers and pick a default model."""
        self.available_models = {}  # display name -> {'provider': ..., 'model': ...}
        self.clients = {}           # provider key -> initialized API client/module
        self.current_model = None   # display name of the active model (or None)
        self.initialize_apis()      # populate the dicts from environment variables

    def initialize_apis(self):
        """Initialize every provider whose API key is present in the environment.

        Each provider section is independent: a missing key, missing client
        library, or client-construction error only disables that provider.
        NOTE(review): the status prints below leak the first few characters
        of each API key to stdout — confirm this is acceptable for the
        deployment target.
        """

        # --- Anthropic Claude ---
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")  # key from environment
        if ANTHROPIC_API_KEY:
            try:
                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)  # create client
                # Register the Claude model under its display name.
                self.available_models['Claude 3 Haiku'] = {
                    'provider': 'anthropic',
                    'model': 'claude-3-haiku-20240307'
                }
                print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Anthropic: {e}")
        else:
            print("Anthropic API Key not set")

        # --- OpenAI ---
        # `OpenAI` is expected to be None when the library import failed upstream.
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if OPENAI_API_KEY and OpenAI:  # need both the key and the library
            try:
                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
                # Two OpenAI chat models are exposed.
                self.available_models['GPT-4o-mini'] = {
                    'provider': 'openai',
                    'model': 'gpt-4o-mini'
                }
                self.available_models['GPT-3.5 Turbo'] = {
                    'provider': 'openai',
                    'model': 'gpt-3.5-turbo'
                }
                print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing OpenAI: {e}")
        else:
            print("OpenAI API Key not set or library not installed")

        # --- Deepseek (OpenAI-compatible API, different base URL) ---
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
        if DEEPSEEK_API_KEY and OpenAI:
            try:
                # Reuse the OpenAI client class pointed at Deepseek's endpoint.
                self.clients['deepseek'] = OpenAI(
                    api_key=DEEPSEEK_API_KEY,
                    base_url="https://api.deepseek.com"  # Deepseek's API endpoint
                )
                self.available_models['Deepseek Chat'] = {
                    'provider': 'deepseek',
                    'model': 'deepseek-chat'
                }
                print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing Deepseek: {e}")
        else:
            print("Deepseek API Key not set or OpenAI library not installed")

        # --- Groq ---
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        if GROQ_API_KEY and Groq:
            try:
                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
                # Two Groq-hosted open models are exposed.
                self.available_models['Llama 3.3 70B'] = {
                    'provider': 'groq',
                    'model': 'llama-3.3-70b-versatile'
                }
                # NOTE(review): verify 'mixtral-8x7b-32768' is still served by
                # Groq — hosted model IDs are retired over time.
                self.available_models['Mixtral 8x7B'] = {
                    'provider': 'groq',
                    'model': 'mixtral-8x7b-32768'
                }
                print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Groq: {e}")
        else:
            print("Groq API Key not set or library not installed")

        # --- Google Gemini ---
        GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        if GOOGLE_API_KEY and genai:
            try:
                genai.configure(api_key=GOOGLE_API_KEY)  # module-level configuration
                self.clients['google'] = genai  # store the configured module itself
                self.available_models['Gemini 1.5 Flash'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-flash'
                }
                self.available_models['Gemini 1.5 Pro'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-pro'
                }
                print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
            except Exception as e:
                print(f"Error initializing Google Gemini: {e}")
        else:
            print("Google API Key not set or library not installed")

        # Default to the first registered model (dict preserves insertion order).
        if self.available_models:
            self.current_model = list(self.available_models.keys())[0]

    def get_available_models(self):
        """Return the display names of all registered models, in registration order."""
        return list(self.available_models.keys())

    def set_model(self, model_name):
        """Select *model_name* as the active model.

        Returns:
            True when the name is registered (selection applied),
            False otherwise (selection unchanged).
        """
        if model_name in self.available_models:
            self.current_model = model_name
            return True  # selection succeeded
        return False  # unknown model name; keep current selection

    def generate_text(self, prompt, max_tokens=1000):
        """
        Send *prompt* as a single-turn user message to the active model.

        Args:
            prompt: the user message text.
            max_tokens: response length cap. Note: the Gemini branch does not
                forward this value — generate_content() is called with the
                prompt only.

        Returns:
            The generated text, or None when no model is selected, the
            provider call raises, or the provider key is unrecognized.
        """
        if not self.current_model or self.current_model not in self.available_models:
            return None

        model_info = self.available_models[self.current_model]  # registered config
        provider = model_info['provider']  # which client/API shape to use
        model = model_info['model']        # provider-specific model identifier

        try:
            # Anthropic messages API.
            if provider == 'anthropic':
                client = self.clients['anthropic']
                response = client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text  # first content block's text

            # OpenAI and Deepseek share the OpenAI chat-completions shape.
            elif provider in ['openai', 'deepseek']:
                client = self.clients[provider]
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            # Groq also mirrors the OpenAI chat-completions shape.
            elif provider == 'groq':
                client = self.clients['groq']
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            # Google Gemini: build a model object per call and generate.
            elif provider == 'google':
                model_obj = genai.GenerativeModel(model)
                response = model_obj.generate_content(prompt)
                return response.text

        except Exception as e:
            print(f"Error generating text with {self.current_model}: {e}")
            return None
        # Unrecognized provider falls through and implicitly returns None.
665
+
666
# Instantiate the model manager once at import time so every component in
# the application shares the same set of configured API clients.
model_manager = AIModelManager()
668
+
669
+ # ===== ENHANCED TEXT ANALYZER CLASS =====
670
+ class EnhancedTextAnalyzer:
671
+ """
672
+ Main analysis engine with all enhanced features and multi-model support
673
+ This is the core class that orchestrates all text analysis functionality
674
+ """
675
+
676
    def __init__(self, model_manager=None):
        """
        Wire together the analysis components.

        Args:
            model_manager: optional AIModelManager used by
                generate_ai_insights(); None disables AI-generated insights.
        """
        self.model_manager = model_manager              # LLM access (may be None)
        self.column_detector = SmartColumnDetector()    # heuristic column categorization
        self.text_processor = EnhancedTextProcessor()   # cleaning + topic/insight extraction
        self.search_engine = TextSearchEngine()         # TF-IDF search index
        self.original_df = None      # raw uploaded dataframe (set by callers)
        self.processed_df = None     # slimmed/analyzed dataframe (set by process_data)
        self.results = {}            # cached analysis results
        self.visualizations = {}     # cached generated figures
686
+
687
+ def load_file(self, file):
688
+ """
689
+ Load data from various file formats (CSV, Excel, JSON)
690
+ Returns the loaded dataframe and a status message
691
+ """
692
+ try:
693
+ # Determine file type based on extension and load accordingly
694
+ if file.name.endswith('.csv'):
695
+ df = pd.read_csv(file.name) # Load CSV file
696
+ elif file.name.endswith(('.xlsx', '.xls')):
697
+ df = pd.read_excel(file.name) # Load Excel file
698
+ elif file.name.endswith('.json'):
699
+ df = pd.read_json(file.name) # Load JSON file
700
+ else:
701
+ return None, "Unsupported file format" # Return error for unsupported formats
702
+
703
+ return df, f"File loaded: {len(df)} records" # Return success message with record count
704
+ except Exception as e:
705
+ return None, f"Error loading file: {str(e)}" # Return error message
706
+
707
    def process_data(self, df):
        """
        Run the full analysis pipeline on an uploaded dataframe.

        Steps: smart column extraction -> per-row sentiment (TextBlob),
        topic extraction, actionable-insight matching -> search-index build
        -> Excel export.

        Args:
            df: raw uploaded DataFrame; this reference is deleted after
                extraction to free memory, so callers must not reuse it.

        Returns:
            (extracted_df, detected_columns, output_file) — the enriched
            DataFrame, the column-classification dict, and the path of the
            written Excel file ('processed_data.xlsx').
        """
        # Step 1: reduce to the relevant columns via smart detection.
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)

        # Step 2: keep the processed frame for later use by other methods.
        self.processed_df = extracted_df

        # Step 3: drop the (possibly large) original frame and reclaim memory.
        del df
        gc.collect()

        # Step 4: per-row analysis, only when there is text to analyze.
        if 'combined_text' in extracted_df.columns:
            sentiments = []   # categorical label per row: Positive/Negative/Neutral
            polarities = []   # numeric TextBlob polarity per row, in [-1, 1]
            topics_1 = []     # primary topic per row
            topics_2 = []     # secondary topic per row
            topics_3 = []     # tertiary topic per row
            insights = []     # comma-separated actionable insights per row

            for text in extracted_df['combined_text']:
                # Sentiment via TextBlob polarity with a +/-0.1 neutral band.
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity

                if polarity > 0.1:
                    sentiment = 'Positive'
                elif polarity < -0.1:
                    sentiment = 'Negative'
                else:
                    sentiment = 'Neutral'

                sentiments.append(sentiment)
                polarities.append(polarity)

                # Three specific topics per text (padded with '' when fewer).
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])
                topics_2.append(specific_topics[1])
                topics_3.append(specific_topics[2])

                # Dictionary-matched improvement suggestions.
                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)

            # Attach all per-row results as new columns.
            extracted_df['sentiment'] = sentiments
            extracted_df['sentiment_score'] = polarities
            extracted_df['topic_1'] = topics_1
            extracted_df['topic_2'] = topics_2
            extracted_df['topic_3'] = topics_3
            extracted_df['actionable_insights'] = insights

            # Build the TF-IDF search index over the enriched text.
            self.search_engine.build_index(extracted_df, 'combined_text')

        # Step 5: export for download. NOTE(review): to_excel needs an Excel
        # writer engine (openpyxl) at runtime — assumed present via requirements.
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)

        return extracted_df, detected_columns, output_file
776
+
777
+ def generate_ai_insights(self, df, num_samples=5):
778
+ """
779
+ Generate AI-powered insights using selected model
780
+ Takes sample texts and generates high-level insights using AI
781
+ """
782
+ # Check if AI model is available
783
+ if not self.model_manager or not self.model_manager.current_model:
784
+ return "No AI model available for generating insights"
785
+
786
+ # Check if we have text data to analyze
787
+ if 'combined_text' not in df.columns or df.empty:
788
+ return "No text data available for AI analysis"
789
+
790
+ # Sample some texts for analysis (to avoid sending too much data to AI)
791
+ sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
792
+ if not sample_texts:
793
+ return "No valid text samples found"
794
+
795
+ # Create prompt for AI analysis
796
+ # This prompt asks the AI to analyze the customer feedback samples
797
+ prompt = f"""Analyze the following customer feedback samples and provide key insights:
798
+
799
+ Samples:
800
+ {chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}
801
+
802
+ Please provide:
803
+ 1. Main themes and patterns
804
+ 2. Key sentiment indicators
805
+ 3. Actionable recommendations
806
+ 4. Areas of concern
807
+
808
+ Keep the response concise and focused on actionable insights."""
809
+
810
+ # Generate insights using selected model
811
+ try:
812
+ response = self.model_manager.generate_text(prompt, max_tokens=500)
813
+ if response:
814
+ return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
815
+ else:
816
+ return "Failed to generate AI insights. Please check your API configuration."
817
+ except Exception as e:
818
+ return f"Error generating AI insights: {str(e)}"
819
+
820
def generate_visualizations(self, df):
    """
    Build interactive Plotly figures from the analyzed dataframe.

    Each chart is produced only when the columns it needs are present,
    so the returned dict may be empty.

    Args:
        df: Analyzed dataframe. Recognised columns: 'sentiment',
            'topic_1'..'topic_3', 'date', 'actionable_insights'.

    Returns:
        dict mapping chart title -> plotly Figure, e.g.
        'Sentiment Distribution', 'Topic Distribution',
        'Sentiment by Topic', 'Sentiment Timeline', 'Top Insights'.
    """
    visualizations = {}  # Dictionary to store all visualizations

    # Shared colour scheme for sentiment categories
    sentiment_colors = {
        'Positive': '#27AE60',  # Green for positive
        'Negative': '#E74C3C',  # Red for negative
        'Neutral': '#95A5A6'    # Gray for neutral
    }

    # --- Sentiment distribution pie chart ---
    if 'sentiment' in df.columns:
        sentiment_counts = df['sentiment'].value_counts()  # Count each sentiment category
        fig_sentiment = px.pie(
            values=sentiment_counts.values,   # Values for pie slices
            names=sentiment_counts.index,     # Labels for pie slices
            title="Sentiment Distribution",
            color_discrete_map=sentiment_colors
        )
        visualizations['Sentiment Distribution'] = fig_sentiment

    # --- Topic distribution bar chart ---
    if 'topic_1' in df.columns:
        # Combine all topics from all three topic columns
        all_topics = []
        for col in ['topic_1', 'topic_2', 'topic_3']:
            if col in df.columns:
                all_topics.extend(t for t in df[col].dropna() if t != '')

        if all_topics:
            topic_counts = Counter(all_topics)                 # Count topic frequencies
            top_topics = dict(topic_counts.most_common(15))    # Keep top 15 topics

            fig_topics = px.bar(
                x=list(top_topics.values()),   # Frequency values
                y=list(top_topics.keys()),     # Topic names
                orientation='h',               # Horizontal bar chart
                title="Top 15 Specific Topics",
                labels={'x': 'Count', 'y': 'Topic'}
            )
            visualizations['Topic Distribution'] = fig_topics

    # --- Sentiment-by-topic heatmap ---
    if 'sentiment' in df.columns and 'topic_1' in df.columns:
        df_temp = df[df['topic_1'] != ''].copy()  # Filter out empty topics
        if not df_temp.empty:
            # Restrict to the 10 most common primary topics for readability
            top_topics = df_temp['topic_1'].value_counts().head(10).index
            df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]

            # Cross-tabulation of topics vs sentiments
            pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
            fig_heatmap = px.imshow(
                pivot_table,
                labels=dict(x="Sentiment", y="Primary Topic", color="Count"),
                title="Sentiment by Primary Topic Heatmap",
                color_continuous_scale="RdYlGn"  # red (low) to green (high)
            )
            visualizations['Sentiment by Topic'] = fig_heatmap

    # --- Sentiment timeline (requires a 'date' column) ---
    if 'date' in df.columns and 'sentiment' in df.columns:
        df_time = df.copy()
        # errors='coerce' turns unparsable dates into NaT instead of raising,
        # so a single bad value cannot abort the whole visualization pass
        df_time['date'] = pd.to_datetime(df_time['date'], errors='coerce')
        df_time = df_time.dropna(subset=['date'])
        if not df_time.empty:
            # Monthly buckets per sentiment. 'M' kept for pandas 2.0/2.1
            # compatibility; the alias was renamed to 'ME' in pandas >= 2.2.
            time_data = (
                df_time
                .groupby([pd.Grouper(key='date', freq='M'), 'sentiment'])
                .size()
                .reset_index(name='count')
            )

            fig_timeline = px.line(
                time_data,
                x='date',                       # X-axis: time
                y='count',                      # Y-axis: count
                color='sentiment',              # One line per sentiment
                title="Sentiment Trends Over Time",
                color_discrete_map=sentiment_colors
            )
            visualizations['Sentiment Timeline'] = fig_timeline

    # --- Actionable insights bar chart ---
    if 'actionable_insights' in df.columns:
        all_insights = []
        # dropna() + isinstance guard: NaN cells are truthy floats and would
        # otherwise crash the .split(',') call below
        for insight in df['actionable_insights'].dropna():
            if isinstance(insight, str) and insight != "":
                # Insights are stored as a comma-separated list
                all_insights.extend(part.strip() for part in insight.split(','))

        if all_insights:
            insight_counts = Counter(all_insights)                # Count insight frequencies
            top_insights = dict(insight_counts.most_common(10))   # Keep top 10 insights

            fig_insights = px.bar(
                x=list(top_insights.values()),   # Frequency values
                y=list(top_insights.keys()),     # Insight names
                orientation='h',                 # Horizontal bar chart
                title="Top 10 Actionable Insights",
                labels={'x': 'Frequency', 'y': 'Insight'}
            )
            visualizations['Top Insights'] = fig_insights

    return visualizations  # Dict of all generated figures
925
+
926
# ===== GRADIO INTERFACE FUNCTIONS =====
# Module-level state shared across Gradio callback invocations. All three are
# (re)populated by process_file() and read by the search / visualization /
# export handlers below.
analyzer = None  # Main analyzer instance (EnhancedTextAnalyzer); None until a file is processed
current_data = None  # Currently processed data (DataFrame returned by process_data)
current_visualizations = None  # Currently generated visualizations (dict: chart title -> Plotly figure)
931
+
932
def update_model(model_name):
    """
    Switch the globally shared model manager to the given AI model.

    Args:
        model_name: Identifier of the model to activate.

    Returns:
        A status string describing success or failure.
    """
    global model_manager

    # Guard against a missing/uninitialised manager before dereferencing it —
    # process_file() applies the same truthiness check on model_manager.
    if model_manager and model_manager.set_model(model_name):
        return f"βœ… Model switched to: {model_name}"
    else:
        return f"❌ Failed to switch to: {model_name}"
940
+
941
def process_file(file, model_name):
    """
    Handle a file upload end-to-end: load, analyze, visualize and summarize.

    Returns a 7-tuple matching the Gradio outputs wired in create_interface():
    (status markdown, preview dataframe, processed file path, AI insights,
     first chart, search status, visualization dropdown update).
    """
    global analyzer, current_data, current_visualizations, model_manager

    # Nothing to do without an uploaded file
    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])

    try:
        # Apply the user's model choice before the analysis begins
        if model_name and model_manager:
            model_manager.set_model(model_name)

        # Fresh analyzer for this upload
        analyzer = EnhancedTextAnalyzer(model_manager)

        # Read the file; a None dataframe signals a loading failure
        loaded_df, message = analyzer.load_file(file)
        if loaded_df is None:
            return message, None, None, None, None, None, gr.update(choices=[])

        # Run the analysis pipeline and remember the result globally
        analyzed_df, detected_cols, output_file = analyzer.process_data(loaded_df)
        current_data = analyzed_df

        # Build interactive charts and cache them for the Visualizations tab
        charts = analyzer.generate_visualizations(analyzed_df)
        current_visualizations = charts

        # Ask the configured AI model for high-level insights
        insights_text = analyzer.generate_ai_insights(analyzed_df)

        def first_three(key):
            # Up to three detected column names for the given category
            found = detected_cols.get(key)
            return list(found)[:3] if found else []

        text_cols = first_three('text_columns')
        id_cols = first_three('id_columns')
        product_cols = first_three('product_columns')

        # Markdown summary of what was detected and produced
        summary = f"""
### βœ… File Processing Complete!

**Detected Columns:**
- Text Columns: {', '.join(text_cols) if text_cols else 'None'}
- ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
- Product Columns: {', '.join(product_cols) if product_cols else 'None'}

**Analysis Results:**
- Total Records: {len(analyzed_df)}
- Processed File Saved: {output_file}
- AI Model Used: {model_manager.current_model if model_manager else 'None'}
"""

        # First 10 rows for the preview grid
        preview = analyzed_df.head(10)

        # Show the first chart immediately, if any were generated
        first_chart = list(charts.values())[0] if charts else None

        return (
            summary,                                   # Processing status
            preview,                                   # Data preview
            output_file,                               # Downloadable processed file
            insights_text,                             # AI-generated insights
            first_chart,                               # First visualization
            "Ready for search",                        # Search status
            gr.update(choices=list(charts.keys()))     # Populate viz dropdown
        )

    except Exception as e:
        # Surface any failure as a status message; keep all outputs consistent
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])
1016
+
1017
def search_data(query):
    """
    Run a semantic search over the processed data.

    Returns (status markdown, dataframe of top hits for display, path to an
    Excel file holding the selected columns) — Nones on failure.
    """
    global analyzer, current_data

    # Guard clauses: we need processed data and a non-empty query
    if analyzer is None or current_data is None:
        return "Please process a file first", None, None
    if not query:
        return "Please enter a search query", None, None

    try:
        # Delegate to the search engine built during processing
        hits = analyzer.search_engine.search(query, top_k=10)
        if hits.empty:
            return "No results found", None, None

        # Keep only the display columns that actually exist in the results
        wanted = ['unique_id', 'combined_text', 'sentiment', 'topic_1',
                  'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        shown = hits[[col for col in wanted if col in hits.columns]]

        # Persist the results so the user can download them
        search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        shown.to_excel(search_output, index=False)

        return f"Found {len(hits)} results", shown.head(10), search_output

    except Exception as e:
        return f"Search error: {str(e)}", None, None
1055
+
1056
def update_visualization(viz_type):
    """
    Return the cached Plotly figure matching the dropdown selection.

    Returns None when no charts have been generated yet or the name
    is not among the cached visualizations.
    """
    global current_visualizations

    if not current_visualizations:
        return None
    return current_visualizations.get(viz_type)
1067
+
1068
def export_results(format_type):
    """
    Write the processed dataframe to disk as Excel or CSV.

    Args:
        format_type: "Excel" for .xlsx; anything else falls back to CSV.

    Returns:
        (status message, exported file path) — path is None on failure.
    """
    global current_data

    # Nothing to export until a file has been processed
    if current_data is None:
        return "No data to export", None

    try:
        # Timestamp makes each exported filename unique
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if format_type == "Excel":
            output_file = f"analysis_results_{stamp}.xlsx"
            current_data.to_excel(output_file, index=False)
        else:
            output_file = f"analysis_results_{stamp}.csv"
            current_data.to_csv(output_file, index=False)

        return f"Data exported to {output_file}", output_file

    except Exception as e:
        return f"Export error: {str(e)}", None
1094
+
1095
+ # ===== GRADIO INTERFACE CREATION =====
1096
def create_interface():
    """
    Build the complete Gradio web UI for the analytics app.

    Layout: a header, then four tabs (Upload & Process, Search,
    Visualizations, Export), followed by the event wiring that connects
    each control to the module-level callbacks (update_model, process_file,
    search_data, update_visualization, export_results).

    Reads the module-level `model_manager` to populate the model dropdown,
    so the manager must be initialised before this is called.

    Returns:
        gr.Blocks: the assembled application, ready for .launch().
    """

    # Soft theme: default Gradio look with rounded, muted styling
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # Header: title and feature overview shown above the tabs
        gr.Markdown(
            """
            # πŸ“Š Enhanced Text Analytics AI Agent
            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models

            **Features:**
            - πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
            - πŸ” Automatic detection of text, ID, and product columns
            - πŸ’Ύ Memory-efficient processing with automatic file cleanup
            - 😊 Sentiment analysis with scoring
            - 🎯 Topic/theme extraction
            - πŸ’‘ Actionable insights generation
            - πŸ”Ž Advanced text search with similarity scoring
            - πŸ“ˆ Multiple visualization options
            - πŸ“₯ Export results in Excel or CSV format
            """
        )

        # ----- Tab 1: Upload & Process -----
        with gr.Tab("πŸ“€ Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):  # Left column: input controls
                    # Model dropdown; pre-selects the active model when one is set
                    model_dropdown = gr.Dropdown(
                        label="πŸ€– Select AI Model",
                        choices=model_manager.get_available_models(),
                        value=model_manager.current_model if model_manager.current_model else None,
                        interactive=True
                    )

                    # Upload widget limited to the formats the loader understands
                    file_upload = gr.File(
                        label="Upload Data File",
                        file_types=[".csv", ".xlsx", ".xls", ".json"]
                    )

                    # Kicks off the full analysis pipeline (wired below)
                    process_btn = gr.Button("πŸš€ Process File", variant="primary")

                with gr.Column(scale=2):  # Right column: results
                    status_output = gr.Markdown(label="Processing Status")  # Processing status display
                    ai_insights = gr.Markdown(label="AI-Generated Insights")  # AI insights display

            # Read-only preview of the first 10 processed rows
            with gr.Row():
                data_preview = gr.Dataframe(
                    label="Data Preview (First 10 rows)",
                    interactive=False  # Read-only display
                )

            # Download link for the saved, fully processed dataset
            processed_file = gr.File(
                label="πŸ“ Processed Data File",
                interactive=False  # Read-only, for download only
            )

        # ----- Tab 2: Search -----
        with gr.Tab("πŸ” Search"):
            gr.Markdown("### Search through your text data")

            with gr.Row():
                # Free-text query box
                search_input = gr.Textbox(
                    label="Enter search query",
                    placeholder="Type keywords to search..."
                )
                # Triggers search_data (wired below)
                search_btn = gr.Button("πŸ”Ž Search", variant="primary")

            # Search outputs: status line, results grid, downloadable file
            search_status = gr.Markdown(label="Search Status")
            search_results = gr.Dataframe(
                label="Search Results",
                interactive=False
            )
            search_file = gr.File(
                label="πŸ“₯ Download Search Results",
                interactive=False
            )

        # ----- Tab 3: Visualizations -----
        with gr.Tab("πŸ“ˆ Visualizations"):
            with gr.Row():
                # Choices are filled in by process_file() after analysis
                viz_selector = gr.Dropdown(
                    label="Select Visualization",
                    choices=[],
                    interactive=True
                )

            # Plot area for the selected chart
            viz_plot = gr.Plot(label="Visualization")

        # ----- Tab 4: Export -----
        with gr.Tab("πŸ“₯ Export"):
            gr.Markdown("### Export your analyzed data")

            with gr.Row():
                # Output-format choice for export_results()
                export_format = gr.Radio(
                    choices=["Excel", "CSV"],
                    value="Excel",
                    label="Export Format"
                )
                export_btn = gr.Button("πŸ“₯ Export Data", variant="primary")

            # Export outputs: status line and downloadable file
            export_status = gr.Markdown(label="Export Status")
            export_file = gr.File(
                label="πŸ“ Download Exported File",
                interactive=False
            )

        # ===== EVENT HANDLERS =====
        # Wire user interactions to the module-level callback functions.

        # Switch AI model when the dropdown value changes
        model_dropdown.change(
            fn=update_model,
            inputs=[model_dropdown],
            outputs=[status_output]
        )

        # Run the full pipeline on button click; outputs must stay in the
        # same order as the 7-tuple returned by process_file()
        process_btn.click(
            fn=process_file,
            inputs=[file_upload, model_dropdown],
            outputs=[
                status_output,
                data_preview,
                processed_file,
                ai_insights,
                viz_plot,
                search_status,
                viz_selector
            ]
        )

        # Execute a search over the processed data
        search_btn.click(
            fn=search_data,
            inputs=[search_input],
            outputs=[search_status, search_results, search_file]
        )

        # Swap the displayed chart when a different one is selected
        viz_selector.change(
            fn=update_visualization,
            inputs=[viz_selector],
            outputs=[viz_plot]
        )

        # Export the processed dataframe in the chosen format
        export_btn.click(
            fn=export_results,
            inputs=[export_format],
            outputs=[export_status, export_file]
        )

    return app  # Return the complete Gradio application
1266
+
1267
+ # ===== APPLICATION LAUNCH =====
1268
+ # Launch the application when script is run directly
1269
if __name__ == "__main__":
    # Build the UI and serve it: share=True requests a public Gradio link,
    # debug=True keeps tracebacks visible in the console.
    demo = create_interface()
    demo.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Core data processing and analysis
pandas>=2.0.0
numpy>=1.24.0

# Environment and configuration
python-dotenv>=1.0.0

# AI/ML APIs
anthropic>=0.25.0
openai>=1.30.0
groq>=0.8.0
google-generativeai>=0.5.0

# Natural Language Processing
nltk>=3.8.0
textblob>=0.17.1

# Machine Learning
scikit-learn>=1.3.0

# Visualization
plotly>=5.15.0
matplotlib>=3.7.0
seaborn>=0.12.0

# Web Interface
gradio>=4.25.0

# File handling (additional support)
openpyxl>=3.1.0
xlrd>=2.0.0

# Optional: For better performance
numba>=0.57.0