Fola-AI committed on
Commit
928639d
·
verified ·
1 Parent(s): 3a84dee

Upload 3 files

Browse files
Files changed (3) hide show
  1. Multimodal_Text_Analytics.ipynb +1346 -0
  2. app.py +1271 -0
  3. requirements.txt +34 -0
Multimodal_Text_Analytics.ipynb ADDED
@@ -0,0 +1,1346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "3baa95af-73a1-4d3c-a562-f90777f1f0c0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Text Data Analysis AI Assistant with Gradio\n",
9
+ " - Intelligent Customer Feedback Analysis System with Multiple AI APIs"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "id": "31a6bbea-df57-40ed-afd3-4df75cc86d0a",
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "name": "stderr",
20
+ "output_type": "stream",
21
+ "text": [
22
+ "[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...\n",
23
+ "[nltk_data] Package brown is already up-to-date!\n",
24
+ "[nltk_data] Downloading package punkt_tab to /Users/fola-\n",
25
+ "[nltk_data] ai/nltk_data...\n",
26
+ "[nltk_data] Package punkt_tab is already up-to-date!\n",
27
+ "[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...\n",
28
+ "[nltk_data] Package wordnet is already up-to-date!\n",
29
+ "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
30
+ "[nltk_data] /Users/fola-ai/nltk_data...\n",
31
+ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger_eng.zip.\n",
32
+ "[nltk_data] Downloading package conll2000 to /Users/fola-\n",
33
+ "[nltk_data] ai/nltk_data...\n",
34
+ "[nltk_data] Unzipping corpora/conll2000.zip.\n",
35
+ "[nltk_data] Downloading package movie_reviews to /Users/fola-\n",
36
+ "[nltk_data] ai/nltk_data...\n",
37
+ "[nltk_data] Unzipping corpora/movie_reviews.zip.\n"
38
+ ]
39
+ },
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Finished.\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "# ===== IMPORTS SECTION =====\n",
50
+ "# Core libraries\n",
51
+ "import os\n",
52
+ "import warnings\n",
53
+ "warnings.filterwarnings('ignore')\n",
54
+ "\n",
55
+ "# Environment and API\n",
56
+ "from dotenv import load_dotenv\n",
57
+ "from anthropic import Anthropic\n",
58
+ "\n",
59
+ "# Additional AI APIs\n",
60
+ "try:\n",
61
+ " from openai import OpenAI\n",
62
+ "except ImportError:\n",
63
+ " OpenAI = None\n",
64
+ " \n",
65
+ "try:\n",
66
+ " from groq import Groq\n",
67
+ "except ImportError:\n",
68
+ " Groq = None\n",
69
+ " \n",
70
+ "try:\n",
71
+ " import google.generativeai as genai\n",
72
+ "except ImportError:\n",
73
+ " genai = None\n",
74
+ "\n",
75
+ "# Data processing\n",
76
+ "import pandas as pd\n",
77
+ "import numpy as np\n",
78
+ "from datetime import datetime, timedelta\n",
79
+ "import json\n",
80
+ "import gc # For garbage collection\n",
81
+ "\n",
82
+ "# Natural Language Processing\n",
83
+ "import nltk\n",
84
+ "from nltk.corpus import stopwords\n",
85
+ "from nltk.tokenize import word_tokenize\n",
86
+ "from nltk.stem import WordNetLemmatizer\n",
87
+ "from textblob import TextBlob\n",
88
+ "import re\n",
89
+ "from collections import Counter\n",
90
+ "\n",
91
+ "# Machine Learning\n",
92
+ "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
93
+ "from sklearn.decomposition import LatentDirichletAllocation\n",
94
+ "from sklearn.cluster import KMeans\n",
95
+ "from sklearn.preprocessing import StandardScaler\n",
96
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
97
+ "\n",
98
+ "# Visualization\n",
99
+ "import plotly.express as px\n",
100
+ "import plotly.graph_objects as go\n",
101
+ "from plotly.subplots import make_subplots\n",
102
+ "import matplotlib.pyplot as plt\n",
103
+ "import seaborn as sns\n",
104
+ "\n",
105
+ "# Web interface\n",
106
+ "import gradio as gr\n",
107
+ "\n",
108
+ "# Download required NLTK data\n",
109
+ "nltk.download('punkt', quiet=True)\n",
110
+ "nltk.download('punkt_tab', quiet=True) # New tokenizer format\n",
111
+ "nltk.download('stopwords', quiet=True)\n",
112
+ "nltk.download('wordnet', quiet=True)\n",
113
+ "nltk.download('averaged_perceptron_tagger', quiet=True)\n",
114
+ "nltk.download('omw-1.4', quiet=True) # For WordNet lemmatizer\n",
115
+ "nltk.download('brown', quiet=True) # Required for TextBlob\n",
116
+ "\n",
117
+ "# Download TextBlob corpora\n",
118
+ "try:\n",
119
+ " from textblob import download_corpora\n",
120
+ " download_corpora.main()\n",
121
+ "except:\n",
122
+ " # Alternative method if the above doesn't work\n",
123
+ " import subprocess\n",
124
+ " import sys\n",
125
+ " try:\n",
126
+ " subprocess.run([sys.executable, \"-m\", \"textblob.download_corpora\"], \n",
127
+ " capture_output=True, text=True, timeout=30)\n",
128
+ " except:\n",
129
+ " print(\"Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.\")\n",
130
+ " print(\"Please run: python -m textblob.download_corpora\")"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 2,
136
+ "id": "db7c1e72-7960-4968-9a72-0f62ca7140d9",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/plain": [
142
+ "True"
143
+ ]
144
+ },
145
+ "execution_count": 2,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "load_dotenv(override=True)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 3,
157
+ "id": "bded62da-82ab-4e17-bbf5-3edfe1b39398",
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "# ===== SMART COLUMN DETECTOR =====\n",
162
+ "class SmartColumnDetector:\n",
163
+ " \"\"\"Intelligently detect and extract relevant columns from uploaded data\"\"\"\n",
164
+ " \n",
165
+ " def __init__(self):\n",
166
+ " # Keywords for detecting different column types\n",
167
+ " self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text', \n",
168
+ " 'response', 'opinion', 'message', 'notes', 'remarks']\n",
169
+ " self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref', \n",
170
+ " 'reference', 'index', 'uuid']\n",
171
+ " self.product_keywords = ['product', 'item', 'model', 'variant', 'type', \n",
172
+ " 'category', 'brand', 'name', 'sku']\n",
173
+ " self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']\n",
174
+ " \n",
175
+ " def detect_column_types(self, df):\n",
176
+ " \"\"\"Detect column types based on column names and content\"\"\"\n",
177
+ " detected = {\n",
178
+ " 'text_columns': [],\n",
179
+ " 'id_columns': [],\n",
180
+ " 'product_columns': [],\n",
181
+ " 'date_columns': [],\n",
182
+ " 'other_columns': []\n",
183
+ " }\n",
184
+ " \n",
185
+ " for col in df.columns:\n",
186
+ " col_lower = col.lower()\n",
187
+ " \n",
188
+ " # Check for text columns\n",
189
+ " if any(keyword in col_lower for keyword in self.text_keywords):\n",
190
+ " detected['text_columns'].append(col)\n",
191
+ " # Check for ID columns\n",
192
+ " elif any(keyword in col_lower for keyword in self.id_keywords):\n",
193
+ " detected['id_columns'].append(col)\n",
194
+ " # Check for product columns\n",
195
+ " elif any(keyword in col_lower for keyword in self.product_keywords):\n",
196
+ " detected['product_columns'].append(col)\n",
197
+ " # Check for date columns\n",
198
+ " elif any(keyword in col_lower for keyword in self.date_keywords):\n",
199
+ " detected['date_columns'].append(col)\n",
200
+ " else:\n",
201
+ " # Analyze content to determine type\n",
202
+ " sample = df[col].dropna().head(100)\n",
203
+ " if len(sample) > 0:\n",
204
+ " # Check if mostly text\n",
205
+ " if df[col].dtype == 'object':\n",
206
+ " avg_length = sample.astype(str).str.len().mean()\n",
207
+ " if avg_length > 50: # Likely text content\n",
208
+ " detected['text_columns'].append(col)\n",
209
+ " elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:\n",
210
+ " detected['id_columns'].append(col)\n",
211
+ " else:\n",
212
+ " detected['product_columns'].append(col)\n",
213
+ " else:\n",
214
+ " detected['other_columns'].append(col)\n",
215
+ " \n",
216
+ " return detected\n",
217
+ " \n",
218
+ " def extract_relevant_data(self, df):\n",
219
+ " \"\"\"Extract only relevant columns and create optimized dataset\"\"\"\n",
220
+ " detected = self.detect_column_types(df)\n",
221
+ " \n",
222
+ " # Create new dataframe with relevant columns\n",
223
+ " extracted_data = pd.DataFrame()\n",
224
+ " \n",
225
+ " # Add unique identifier\n",
226
+ " if detected['id_columns'] and len(detected['id_columns']) > 0:\n",
227
+ " extracted_data['unique_id'] = df[detected['id_columns'][0]]\n",
228
+ " else:\n",
229
+ " extracted_data['unique_id'] = range(1, len(df) + 1)\n",
230
+ " \n",
231
+ " # Add product information\n",
232
+ " if detected['product_columns'] and len(detected['product_columns']) > 0:\n",
233
+ " # Convert to list if needed and limit to 2 product columns\n",
234
+ " product_cols = list(detected['product_columns'])[:2]\n",
235
+ " for col in product_cols:\n",
236
+ " extracted_data[f'product_{col}'] = df[col]\n",
237
+ " \n",
238
+ " # Combine text columns\n",
239
+ " if detected['text_columns'] and len(detected['text_columns']) > 0:\n",
240
+ " text_cols = list(detected['text_columns']) # Ensure it's a list\n",
241
+ " text_data = []\n",
242
+ " for idx in df.index:\n",
243
+ " combined_text = ' '.join([\n",
244
+ " str(df.loc[idx, col]) \n",
245
+ " for col in text_cols \n",
246
+ " if col in df.columns and pd.notna(df.loc[idx, col])\n",
247
+ " ])\n",
248
+ " text_data.append(combined_text)\n",
249
+ " extracted_data['combined_text'] = text_data\n",
250
+ " else:\n",
251
+ " # If no text columns detected, create empty combined_text\n",
252
+ " extracted_data['combined_text'] = [''] * len(df)\n",
253
+ " \n",
254
+ " # Add date columns\n",
255
+ " if detected['date_columns'] and len(detected['date_columns']) > 0:\n",
256
+ " extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')\n",
257
+ " \n",
258
+ " return extracted_data, detected"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 4,
264
+ "id": "626af7bf-b4cf-4259-b409-18e5225555aa",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "# ===== ENHANCED TEXT PROCESSOR =====\n",
269
+ "class EnhancedTextProcessor:\n",
270
+ " \"\"\"Enhanced text preprocessing with actionable insights extraction\"\"\"\n",
271
+ "\n",
272
+ " def __init__(self):\n",
273
+ " self.lemmatizer = WordNetLemmatizer()\n",
274
+ " self.stop_words = set(stopwords.words('english'))\n",
275
+ " \n",
276
+ " # Initialize actionable insights dictionary with common customer feedback phrases\n",
277
+ " self.actionable_dictionary = {\n",
278
+ " 'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],\n",
279
+ " 'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],\n",
280
+ " 'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],\n",
281
+ " 'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],\n",
282
+ " 'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],\n",
283
+ " 'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],\n",
284
+ " 'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],\n",
285
+ " 'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],\n",
286
+ " 'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],\n",
287
+ " 'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],\n",
288
+ " 'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],\n",
289
+ " 'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],\n",
290
+ " 'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],\n",
291
+ " 'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],\n",
292
+ " 'more options': ['limited options', 'no variety', 'need more choices', 'only one option']\n",
293
+ " }\n",
294
+ "\n",
295
+ " def clean_text(self, text):\n",
296
+ " \"\"\"Clean and normalize text\"\"\"\n",
297
+ " if pd.isna(text) or text == '':\n",
298
+ " return \"\"\n",
299
+ "\n",
300
+ " text = str(text).lower()\n",
301
+ " text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n",
302
+ " text = ' '.join(text.split())\n",
303
+ " return text\n",
304
+ "\n",
305
+ " def extract_actionable_insights(self, text):\n",
306
+ " \"\"\"Extract actionable insights using dictionary matching\"\"\"\n",
307
+ " if pd.isna(text) or text == '':\n",
308
+ " return \"\"\n",
309
+ " \n",
310
+ " text_lower = text.lower()\n",
311
+ " found_insights = []\n",
312
+ " \n",
313
+ " # Check each actionable item against the text\n",
314
+ " for action, keywords in self.actionable_dictionary.items():\n",
315
+ " for keyword in keywords:\n",
316
+ " if keyword in text_lower:\n",
317
+ " found_insights.append(action)\n",
318
+ " break # Only add each action once\n",
319
+ " \n",
320
+ " # Return top 3 most relevant insights\n",
321
+ " if found_insights:\n",
322
+ " return ', '.join(found_insights[:3])\n",
323
+ " return \"\"\n",
324
+ "\n",
325
+ " def extract_specific_topics(self, text):\n",
326
+ " \"\"\"Extract specific topics from text using keyword extraction\"\"\"\n",
327
+ " if pd.isna(text) or text == '' or len(text) < 10:\n",
328
+ " return ['', '', '']\n",
329
+ " \n",
330
+ " # Clean text first\n",
331
+ " text_lower = text.lower()\n",
332
+ " \n",
333
+ " # Remove stopwords for better topic extraction\n",
334
+ " words = word_tokenize(text_lower)\n",
335
+ " filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]\n",
336
+ " \n",
337
+ " # Extract noun phrases and important terms\n",
338
+ " blob = TextBlob(text)\n",
339
+ " noun_phrases = blob.noun_phrases\n",
340
+ " \n",
341
+ " # Combine noun phrases with high-frequency meaningful words\n",
342
+ " topics = []\n",
343
+ " \n",
344
+ " # Add noun phrases (these are usually good topics)\n",
345
+ " for phrase in noun_phrases[:5]: # Limit to top 5 noun phrases\n",
346
+ " if len(phrase.split()) <= 3: # Only short phrases\n",
347
+ " topics.append(phrase)\n",
348
+ " \n",
349
+ " # Add frequent meaningful words if we don't have enough topics\n",
350
+ " if len(topics) < 3:\n",
351
+ " word_freq = Counter(filtered_words)\n",
352
+ " for word, _ in word_freq.most_common(5):\n",
353
+ " if word not in str(topics): # Avoid duplicates\n",
354
+ " topics.append(word)\n",
355
+ " if len(topics) >= 3:\n",
356
+ " break\n",
357
+ " \n",
358
+ " # Ensure we always return 3 items (empty string if not enough topics)\n",
359
+ " topics = topics[:3]\n",
360
+ " while len(topics) < 3:\n",
361
+ " topics.append('')\n",
362
+ " \n",
363
+ " return topics\n",
364
+ "\n",
365
+ " def determine_topic(self, text):\n",
366
+ " \"\"\"Legacy method kept for compatibility - returns first specific topic\"\"\"\n",
367
+ " topics = self.extract_specific_topics(text)\n",
368
+ " return topics[0] if topics[0] else 'General'"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 5,
374
+ "id": "b2eb5f17-7400-4591-8c0e-de7645b87c72",
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "# ===== SEARCH ENGINE =====\n",
379
+ "class TextSearchEngine:\n",
380
+ " \"\"\"Advanced search functionality for text data with semantic capabilities\"\"\"\n",
381
+ " \n",
382
+ " def __init__(self):\n",
383
+ " self.vectorizer = TfidfVectorizer(\n",
384
+ " max_features=1000,\n",
385
+ " ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams for better matching\n",
386
+ " stop_words='english',\n",
387
+ " use_idf=True,\n",
388
+ " smooth_idf=True,\n",
389
+ " sublinear_tf=True # Apply sublinear tf scaling\n",
390
+ " )\n",
391
+ " self.tfidf_matrix = None\n",
392
+ " self.data = None\n",
393
+ " \n",
394
+ " # Synonym dictionary for semantic search\n",
395
+ " self.synonyms = {\n",
396
+ " 'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],\n",
397
+ " 'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],\n",
398
+ " 'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],\n",
399
+ " 'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],\n",
400
+ " 'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],\n",
401
+ " 'help': ['support', 'assistance', 'aid', 'service'],\n",
402
+ " 'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],\n",
403
+ " 'quality': ['standard', 'grade', 'condition', 'caliber'],\n",
404
+ " 'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],\n",
405
+ " 'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],\n",
406
+ " 'hard': ['difficult', 'complex', 'complicated', 'challenging'],\n",
407
+ " 'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],\n",
408
+ " 'love': ['like', 'enjoy', 'appreciate', 'adore'],\n",
409
+ " 'hate': ['dislike', 'despise', 'detest'],\n",
410
+ " 'feature': ['function', 'capability', 'option', 'characteristic'],\n",
411
+ " 'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']\n",
412
+ " }\n",
413
+ " \n",
414
+ " def expand_query_with_synonyms(self, query):\n",
415
+ " \"\"\"Expand search query with synonyms for better semantic matching\"\"\"\n",
416
+ " query_words = query.lower().split()\n",
417
+ " expanded_terms = []\n",
418
+ " \n",
419
+ " for word in query_words:\n",
420
+ " # Add the original word\n",
421
+ " expanded_terms.append(word)\n",
422
+ " \n",
423
+ " # Add synonyms if available\n",
424
+ " if word in self.synonyms:\n",
425
+ " expanded_terms.extend(self.synonyms[word])\n",
426
+ " \n",
427
+ " # Check if word is a synonym of something else\n",
428
+ " for key, syns in self.synonyms.items():\n",
429
+ " if word in syns:\n",
430
+ " expanded_terms.append(key)\n",
431
+ " expanded_terms.extend([s for s in syns if s != word])\n",
432
+ " \n",
433
+ " # Remove duplicates while preserving order\n",
434
+ " seen = set()\n",
435
+ " unique_terms = []\n",
436
+ " for term in expanded_terms:\n",
437
+ " if term not in seen:\n",
438
+ " unique_terms.append(term)\n",
439
+ " seen.add(term)\n",
440
+ " \n",
441
+ " return ' '.join(unique_terms)\n",
442
+ " \n",
443
+ " def build_index(self, df, text_column):\n",
444
+ " \"\"\"Build search index from text data\"\"\"\n",
445
+ " self.data = df.copy()\n",
446
+ " texts = df[text_column].fillna('').tolist()\n",
447
+ " \n",
448
+ " # Add other searchable columns to improve search\n",
449
+ " if 'topic_1' in df.columns:\n",
450
+ " texts = [f\"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}\" \n",
451
+ " for i, text in enumerate(texts)]\n",
452
+ " if 'actionable_insights' in df.columns:\n",
453
+ " texts = [f\"{texts[i]} {df.iloc[i]['actionable_insights']}\" \n",
454
+ " for i in range(len(texts))]\n",
455
+ " \n",
456
+ " self.tfidf_matrix = self.vectorizer.fit_transform(texts)\n",
457
+ " \n",
458
+ " def search(self, query, top_k=10):\n",
459
+ " \"\"\"Enhanced search with semantic understanding\"\"\"\n",
460
+ " if self.tfidf_matrix is None:\n",
461
+ " return pd.DataFrame()\n",
462
+ " \n",
463
+ " # Expand query with synonyms\n",
464
+ " expanded_query = self.expand_query_with_synonyms(query)\n",
465
+ " \n",
466
+ " # Vectorize both original and expanded queries\n",
467
+ " query_vector = self.vectorizer.transform([query])\n",
468
+ " expanded_vector = self.vectorizer.transform([expanded_query])\n",
469
+ " \n",
470
+ " # Calculate similarities for both\n",
471
+ " similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()\n",
472
+ " similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()\n",
473
+ " \n",
474
+ " # Combine scores (weighted average - original query gets more weight)\n",
475
+ " combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)\n",
476
+ " \n",
477
+ " # Get top results\n",
478
+ " top_indices = combined_similarities.argsort()[-top_k:][::-1]\n",
479
+ " top_scores = combined_similarities[top_indices]\n",
480
+ " \n",
481
+ " # Filter results with score > 0.05 (lower threshold for better recall)\n",
482
+ " valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]\n",
483
+ " \n",
484
+ " if valid_indices:\n",
485
+ " results = self.data.iloc[valid_indices].copy()\n",
486
+ " results['search_score'] = [combined_similarities[idx] for idx in valid_indices]\n",
487
+ " \n",
488
+ " # Boost results that have exact matches\n",
489
+ " query_lower = query.lower()\n",
490
+ " for idx in results.index:\n",
491
+ " if 'combined_text' in results.columns:\n",
492
+ " if query_lower in str(results.at[idx, 'combined_text']).lower():\n",
493
+ " results.at[idx, 'search_score'] *= 1.5 # Boost exact matches\n",
494
+ " \n",
495
+ " return results.sort_values('search_score', ascending=False)\n",
496
+ " \n",
497
+ " return pd.DataFrame()\n"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "code",
502
+ "execution_count": 6,
503
+ "id": "e8b88155-971f-4dd5-b26c-104a737bc426",
504
+ "metadata": {},
505
+ "outputs": [],
506
+ "source": [
507
+ "# ===== API CONFIGURATION =====\n",
508
+ "class AIModelManager:\n",
509
+ " \"\"\"Manages multiple AI model APIs and provides unified interface\"\"\"\n",
510
+ " \n",
511
+ " def __init__(self):\n",
512
+ " self.available_models = {}\n",
513
+ " self.clients = {}\n",
514
+ " self.current_model = None\n",
515
+ " self.initialize_apis()\n",
516
+ " \n",
517
+ " def initialize_apis(self):\n",
518
+ " \"\"\"Initialize all available AI APIs\"\"\"\n",
519
+ " \n",
520
+ " # Anthropic\n",
521
+ " ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n",
522
+ " if ANTHROPIC_API_KEY:\n",
523
+ " try:\n",
524
+ " self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)\n",
525
+ " self.available_models['Claude 3 Haiku'] = {\n",
526
+ " 'provider': 'anthropic',\n",
527
+ " 'model': 'claude-3-haiku-20240307'\n",
528
+ " }\n",
529
+ " print(f\"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}\")\n",
530
+ " except Exception as e:\n",
531
+ " print(f\"Error initializing Anthropic: {e}\")\n",
532
+ " else:\n",
533
+ " print(\"Anthropic API Key not set\")\n",
534
+ " \n",
535
+ " # OpenAI\n",
536
+ " OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
537
+ " if OPENAI_API_KEY and OpenAI:\n",
538
+ " try:\n",
539
+ " self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)\n",
540
+ " self.available_models['GPT-4o-mini'] = {\n",
541
+ " 'provider': 'openai',\n",
542
+ " 'model': 'gpt-4o-mini'\n",
543
+ " }\n",
544
+ " self.available_models['GPT-3.5 Turbo'] = {\n",
545
+ " 'provider': 'openai',\n",
546
+ " 'model': 'gpt-3.5-turbo'\n",
547
+ " }\n",
548
+ " print(f\"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}\")\n",
549
+ " except Exception as e:\n",
550
+ " print(f\"Error initializing OpenAI: {e}\")\n",
551
+ " else:\n",
552
+ " print(\"OpenAI API Key not set or library not installed\")\n",
553
+ " \n",
554
+ " # Deepseek (uses OpenAI-compatible API)\n",
555
+ " DEEPSEEK_API_KEY = os.getenv(\"DEEPSEEK_API_KEY\")\n",
556
+ " if DEEPSEEK_API_KEY and OpenAI:\n",
557
+ " try:\n",
558
+ " self.clients['deepseek'] = OpenAI(\n",
559
+ " api_key=DEEPSEEK_API_KEY,\n",
560
+ " base_url=\"https://api.deepseek.com\"\n",
561
+ " )\n",
562
+ " self.available_models['Deepseek Chat'] = {\n",
563
+ " 'provider': 'deepseek',\n",
564
+ " 'model': 'deepseek-chat'\n",
565
+ " }\n",
566
+ " print(f\"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}\")\n",
567
+ " except Exception as e:\n",
568
+ " print(f\"Error initializing Deepseek: {e}\")\n",
569
+ " else:\n",
570
+ " print(\"Deepseek API Key not set or OpenAI library not installed\")\n",
571
+ " \n",
572
+ " # Groq\n",
573
+ " GROQ_API_KEY = os.getenv(\"GROQ_API_KEY\")\n",
574
+ " if GROQ_API_KEY and Groq:\n",
575
+ " try:\n",
576
+ " self.clients['groq'] = Groq(api_key=GROQ_API_KEY)\n",
577
+ " self.available_models['Llama 3.3 70B'] = {\n",
578
+ " 'provider': 'groq',\n",
579
+ " 'model': 'llama-3.3-70b-versatile'\n",
580
+ " }\n",
581
+ " self.available_models['Mixtral 8x7B'] = {\n",
582
+ " 'provider': 'groq',\n",
583
+ " 'model': 'mixtral-8x7b-32768'\n",
584
+ " }\n",
585
+ " print(f\"Groq API Key exists and begins {GROQ_API_KEY[:4]}\")\n",
586
+ " except Exception as e:\n",
587
+ " print(f\"Error initializing Groq: {e}\")\n",
588
+ " else:\n",
589
+ " print(\"Groq API Key not set or library not installed\")\n",
590
+ " \n",
591
+ " # Google Gemini\n",
592
+ " GOOGLE_API_KEY = os.getenv(\"GOOGLE_API_KEY\")\n",
593
+ " if GOOGLE_API_KEY and genai:\n",
594
+ " try:\n",
595
+ " genai.configure(api_key=GOOGLE_API_KEY)\n",
596
+ " self.clients['google'] = genai\n",
597
+ " self.available_models['Gemini 1.5 Flash'] = {\n",
598
+ " 'provider': 'google',\n",
599
+ " 'model': 'gemini-1.5-flash'\n",
600
+ " }\n",
601
+ " self.available_models['Gemini 1.5 Pro'] = {\n",
602
+ " 'provider': 'google',\n",
603
+ " 'model': 'gemini-1.5-pro'\n",
604
+ " }\n",
605
+ " print(f\"Google API Key exists and begins {GOOGLE_API_KEY[:2]}\")\n",
606
+ " except Exception as e:\n",
607
+ " print(f\"Error initializing Google Gemini: {e}\")\n",
608
+ " else:\n",
609
+ " print(\"Google API Key not set or library not installed\")\n",
610
+ " \n",
611
+ " # Set default model\n",
612
+ " if self.available_models:\n",
613
+ " self.current_model = list(self.available_models.keys())[0]\n",
614
+ " \n",
615
+ " def get_available_models(self):\n",
616
+ " \"\"\"Return list of available model names\"\"\"\n",
617
+ " return list(self.available_models.keys())\n",
618
+ " \n",
619
+ " def set_model(self, model_name):\n",
620
+ " \"\"\"Set the current model\"\"\"\n",
621
+ " if model_name in self.available_models:\n",
622
+ " self.current_model = model_name\n",
623
+ " return True\n",
624
+ " return False\n",
625
+ " \n",
626
+ " def generate_text(self, prompt, max_tokens=1000):\n",
627
+ " \"\"\"Generate text using the current model\"\"\"\n",
628
+ " if not self.current_model or self.current_model not in self.available_models:\n",
629
+ " return None\n",
630
+ " \n",
631
+ " model_info = self.available_models[self.current_model]\n",
632
+ " provider = model_info['provider']\n",
633
+ " model = model_info['model']\n",
634
+ " \n",
635
+ " try:\n",
636
+ " if provider == 'anthropic':\n",
637
+ " client = self.clients['anthropic']\n",
638
+ " response = client.messages.create(\n",
639
+ " model=model,\n",
640
+ " max_tokens=max_tokens,\n",
641
+ " messages=[{\"role\": \"user\", \"content\": prompt}]\n",
642
+ " )\n",
643
+ " return response.content[0].text\n",
644
+ " \n",
645
+ " elif provider in ['openai', 'deepseek']:\n",
646
+ " client = self.clients[provider]\n",
647
+ " response = client.chat.completions.create(\n",
648
+ " model=model,\n",
649
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
650
+ " max_tokens=max_tokens\n",
651
+ " )\n",
652
+ " return response.choices[0].message.content\n",
653
+ " \n",
654
+ " elif provider == 'groq':\n",
655
+ " client = self.clients['groq']\n",
656
+ " response = client.chat.completions.create(\n",
657
+ " model=model,\n",
658
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
659
+ " max_tokens=max_tokens\n",
660
+ " )\n",
661
+ " return response.choices[0].message.content\n",
662
+ " \n",
663
+ " elif provider == 'google':\n",
664
+ " model_obj = genai.GenerativeModel(model)\n",
665
+ " response = model_obj.generate_content(prompt)\n",
666
+ " return response.text\n",
667
+ " \n",
668
+ " except Exception as e:\n",
669
+ " print(f\"Error generating text with {self.current_model}: {e}\")\n",
670
+ " return None"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "execution_count": 7,
676
+ "id": "809f4c47-6ea8-4eaa-bac1-5ca83daac733",
677
+ "metadata": {},
678
+ "outputs": [
679
+ {
680
+ "name": "stdout",
681
+ "output_type": "stream",
682
+ "text": [
683
+ "Anthropic API Key exists and begins sk-a\n",
684
+ "OpenAI API Key exists and begins sk-proj\n",
685
+ "Deepseek API Key exists and begins sk-1099\n",
686
+ "Groq API Key exists and begins gsk_\n",
687
+ "Google API Key exists and begins AI\n"
688
+ ]
689
+ }
690
+ ],
691
+ "source": [
692
+ "# Initialize the model manager globally\n",
693
+ "model_manager = AIModelManager()"
694
+ ]
695
+ },
696
+ {
697
+ "cell_type": "code",
698
+ "execution_count": 8,
699
+ "id": "ad5f99f2-efd9-4759-88dc-df7f2f5359fb",
700
+ "metadata": {},
701
+ "outputs": [],
702
+ "source": [
703
+ "# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====\n",
704
+ "\n",
705
+ "class EnhancedTextAnalyzer:\n",
706
+ " \"\"\"Main analysis engine with all enhanced features and multi-model support\"\"\"\n",
707
+ " \n",
708
+ " def __init__(self, model_manager=None):\n",
709
+ " self.model_manager = model_manager\n",
710
+ " self.column_detector = SmartColumnDetector()\n",
711
+ " self.text_processor = EnhancedTextProcessor()\n",
712
+ " self.search_engine = TextSearchEngine()\n",
713
+ " self.original_df = None\n",
714
+ " self.processed_df = None\n",
715
+ " self.results = {}\n",
716
+ " self.visualizations = {}\n",
717
+ " \n",
718
+ " def load_file(self, file):\n",
719
+ " \"\"\"Load data from various file formats\"\"\"\n",
720
+ " try:\n",
721
+ " if file.name.endswith('.csv'):\n",
722
+ " df = pd.read_csv(file.name)\n",
723
+ " elif file.name.endswith(('.xlsx', '.xls')):\n",
724
+ " df = pd.read_excel(file.name)\n",
725
+ " elif file.name.endswith('.json'):\n",
726
+ " df = pd.read_json(file.name)\n",
727
+ " else:\n",
728
+ " return None, \"Unsupported file format\"\n",
729
+ " \n",
730
+ " return df, f\"File loaded: {len(df)} records\"\n",
731
+ " except Exception as e:\n",
732
+ " return None, f\"Error loading file: {str(e)}\"\n",
733
+ " \n",
734
+ " def process_data(self, df):\n",
735
+ " \"\"\"Process data with smart extraction and analysis\"\"\"\n",
736
+ " # Extract relevant columns\n",
737
+ " extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)\n",
738
+ " \n",
739
+ " # Store for reference\n",
740
+ " self.processed_df = extracted_df\n",
741
+ " \n",
742
+ " # Clear original from memory\n",
743
+ " del df\n",
744
+ " gc.collect()\n",
745
+ " \n",
746
+ " # Add analysis columns\n",
747
+ " if 'combined_text' in extracted_df.columns:\n",
748
+ " # Sentiment analysis\n",
749
+ " sentiments = []\n",
750
+ " polarities = []\n",
751
+ " topics_1 = []\n",
752
+ " topics_2 = []\n",
753
+ " topics_3 = []\n",
754
+ " insights = []\n",
755
+ " \n",
756
+ " for text in extracted_df['combined_text']:\n",
757
+ " # Sentiment\n",
758
+ " blob = TextBlob(text)\n",
759
+ " polarity = blob.sentiment.polarity\n",
760
+ " if polarity > 0.1:\n",
761
+ " sentiment = 'Positive'\n",
762
+ " elif polarity < -0.1:\n",
763
+ " sentiment = 'Negative'\n",
764
+ " else:\n",
765
+ " sentiment = 'Neutral'\n",
766
+ " \n",
767
+ " sentiments.append(sentiment)\n",
768
+ " polarities.append(polarity)\n",
769
+ " \n",
770
+ " # Extract specific topics (3 separate topics)\n",
771
+ " specific_topics = self.text_processor.extract_specific_topics(text)\n",
772
+ " topics_1.append(specific_topics[0])\n",
773
+ " topics_2.append(specific_topics[1])\n",
774
+ " topics_3.append(specific_topics[2])\n",
775
+ " \n",
776
+ " # Actionable insights using dictionary matching\n",
777
+ " insight = self.text_processor.extract_actionable_insights(text)\n",
778
+ " insights.append(insight)\n",
779
+ " \n",
780
+ " extracted_df['sentiment'] = sentiments\n",
781
+ " extracted_df['sentiment_score'] = polarities\n",
782
+ " extracted_df['topic_1'] = topics_1\n",
783
+ " extracted_df['topic_2'] = topics_2\n",
784
+ " extracted_df['topic_3'] = topics_3\n",
785
+ " extracted_df['actionable_insights'] = insights\n",
786
+ " \n",
787
+ " # Build search index with enhanced search capabilities\n",
788
+ " self.search_engine.build_index(extracted_df, 'combined_text')\n",
789
+ " \n",
790
+ " # Save processed data\n",
791
+ " output_file = 'processed_data.xlsx'\n",
792
+ " extracted_df.to_excel(output_file, index=False)\n",
793
+ " \n",
794
+ " return extracted_df, detected_columns, output_file\n",
795
+ " \n",
796
+ " def generate_ai_insights(self, df, num_samples=5):\n",
797
+ " \"\"\"Generate AI-powered insights using selected model\"\"\"\n",
798
+ " if not self.model_manager or not self.model_manager.current_model:\n",
799
+ " return \"No AI model available for generating insights\"\n",
800
+ " \n",
801
+ " if 'combined_text' not in df.columns or df.empty:\n",
802
+ " return \"No text data available for AI analysis\"\n",
803
+ " \n",
804
+ " # Sample some texts for analysis\n",
805
+ " sample_texts = df['combined_text'].dropna().head(num_samples).tolist()\n",
806
+ " if not sample_texts:\n",
807
+ " return \"No valid text samples found\"\n",
808
+ " \n",
809
+ " # Create prompt for AI analysis\n",
810
+ " prompt = f\"\"\"Analyze the following customer feedback samples and provide key insights:\n",
811
+ "\n",
812
+ "Samples:\n",
813
+ "{chr(10).join([f\"{i+1}. {text[:200]}...\" if len(text) > 200 else f\"{i+1}. {text}\" for i, text in enumerate(sample_texts)])}\n",
814
+ "\n",
815
+ "Please provide:\n",
816
+ "1. Main themes and patterns\n",
817
+ "2. Key sentiment indicators\n",
818
+ "3. Actionable recommendations\n",
819
+ "4. Areas of concern\n",
820
+ "\n",
821
+ "Keep the response concise and focused on actionable insights.\"\"\"\n",
822
+ "\n",
823
+ " # Generate insights using selected model\n",
824
+ " try:\n",
825
+ " response = self.model_manager.generate_text(prompt, max_tokens=500)\n",
826
+ " if response:\n",
827
+ " return f\"**AI Insights (using {self.model_manager.current_model}):**\\n\\n{response}\"\n",
828
+ " else:\n",
829
+ " return \"Failed to generate AI insights. Please check your API configuration.\"\n",
830
+ " except Exception as e:\n",
831
+ " return f\"Error generating AI insights: {str(e)}\"\n",
832
+ " \n",
833
+ " def generate_visualizations(self, df):\n",
834
+ " \"\"\"Generate various visualizations\"\"\"\n",
835
+ " visualizations = {}\n",
836
+ " \n",
837
+ " if 'sentiment' in df.columns:\n",
838
+ " # Sentiment distribution\n",
839
+ " sentiment_counts = df['sentiment'].value_counts()\n",
840
+ " fig_sentiment = px.pie(\n",
841
+ " values=sentiment_counts.values,\n",
842
+ " names=sentiment_counts.index,\n",
843
+ " title=\"Sentiment Distribution\",\n",
844
+ " color_discrete_map={\n",
845
+ " 'Positive': '#27AE60',\n",
846
+ " 'Negative': '#E74C3C',\n",
847
+ " 'Neutral': '#95A5A6'\n",
848
+ " }\n",
849
+ " )\n",
850
+ " visualizations['Sentiment Distribution'] = fig_sentiment\n",
851
+ " \n",
852
+ " if 'topic_1' in df.columns:\n",
853
+ " # Combine all topics for overall topic distribution\n",
854
+ " all_topics = []\n",
855
+ " for col in ['topic_1', 'topic_2', 'topic_3']:\n",
856
+ " if col in df.columns:\n",
857
+ " topics = df[col].dropna().tolist()\n",
858
+ " all_topics.extend([t for t in topics if t != ''])\n",
859
+ " \n",
860
+ " if all_topics:\n",
861
+ " topic_counts = Counter(all_topics)\n",
862
+ " top_topics = dict(topic_counts.most_common(15))\n",
863
+ " \n",
864
+ " fig_topics = px.bar(\n",
865
+ " x=list(top_topics.values()),\n",
866
+ " y=list(top_topics.keys()),\n",
867
+ " orientation='h',\n",
868
+ " title=\"Top 15 Specific Topics\",\n",
869
+ " labels={'x': 'Count', 'y': 'Topic'}\n",
870
+ " )\n",
871
+ " visualizations['Topic Distribution'] = fig_topics\n",
872
+ " \n",
873
+ " if 'sentiment' in df.columns and 'topic_1' in df.columns:\n",
874
+ " # Sentiment by primary topic (topic_1)\n",
875
+ " df_temp = df[df['topic_1'] != ''].copy()\n",
876
+ " if not df_temp.empty:\n",
877
+ " # Get top 10 topics for cleaner visualization\n",
878
+ " top_topics = df_temp['topic_1'].value_counts().head(10).index\n",
879
+ " df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]\n",
880
+ " \n",
881
+ " pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])\n",
882
+ " fig_heatmap = px.imshow(\n",
883
+ " pivot_table,\n",
884
+ " labels=dict(x=\"Sentiment\", y=\"Primary Topic\", color=\"Count\"),\n",
885
+ " title=\"Sentiment by Primary Topic Heatmap\",\n",
886
+ " color_continuous_scale=\"RdYlGn\"\n",
887
+ " )\n",
888
+ " visualizations['Sentiment by Topic'] = fig_heatmap\n",
889
+ " \n",
890
+ " if 'date' in df.columns and 'sentiment' in df.columns:\n",
891
+ " # Sentiment over time\n",
892
+ " df_time = df.copy()\n",
893
+ " df_time['date'] = pd.to_datetime(df_time['date'])\n",
894
+ " time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')\n",
895
+ " \n",
896
+ " fig_timeline = px.line(\n",
897
+ " time_data,\n",
898
+ " x='date',\n",
899
+ " y='count',\n",
900
+ " color='sentiment',\n",
901
+ " title=\"Sentiment Trends Over Time\",\n",
902
+ " color_discrete_map={\n",
903
+ " 'Positive': '#27AE60',\n",
904
+ " 'Negative': '#E74C3C',\n",
905
+ " 'Neutral': '#95A5A6'\n",
906
+ " }\n",
907
+ " )\n",
908
+ " visualizations['Sentiment Timeline'] = fig_timeline\n",
909
+ " \n",
910
+ " if 'actionable_insights' in df.columns:\n",
911
+ " # Top actionable insights\n",
912
+ " all_insights = []\n",
913
+ " for insight in df['actionable_insights']:\n",
914
+ " if insight and insight != \"\":\n",
915
+ " # Split by comma as we're now using comma-separated insights\n",
916
+ " all_insights.extend([i.strip() for i in insight.split(',')])\n",
917
+ " \n",
918
+ " if all_insights:\n",
919
+ " insight_counts = Counter(all_insights)\n",
920
+ " top_insights = dict(insight_counts.most_common(10))\n",
921
+ " \n",
922
+ " fig_insights = px.bar(\n",
923
+ " x=list(top_insights.values()),\n",
924
+ " y=list(top_insights.keys()),\n",
925
+ " orientation='h',\n",
926
+ " title=\"Top 10 Actionable Insights\",\n",
927
+ " labels={'x': 'Frequency', 'y': 'Insight'}\n",
928
+ " )\n",
929
+ " visualizations['Top Insights'] = fig_insights\n",
930
+ " \n",
931
+ " return visualizations"
932
+ ]
933
+ },
934
+ {
935
+ "cell_type": "code",
936
+ "execution_count": 9,
937
+ "id": "5ee86a52-b195-4010-a2b7-3abf57bf9949",
938
+ "metadata": {},
939
+ "outputs": [],
940
+ "source": [
941
+ "# ===== GRADIO INTERFACE =====\n",
942
+ "# Global variables\n",
943
+ "analyzer = None\n",
944
+ "current_data = None\n",
945
+ "current_visualizations = None\n",
946
+ "\n",
947
+ "def update_model(model_name):\n",
948
+ " \"\"\"Update the selected AI model\"\"\"\n",
949
+ " global model_manager\n",
950
+ " \n",
951
+ " if model_manager.set_model(model_name):\n",
952
+ " return f\"βœ… Model switched to: {model_name}\"\n",
953
+ " else:\n",
954
+ " return f\"❌ Failed to switch to: {model_name}\"\n",
955
+ "\n",
956
+ "def process_file(file, model_name):\n",
957
+ " \"\"\"Process uploaded file with selected model\"\"\"\n",
958
+ " global analyzer, current_data, current_visualizations, model_manager\n",
959
+ " \n",
960
+ " if file is None:\n",
961
+ " return \"Please upload a file\", None, None, None, None, None, gr.update(choices=[])\n",
962
+ " \n",
963
+ " try:\n",
964
+ " # Update model if changed\n",
965
+ " if model_name and model_manager:\n",
966
+ " model_manager.set_model(model_name)\n",
967
+ " \n",
968
+ " analyzer = EnhancedTextAnalyzer(model_manager)\n",
969
+ " \n",
970
+ " # Load file\n",
971
+ " df, message = analyzer.load_file(file)\n",
972
+ " if df is None:\n",
973
+ " return message, None, None, None, None, None, gr.update(choices=[])\n",
974
+ " \n",
975
+ " # Process data\n",
976
+ " processed_df, detected_cols, output_file = analyzer.process_data(df)\n",
977
+ " current_data = processed_df\n",
978
+ " \n",
979
+ " # Generate visualizations\n",
980
+ " visualizations = analyzer.generate_visualizations(processed_df)\n",
981
+ " current_visualizations = visualizations\n",
982
+ " \n",
983
+ " # Generate AI insights\n",
984
+ " ai_insights = analyzer.generate_ai_insights(processed_df)\n",
985
+ " \n",
986
+ " # Create summary - safely handle detected columns\n",
987
+ " text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []\n",
988
+ " id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []\n",
989
+ " product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []\n",
990
+ " \n",
991
+ " summary = f\"\"\"\n",
992
+ " ### βœ… File Processing Complete!\n",
993
+ " \n",
994
+ " **Detected Columns:**\n",
995
+ " - Text Columns: {', '.join(text_cols) if text_cols else 'None'}\n",
996
+ " - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}\n",
997
+ " - Product Columns: {', '.join(product_cols) if product_cols else 'None'}\n",
998
+ " \n",
999
+ " **Analysis Results:**\n",
1000
+ " - Total Records: {len(processed_df)}\n",
1001
+ " - Processed File Saved: {output_file}\n",
1002
+ " - AI Model Used: {model_manager.current_model if model_manager else 'None'}\n",
1003
+ " \"\"\"\n",
1004
+ " \n",
1005
+ " # Data preview\n",
1006
+ " preview = processed_df.head(10)\n",
1007
+ " \n",
1008
+ " # Get first visualization\n",
1009
+ " first_viz = list(visualizations.values())[0] if visualizations else None\n",
1010
+ " \n",
1011
+ " return (\n",
1012
+ " summary,\n",
1013
+ " preview,\n",
1014
+ " output_file,\n",
1015
+ " ai_insights,\n",
1016
+ " first_viz,\n",
1017
+ " \"Ready for search\",\n",
1018
+ " gr.update(choices=list(visualizations.keys()))\n",
1019
+ " )\n",
1020
+ " \n",
1021
+ " except Exception as e:\n",
1022
+ " return f\"Error: {str(e)}\", None, None, None, None, None, gr.update(choices=[])\n",
1023
+ "\n",
1024
+ "def search_data(query):\n",
1025
+ " \"\"\"Search through the data with enhanced semantic search\"\"\"\n",
1026
+ " global analyzer, current_data\n",
1027
+ " \n",
1028
+ " if analyzer is None or current_data is None:\n",
1029
+ " return \"Please process a file first\", None, None\n",
1030
+ " \n",
1031
+ " if not query:\n",
1032
+ " return \"Please enter a search query\", None, None\n",
1033
+ " \n",
1034
+ " try:\n",
1035
+ " results = analyzer.search_engine.search(query, top_k=10)\n",
1036
+ " \n",
1037
+ " if results.empty:\n",
1038
+ " return \"No results found\", None, None\n",
1039
+ " \n",
1040
+ " # Select relevant columns for display (updated to include new topic columns)\n",
1041
+ " display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']\n",
1042
+ " display_cols = [col for col in display_cols if col in results.columns]\n",
1043
+ " \n",
1044
+ " results_display = results[display_cols]\n",
1045
+ " \n",
1046
+ " # Save search results\n",
1047
+ " search_output = f\"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx\"\n",
1048
+ " results_display.to_excel(search_output, index=False)\n",
1049
+ " \n",
1050
+ " return f\"Found {len(results)} results\", results_display.head(10), search_output\n",
1051
+ " \n",
1052
+ " except Exception as e:\n",
1053
+ " return f\"Search error: {str(e)}\", None, None\n",
1054
+ "\n",
1055
+ "def update_visualization(viz_type):\n",
1056
+ " \"\"\"Update displayed visualization\"\"\"\n",
1057
+ " global current_visualizations\n",
1058
+ " \n",
1059
+ " if current_visualizations and viz_type in current_visualizations:\n",
1060
+ " return current_visualizations[viz_type]\n",
1061
+ " return None\n",
1062
+ "\n",
1063
+ "def export_results(format_type):\n",
1064
+ " \"\"\"Export processed data in different formats\"\"\"\n",
1065
+ " global current_data\n",
1066
+ " \n",
1067
+ " if current_data is None:\n",
1068
+ " return \"No data to export\", None\n",
1069
+ " \n",
1070
+ " try:\n",
1071
+ " timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')\n",
1072
+ " \n",
1073
+ " if format_type == \"Excel\":\n",
1074
+ " output_file = f\"analysis_results_{timestamp}.xlsx\"\n",
1075
+ " current_data.to_excel(output_file, index=False)\n",
1076
+ " else: # CSV\n",
1077
+ " output_file = f\"analysis_results_{timestamp}.csv\"\n",
1078
+ " current_data.to_csv(output_file, index=False)\n",
1079
+ " \n",
1080
+ " return f\"Data exported to {output_file}\", output_file\n",
1081
+ " \n",
1082
+ " except Exception as e:\n",
1083
+ " return f\"Export error: {str(e)}\", None"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "code",
1088
+ "execution_count": 10,
1089
+ "id": "38bf0375-9ef8-488c-821f-288c4f59ff5d",
1090
+ "metadata": {},
1091
+ "outputs": [],
1092
+ "source": [
1093
+ "# Create Gradio interface\n",
1094
+ "def create_interface():\n",
1095
+ " \"\"\"Create the Gradio interface with model selection\"\"\"\n",
1096
+ " \n",
1097
+ " with gr.Blocks(theme=gr.themes.Soft()) as app:\n",
1098
+ " gr.Markdown(\n",
1099
+ " \"\"\"\n",
1100
+ " # πŸ“Š Enhanced Text Analytics AI Agent\n",
1101
+ " ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models\n",
1102
+ " \n",
1103
+ " **Features:**\n",
1104
+ " - πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)\n",
1105
+ " - πŸ” Automatic detection of text, ID, and product columns\n",
1106
+ " - πŸ’Ύ Memory-efficient processing with automatic file cleanup\n",
1107
+ " - 😊 Sentiment analysis with scoring\n",
1108
+ " - 🎯 Topic/theme extraction\n",
1109
+ " - πŸ’‘ Actionable insights generation\n",
1110
+ " - πŸ”Ž Advanced text search with similarity scoring\n",
1111
+ " - πŸ“ˆ Multiple visualization options\n",
1112
+ " - πŸ“₯ Export results in Excel or CSV format\n",
1113
+ " \"\"\"\n",
1114
+ " )\n",
1115
+ " \n",
1116
+ " with gr.Tab(\"πŸ“€ Upload & Process\"):\n",
1117
+ " with gr.Row():\n",
1118
+ " with gr.Column(scale=1):\n",
1119
+ " # Model selection dropdown\n",
1120
+ " model_dropdown = gr.Dropdown(\n",
1121
+ " label=\"πŸ€– Select AI Model\",\n",
1122
+ " choices=model_manager.get_available_models(),\n",
1123
+ " value=model_manager.current_model if model_manager.current_model else None,\n",
1124
+ " interactive=True\n",
1125
+ " )\n",
1126
+ " \n",
1127
+ " file_upload = gr.File(\n",
1128
+ " label=\"Upload Data File\",\n",
1129
+ " file_types=[\".csv\", \".xlsx\", \".xls\", \".json\"]\n",
1130
+ " )\n",
1131
+ " process_btn = gr.Button(\"πŸš€ Process File\", variant=\"primary\")\n",
1132
+ " \n",
1133
+ " with gr.Column(scale=2):\n",
1134
+ " status_output = gr.Markdown(label=\"Processing Status\")\n",
1135
+ " ai_insights = gr.Markdown(label=\"AI-Generated Insights\")\n",
1136
+ " \n",
1137
+ " with gr.Row():\n",
1138
+ " data_preview = gr.Dataframe(\n",
1139
+ " label=\"Data Preview (First 10 rows)\",\n",
1140
+ " interactive=False\n",
1141
+ " )\n",
1142
+ " \n",
1143
+ " processed_file = gr.File(\n",
1144
+ " label=\"πŸ“ Processed Data File\",\n",
1145
+ " interactive=False\n",
1146
+ " )\n",
1147
+ " \n",
1148
+ " with gr.Tab(\"πŸ” Search\"):\n",
1149
+ " gr.Markdown(\"### Search through your text data\")\n",
1150
+ " \n",
1151
+ " with gr.Row():\n",
1152
+ " search_input = gr.Textbox(\n",
1153
+ " label=\"Enter search query\",\n",
1154
+ " placeholder=\"Type keywords to search...\"\n",
1155
+ " )\n",
1156
+ " search_btn = gr.Button(\"πŸ”Ž Search\", variant=\"primary\")\n",
1157
+ " \n",
1158
+ " search_status = gr.Markdown(label=\"Search Status\")\n",
1159
+ " search_results = gr.Dataframe(\n",
1160
+ " label=\"Search Results\",\n",
1161
+ " interactive=False\n",
1162
+ " )\n",
1163
+ " search_file = gr.File(\n",
1164
+ " label=\"πŸ“₯ Download Search Results\",\n",
1165
+ " interactive=False\n",
1166
+ " )\n",
1167
+ " \n",
1168
+ " with gr.Tab(\"πŸ“ˆ Visualizations\"):\n",
1169
+ " with gr.Row():\n",
1170
+ " viz_selector = gr.Dropdown(\n",
1171
+ " label=\"Select Visualization\",\n",
1172
+ " choices=[],\n",
1173
+ " interactive=True\n",
1174
+ " )\n",
1175
+ " \n",
1176
+ " viz_plot = gr.Plot(label=\"Visualization\")\n",
1177
+ " \n",
1178
+ " with gr.Tab(\"πŸ“₯ Export\"):\n",
1179
+ " gr.Markdown(\"### Export your analyzed data\")\n",
1180
+ " \n",
1181
+ " with gr.Row():\n",
1182
+ " export_format = gr.Radio(\n",
1183
+ " choices=[\"Excel\", \"CSV\"],\n",
1184
+ " value=\"Excel\",\n",
1185
+ " label=\"Export Format\"\n",
1186
+ " )\n",
1187
+ " export_btn = gr.Button(\"πŸ“₯ Export Data\", variant=\"primary\")\n",
1188
+ " \n",
1189
+ " export_status = gr.Markdown(label=\"Export Status\")\n",
1190
+ " export_file = gr.File(\n",
1191
+ " label=\"πŸ“ Download Exported File\",\n",
1192
+ " interactive=False\n",
1193
+ " )\n",
1194
+ " \n",
1195
+ " # Event handlers\n",
1196
+ " model_dropdown.change(\n",
1197
+ " fn=update_model,\n",
1198
+ " inputs=[model_dropdown],\n",
1199
+ " outputs=[status_output]\n",
1200
+ " )\n",
1201
+ " \n",
1202
+ " process_btn.click(\n",
1203
+ " fn=process_file,\n",
1204
+ " inputs=[file_upload, model_dropdown],\n",
1205
+ " outputs=[\n",
1206
+ " status_output,\n",
1207
+ " data_preview,\n",
1208
+ " processed_file,\n",
1209
+ " ai_insights,\n",
1210
+ " viz_plot,\n",
1211
+ " search_status,\n",
1212
+ " viz_selector\n",
1213
+ " ]\n",
1214
+ " )\n",
1215
+ " \n",
1216
+ " search_btn.click(\n",
1217
+ " fn=search_data,\n",
1218
+ " inputs=[search_input],\n",
1219
+ " outputs=[search_status, search_results, search_file]\n",
1220
+ " )\n",
1221
+ " \n",
1222
+ " viz_selector.change(\n",
1223
+ " fn=update_visualization,\n",
1224
+ " inputs=[viz_selector],\n",
1225
+ " outputs=[viz_plot]\n",
1226
+ " )\n",
1227
+ " \n",
1228
+ " export_btn.click(\n",
1229
+ " fn=export_results,\n",
1230
+ " inputs=[export_format],\n",
1231
+ " outputs=[export_status, export_file]\n",
1232
+ " )\n",
1233
+ " \n",
1234
+ " return app"
1235
+ ]
1236
+ },
1237
+ {
1238
+ "cell_type": "code",
1239
+ "execution_count": 11,
1240
+ "id": "6c5a0767-a788-43a8-911c-04e81814f4c4",
1241
+ "metadata": {},
1242
+ "outputs": [
1243
+ {
1244
+ "name": "stdout",
1245
+ "output_type": "stream",
1246
+ "text": [
1247
+ "* Running on local URL: http://127.0.0.1:7861\n",
1248
+ "* Running on public URL: https://8190830de481785995.gradio.live\n",
1249
+ "\n",
1250
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "data": {
1255
+ "text/html": [
1256
+ "<div><iframe src=\"https://8190830de481785995.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
1257
+ ],
1258
+ "text/plain": [
1259
+ "<IPython.core.display.HTML object>"
1260
+ ]
1261
+ },
1262
+ "metadata": {},
1263
+ "output_type": "display_data"
1264
+ },
1265
+ {
1266
+ "name": "stdout",
1267
+ "output_type": "stream",
1268
+ "text": [
1269
+ "Keyboard interruption in main thread... closing server.\n",
1270
+ "Killing tunnel 127.0.0.1:7861 <> https://8190830de481785995.gradio.live\n"
1271
+ ]
1272
+ }
1273
+ ],
1274
+ "source": [
1275
+ "# Launch the application\n",
1276
+ "if __name__ == \"__main__\":\n",
1277
+ " app = create_interface()\n",
1278
+ " app.launch(share=True, debug=True)"
1279
+ ]
1280
+ },
1281
+ {
1282
+ "cell_type": "code",
1283
+ "execution_count": 12,
1284
+ "id": "4f382d04-cee3-40ea-9687-5f2dff2282f7",
1285
+ "metadata": {},
1286
+ "outputs": [
1287
+ {
1288
+ "ename": "SyntaxError",
1289
+ "evalue": "invalid syntax (2621292756.py, line 1)",
1290
+ "output_type": "error",
1291
+ "traceback": [
1292
+ "\u001b[0;36m Cell \u001b[0;32mIn[12], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m python -m textblob.download_corpora\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
1293
+ ]
1294
+ }
1295
+ ],
1296
+ "source": [
1297
+ "python -m textblob.download_corpora"
1298
+ ]
1299
+ },
1300
+ {
1301
+ "cell_type": "code",
1302
+ "execution_count": null,
1303
+ "id": "63afdaca-562b-4846-8fb2-c699f7ab6615",
1304
+ "metadata": {},
1305
+ "outputs": [],
1306
+ "source": []
1307
+ },
1308
+ {
1309
+ "cell_type": "code",
1310
+ "execution_count": null,
1311
+ "id": "d82bb0bb-053e-4c29-af8b-b732dfcb47ad",
1312
+ "metadata": {},
1313
+ "outputs": [],
1314
+ "source": []
1315
+ },
1316
+ {
1317
+ "cell_type": "code",
1318
+ "execution_count": null,
1319
+ "id": "12da3957-a063-48f8-8916-e552cc317280",
1320
+ "metadata": {},
1321
+ "outputs": [],
1322
+ "source": []
1323
+ }
1324
+ ],
1325
+ "metadata": {
1326
+ "kernelspec": {
1327
+ "display_name": "Python 3 (ipykernel)",
1328
+ "language": "python",
1329
+ "name": "python3"
1330
+ },
1331
+ "language_info": {
1332
+ "codemirror_mode": {
1333
+ "name": "ipython",
1334
+ "version": 3
1335
+ },
1336
+ "file_extension": ".py",
1337
+ "mimetype": "text/x-python",
1338
+ "name": "python",
1339
+ "nbconvert_exporter": "python",
1340
+ "pygments_lexer": "ipython3",
1341
+ "version": "3.13.5"
1342
+ }
1343
+ },
1344
+ "nbformat": 4,
1345
+ "nbformat_minor": 5
1346
+ }
app.py ADDED
@@ -0,0 +1,1271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===== MULTIMODAL TEXT ANALYTICS AI ASSISTANT =====
2
+ # This is a comprehensive text analytics system with multiple AI API integrations
3
+ # and smart column detection capabilities for customer feedback analysis
4
+
5
+ # ===== IMPORTS SECTION =====
6
+ # Core Python libraries for basic functionality
7
+ import os # Operating system interface for environment variables and file operations
8
+ import warnings # Python warnings control to suppress unnecessary warnings
9
+ warnings.filterwarnings('ignore') # Suppress all warnings to keep output clean
10
+
11
+ # Environment and API management
12
+ from dotenv import load_dotenv # Load environment variables from .env file for API keys
13
+ from anthropic import Anthropic # Anthropic's Claude AI API client
14
+
15
+ # Additional AI APIs - using try/except to handle missing dependencies gracefully
16
+ try:
17
+ from openai import OpenAI # OpenAI's GPT API client
18
+ except ImportError:
19
+ OpenAI = None # Set to None if not installed, will be checked later
20
+
21
+ try:
22
+ from groq import Groq # Groq's fast inference API client
23
+ except ImportError:
24
+ Groq = None # Set to None if not installed
25
+
26
+ try:
27
+ import google.generativeai as genai # Google's Gemini API client
28
+ except ImportError:
29
+ genai = None # Set to None if not installed
30
+
31
+ # Data processing and manipulation libraries
32
+ import pandas as pd # Primary data manipulation library for DataFrames
33
+ import numpy as np # Numerical computing library for array operations
34
+ from datetime import datetime, timedelta # Date and time handling utilities
35
+ import json # JSON data format handling
36
+ import gc # Garbage collection for memory management - important for large datasets
37
+
38
+ # Natural Language Processing libraries
39
+ import nltk # Natural Language Toolkit - comprehensive NLP library
40
+ from nltk.corpus import stopwords # Common words to filter out (the, and, or, etc.)
41
+ from nltk.tokenize import word_tokenize # Split text into individual words/tokens
42
+ from nltk.stem import WordNetLemmatizer # Reduce words to their root form (running -> run)
43
+ from textblob import TextBlob # Simple API for diving into common NLP tasks
44
+ import re # Regular expressions for text pattern matching and cleaning
45
+ from collections import Counter # Efficient counting of hashable objects
46
+
47
+ # Machine Learning libraries for text analysis
48
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # Convert text to numerical features
49
+ from sklearn.decomposition import LatentDirichletAllocation # Topic modeling algorithm
50
+ from sklearn.cluster import KMeans # Clustering algorithm for grouping similar texts
51
+ from sklearn.preprocessing import StandardScaler # Normalize numerical features
52
+ from sklearn.metrics.pairwise import cosine_similarity # Measure similarity between text vectors
53
+
54
+ # Visualization libraries for creating charts and graphs
55
+ import plotly.express as px # High-level plotting interface
56
+ import plotly.graph_objects as go # Low-level plotting interface for custom charts
57
+ from plotly.subplots import make_subplots # Create multiple charts in one figure
58
+ import matplotlib.pyplot as plt # Traditional plotting library
59
+ import seaborn as sns # Statistical data visualization built on matplotlib
60
+
61
+ # Web interface framework
62
+ import gradio as gr # Create web interfaces for machine learning models
63
+
64
+ # Download required NLTK data packages - these contain language models and corpora
65
+ nltk.download('punkt', quiet=True) # Sentence tokenizer models
66
+ nltk.download('punkt_tab', quiet=True) # New tokenizer format for latest NLTK versions
67
+ nltk.download('stopwords', quiet=True) # Lists of common words to filter out
68
+ nltk.download('wordnet', quiet=True) # Lexical database for lemmatization
69
+ nltk.download('averaged_perceptron_tagger', quiet=True) # Part-of-speech tagger
70
+ nltk.download('omw-1.4', quiet=True) # Open Multilingual Wordnet for lemmatizer
71
+ nltk.download('brown', quiet=True) # Brown corpus required for TextBlob
72
+
73
+ # Download TextBlob corpora for sentiment analysis
74
+ try:
75
+ from textblob import download_corpora # Import corpora downloader
76
+ download_corpora.main() # Download all required corpora
77
+ except:
78
+ # Alternative method if the above doesn't work - use subprocess
79
+ import subprocess # Execute shell commands from Python
80
+ import sys # System-specific parameters and functions
81
+ try:
82
+ # Run TextBlob download command as subprocess with timeout
83
+ subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
84
+ capture_output=True, text=True, timeout=30)
85
+ except:
86
+ # If download fails, print warning but continue execution
87
+ print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
88
+ print("Please run: python -m textblob.download_corpora")
89
+
90
+ # Load environment variables from .env file, override existing ones
91
+ load_dotenv(override=True)
92
+
93
+ # ===== SMART COLUMN DETECTOR CLASS =====
94
class SmartColumnDetector:
    """
    Heuristically categorize and extract the useful columns from an
    uploaded dataframe (feedback text, identifiers, product info, dates).

    Detection is name-based first (case-insensitive keyword substring match)
    with a content-analysis fallback for unmatched text columns.
    """

    def __init__(self):
        """Set up the keyword lists used for name-based column detection."""
        # Keywords for detecting text/feedback columns - the main content
        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
                              'response', 'opinion', 'message', 'notes', 'remarks']

        # Keywords for detecting ID/identifier columns - uniquely identify records
        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
                            'reference', 'index', 'uuid']

        # Keywords for detecting product/category columns - what's being reviewed
        self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
                                 'category', 'brand', 'name', 'sku']

        # Keywords for detecting date/time columns - when feedback was given
        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']

    def detect_column_types(self, df):
        """
        Classify every column of *df* into one of five buckets.

        Args:
            df: source pandas DataFrame.

        Returns:
            dict with keys 'text_columns', 'id_columns', 'product_columns',
            'date_columns', 'other_columns', each mapping to a list of
            column names. Columns that match no keyword and contain only
            nulls are left out of every bucket (same as before).
        """
        detected = {
            'text_columns': [],     # Columns containing feedback/comments
            'id_columns': [],       # Columns containing unique identifiers
            'product_columns': [],  # Columns describing products/categories
            'date_columns': [],     # Columns containing dates/timestamps
            'other_columns': []     # Everything else
        }

        for col in df.columns:
            col_lower = col.lower()  # case-insensitive keyword matching

            # Name-based detection, in priority order: text > id > product > date.
            if any(keyword in col_lower for keyword in self.text_keywords):
                detected['text_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.id_keywords):
                detected['id_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.product_keywords):
                detected['product_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.date_keywords):
                detected['date_columns'].append(col)
            else:
                # No keyword matched: inspect the actual values instead.
                sample = df[col].dropna().head(100)  # first 100 non-null values
                if len(sample) > 0:
                    if df[col].dtype == 'object':
                        avg_length = sample.astype(str).str.len().mean()
                        if avg_length > 50:
                            # Long strings are most likely free-text feedback.
                            detected['text_columns'].append(col)
                        elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
                            # Short, mostly-unique values look like identifiers.
                            detected['id_columns'].append(col)
                        else:
                            # Short repetitive text looks like a category/product.
                            detected['product_columns'].append(col)
                    else:
                        # Numeric/datetime/etc. with no keyword match.
                        detected['other_columns'].append(col)

        return detected

    def extract_relevant_data(self, df):
        """
        Build a slim analysis dataframe from *df*: a unique id, up to two
        product columns, a single combined text column, and an optional date.

        Args:
            df: source pandas DataFrame.

        Returns:
            (extracted_data, detected) — the reduced DataFrame and the
            column-classification dict from detect_column_types().
        """
        detected = self.detect_column_types(df)
        extracted_data = pd.DataFrame()

        # Unique identifier: reuse the first detected ID column, else 1..N.
        if detected['id_columns']:
            extracted_data['unique_id'] = df[detected['id_columns'][0]]
        else:
            extracted_data['unique_id'] = range(1, len(df) + 1)

        # Keep at most two product columns, prefixed so their purpose is clear.
        for col in list(detected['product_columns'])[:2]:
            extracted_data[f'product_{col}'] = df[col]

        # Merge every detected text column into one 'combined_text' field.
        text_cols = [c for c in detected['text_columns'] if c in df.columns]
        if text_cols and len(df) > 0:
            # Row-wise join of the non-null values. A single vectorized
            # apply() is faster than per-cell .loc lookups and, unlike
            # df.loc[idx, col], does not break when index labels repeat.
            extracted_data['combined_text'] = df[text_cols].apply(
                lambda row: ' '.join(str(v) for v in row if pd.notna(v)),
                axis=1
            ).tolist()
        else:
            # No text columns detected (or empty frame): keep an empty column.
            extracted_data['combined_text'] = [''] * len(df)

        # First detected date column, coerced to datetime (bad values -> NaT).
        if detected['date_columns']:
            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')

        return extracted_data, detected
221
+
222
+ # ===== ENHANCED TEXT PROCESSOR CLASS =====
223
class EnhancedTextProcessor:
    """
    Text cleaning plus rule-based extraction of topics and actionable
    improvement suggestions from customer feedback.
    """

    def __init__(self):
        """Prepare NLP helpers and the complaint-phrase -> action lookup table."""
        self.lemmatizer = WordNetLemmatizer()              # word root reduction
        self.stop_words = set(stopwords.words('english'))  # filler words to drop

        # Maps an improvement action to the complaint phrases that signal it.
        # Insertion order matters: matched actions are reported in this order.
        self.actionable_dictionary = {
            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
        }

    def clean_text(self, text):
        """Lowercase *text*, strip non-alphanumeric characters, collapse whitespace."""
        if pd.isna(text) or text == '':
            return ""

        lowered = str(text).lower()
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
        return ' '.join(alnum_only.split())

    def extract_actionable_insights(self, text):
        """Return up to three comma-separated improvement actions found in *text*."""
        if pd.isna(text) or text == '':
            return ""

        haystack = text.lower()
        # An action matches when any of its trigger phrases appears in the text.
        matched = [
            action
            for action, triggers in self.actionable_dictionary.items()
            if any(trigger in haystack for trigger in triggers)
        ]
        # Cap at three so the output stays readable.
        return ', '.join(matched[:3]) if matched else ""

    def extract_specific_topics(self, text):
        """
        Return exactly three topic strings for *text* (padded with '' when
        fewer are found), combining TextBlob noun phrases with frequent words.
        """
        if pd.isna(text) or text == '' or len(text) < 10:
            return ['', '', '']

        lowered = text.lower()
        tokens = word_tokenize(lowered)
        # Keep meaningful words only: not a stopword and longer than 3 chars.
        content_words = [t for t in tokens if t not in self.stop_words and len(t) > 3]

        # Short noun phrases (<= 3 words) make the best topic labels.
        phrases = TextBlob(text).noun_phrases
        topics = [p for p in phrases[:5] if len(p.split()) <= 3]

        # Top up with the most frequent content words when phrases fall short.
        if len(topics) < 3:
            for word, _ in Counter(content_words).most_common(5):
                if word not in str(topics):  # crude substring de-duplication
                    topics.append(word)
                    if len(topics) >= 3:
                        break

        # Normalize to exactly three entries.
        topics = topics[:3]
        topics += [''] * (3 - len(topics))
        return topics

    def determine_topic(self, text):
        """Backward-compatible wrapper: first specific topic, or 'General'."""
        primary = self.extract_specific_topics(text)[0]
        return primary or 'General'
342
+
343
+ # ===== SEARCH ENGINE CLASS =====
344
class TextSearchEngine:
    """
    TF-IDF based search over feedback text with lightweight synonym
    expansion for semantic matching.
    """

    def __init__(self):
        """Configure the vectorizer and synonym table; the index starts empty."""
        # TF-IDF vectorizer turning documents into sparse numeric vectors.
        self.vectorizer = TfidfVectorizer(
            max_features=1000,      # cap vocabulary at the 1000 strongest terms
            ngram_range=(1, 3),     # unigrams through trigrams
            stop_words='english',   # drop common English words
            use_idf=True,           # inverse-document-frequency weighting
            smooth_idf=True,        # smoothed IDF
            sublinear_tf=True       # sublinear term-frequency scaling
        )
        self.tfidf_matrix = None  # populated by build_index()
        self.data = None          # dataframe backing the index

        # Synonym table consulted in both directions by expand_query_with_synonyms().
        self.synonyms = {
            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
            'help': ['support', 'assistance', 'aid', 'service'],
            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
            'quality': ['standard', 'grade', 'condition', 'caliber'],
            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
            'hard': ['difficult', 'complex', 'complicated', 'challenging'],
            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
            'love': ['like', 'enjoy', 'appreciate', 'adore'],
            'hate': ['dislike', 'despise', 'detest'],
            'feature': ['function', 'capability', 'option', 'characteristic'],
            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
        }

    def expand_query_with_synonyms(self, query):
        """Return *query* augmented with synonyms (duplicates removed, order kept)."""
        expanded = []
        for token in query.lower().split():
            expanded.append(token)

            # Forward direction: the token is a head word with listed synonyms.
            if token in self.synonyms:
                expanded.extend(self.synonyms[token])

            # Reverse direction: the token appears as someone else's synonym.
            for head, alternatives in self.synonyms.items():
                if token in alternatives:
                    expanded.append(head)
                    expanded.extend(alt for alt in alternatives if alt != token)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return ' '.join(dict.fromkeys(expanded))

    def build_index(self, df, text_column):
        """Vectorize *text_column* (enriched with topic/insight words) into a TF-IDF index."""
        self.data = df.copy()
        documents = df[text_column].fillna('').tolist()

        # Fold topic labels into each document so topic words are searchable.
        if 'topic_1' in df.columns:
            documents = [
                f"{doc} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
                for i, doc in enumerate(documents)
            ]
        # Likewise for the extracted actionable insights.
        if 'actionable_insights' in df.columns:
            documents = [
                f"{doc} {df.iloc[i]['actionable_insights']}"
                for i, doc in enumerate(documents)
            ]

        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

    def search(self, query, top_k=10):
        """
        Return the *top_k* most relevant indexed rows for *query* with a
        'search_score' column, sorted by relevance; empty DataFrame when
        nothing matches or no index has been built.
        """
        if self.tfidf_matrix is None:
            return pd.DataFrame()

        expanded_query = self.expand_query_with_synonyms(query)

        # Score against both the literal and the synonym-expanded query,
        # weighting the literal query more heavily (70/30).
        literal_scores = cosine_similarity(
            self.vectorizer.transform([query]), self.tfidf_matrix).flatten()
        expanded_scores = cosine_similarity(
            self.vectorizer.transform([expanded_query]), self.tfidf_matrix).flatten()
        scores = 0.7 * literal_scores + 0.3 * expanded_scores

        # Best top_k candidates, highest score first.
        ranked = scores.argsort()[-top_k:][::-1]
        # Low acceptance threshold deliberately favours recall.
        keep = [i for i in ranked if scores[i] > 0.05]
        if not keep:
            return pd.DataFrame()

        hits = self.data.iloc[keep].copy()
        hits['search_score'] = [scores[i] for i in keep]

        # Exact-phrase appearances in the text earn a 50% score boost.
        needle = query.lower()
        if 'combined_text' in hits.columns:
            for row in hits.index:
                if needle in str(hits.at[row, 'combined_text']).lower():
                    hits.at[row, 'search_score'] *= 1.5

        return hits.sort_values('search_score', ascending=False)
483
+
484
+ # ===== AI MODEL MANAGER CLASS =====
485
class AIModelManager:
    """
    Unified front-end over multiple LLM provider APIs.

    Reads API keys from environment variables at construction time and
    registers one entry per usable model in ``available_models``; callers
    pick a model with set_model() and generate with generate_text().
    Providers wired in: Anthropic, OpenAI, Deepseek, Groq, Google Gemini.
    """

    def __init__(self):
        """Discover configured providers and pick a default model."""
        self.available_models = {}  # display name -> {'provider': ..., 'model': ...}
        self.clients = {}           # provider key -> initialized API client/module
        self.current_model = None   # display name of the active model (or None)
        self.initialize_apis()      # populate the dicts from environment variables

    def initialize_apis(self):
        """Initialize every provider whose API key is present in the environment.

        Each provider section is independent: a missing key, missing client
        library, or client-construction error only disables that provider.
        NOTE(review): the status prints below leak the first few characters
        of each API key to stdout — confirm this is acceptable for the
        deployment target.
        """

        # --- Anthropic Claude ---
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")  # key from environment
        if ANTHROPIC_API_KEY:
            try:
                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)  # create client
                # Register the Claude model under its display name.
                self.available_models['Claude 3 Haiku'] = {
                    'provider': 'anthropic',
                    'model': 'claude-3-haiku-20240307'
                }
                print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Anthropic: {e}")
        else:
            print("Anthropic API Key not set")

        # --- OpenAI ---
        # `OpenAI` is expected to be None when the library import failed upstream.
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if OPENAI_API_KEY and OpenAI:  # need both the key and the library
            try:
                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
                # Two OpenAI chat models are exposed.
                self.available_models['GPT-4o-mini'] = {
                    'provider': 'openai',
                    'model': 'gpt-4o-mini'
                }
                self.available_models['GPT-3.5 Turbo'] = {
                    'provider': 'openai',
                    'model': 'gpt-3.5-turbo'
                }
                print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing OpenAI: {e}")
        else:
            print("OpenAI API Key not set or library not installed")

        # --- Deepseek (OpenAI-compatible API, different base URL) ---
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
        if DEEPSEEK_API_KEY and OpenAI:
            try:
                # Reuse the OpenAI client class pointed at Deepseek's endpoint.
                self.clients['deepseek'] = OpenAI(
                    api_key=DEEPSEEK_API_KEY,
                    base_url="https://api.deepseek.com"  # Deepseek's API endpoint
                )
                self.available_models['Deepseek Chat'] = {
                    'provider': 'deepseek',
                    'model': 'deepseek-chat'
                }
                print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing Deepseek: {e}")
        else:
            print("Deepseek API Key not set or OpenAI library not installed")

        # --- Groq ---
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        if GROQ_API_KEY and Groq:
            try:
                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
                # Two Groq-hosted open models are exposed.
                self.available_models['Llama 3.3 70B'] = {
                    'provider': 'groq',
                    'model': 'llama-3.3-70b-versatile'
                }
                # NOTE(review): verify 'mixtral-8x7b-32768' is still served by
                # Groq — hosted model IDs are retired over time.
                self.available_models['Mixtral 8x7B'] = {
                    'provider': 'groq',
                    'model': 'mixtral-8x7b-32768'
                }
                print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Groq: {e}")
        else:
            print("Groq API Key not set or library not installed")

        # --- Google Gemini ---
        GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        if GOOGLE_API_KEY and genai:
            try:
                genai.configure(api_key=GOOGLE_API_KEY)  # module-level configuration
                self.clients['google'] = genai  # store the configured module itself
                self.available_models['Gemini 1.5 Flash'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-flash'
                }
                self.available_models['Gemini 1.5 Pro'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-pro'
                }
                print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
            except Exception as e:
                print(f"Error initializing Google Gemini: {e}")
        else:
            print("Google API Key not set or library not installed")

        # Default to the first registered model (dict preserves insertion order).
        if self.available_models:
            self.current_model = list(self.available_models.keys())[0]

    def get_available_models(self):
        """Return the display names of all registered models, in registration order."""
        return list(self.available_models.keys())

    def set_model(self, model_name):
        """Select *model_name* as the active model.

        Returns:
            True when the name is registered (selection applied),
            False otherwise (selection unchanged).
        """
        if model_name in self.available_models:
            self.current_model = model_name
            return True  # selection succeeded
        return False  # unknown model name; keep current selection

    def generate_text(self, prompt, max_tokens=1000):
        """
        Send *prompt* as a single-turn user message to the active model.

        Args:
            prompt: the user message text.
            max_tokens: response length cap. Note: the Gemini branch does not
                forward this value — generate_content() is called with the
                prompt only.

        Returns:
            The generated text, or None when no model is selected, the
            provider call raises, or the provider key is unrecognized.
        """
        if not self.current_model or self.current_model not in self.available_models:
            return None

        model_info = self.available_models[self.current_model]  # registered config
        provider = model_info['provider']  # which client/API shape to use
        model = model_info['model']        # provider-specific model identifier

        try:
            # Anthropic messages API.
            if provider == 'anthropic':
                client = self.clients['anthropic']
                response = client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text  # first content block's text

            # OpenAI and Deepseek share the OpenAI chat-completions shape.
            elif provider in ['openai', 'deepseek']:
                client = self.clients[provider]
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            # Groq also mirrors the OpenAI chat-completions shape.
            elif provider == 'groq':
                client = self.clients['groq']
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            # Google Gemini: build a model object per call and generate.
            elif provider == 'google':
                model_obj = genai.GenerativeModel(model)
                response = model_obj.generate_content(prompt)
                return response.text

        except Exception as e:
            print(f"Error generating text with {self.current_model}: {e}")
            return None
        # Unrecognized provider falls through and implicitly returns None.
665
+
666
# Instantiate the model manager once at import time so every component in
# the application shares the same set of configured API clients.
model_manager = AIModelManager()
668
+
669
+ # ===== ENHANCED TEXT ANALYZER CLASS =====
670
+ class EnhancedTextAnalyzer:
671
+ """
672
+ Main analysis engine with all enhanced features and multi-model support
673
+ This is the core class that orchestrates all text analysis functionality
674
+ """
675
+
676
    def __init__(self, model_manager=None):
        """
        Wire together the analysis components.

        Args:
            model_manager: optional AIModelManager used by
                generate_ai_insights(); None disables AI-generated insights.
        """
        self.model_manager = model_manager              # LLM access (may be None)
        self.column_detector = SmartColumnDetector()    # heuristic column categorization
        self.text_processor = EnhancedTextProcessor()   # cleaning + topic/insight extraction
        self.search_engine = TextSearchEngine()         # TF-IDF search index
        self.original_df = None      # raw uploaded dataframe (set by callers)
        self.processed_df = None     # slimmed/analyzed dataframe (set by process_data)
        self.results = {}            # cached analysis results
        self.visualizations = {}     # cached generated figures
686
+
687
+ def load_file(self, file):
688
+ """
689
+ Load data from various file formats (CSV, Excel, JSON)
690
+ Returns the loaded dataframe and a status message
691
+ """
692
+ try:
693
+ # Determine file type based on extension and load accordingly
694
+ if file.name.endswith('.csv'):
695
+ df = pd.read_csv(file.name) # Load CSV file
696
+ elif file.name.endswith(('.xlsx', '.xls')):
697
+ df = pd.read_excel(file.name) # Load Excel file
698
+ elif file.name.endswith('.json'):
699
+ df = pd.read_json(file.name) # Load JSON file
700
+ else:
701
+ return None, "Unsupported file format" # Return error for unsupported formats
702
+
703
+ return df, f"File loaded: {len(df)} records" # Return success message with record count
704
+ except Exception as e:
705
+ return None, f"Error loading file: {str(e)}" # Return error message
706
+
707
    def process_data(self, df):
        """
        Run the full analysis pipeline on an uploaded dataframe.

        Steps: smart column extraction -> per-row sentiment (TextBlob),
        topic extraction, actionable-insight matching -> search-index build
        -> Excel export.

        Args:
            df: raw uploaded DataFrame; this reference is deleted after
                extraction to free memory, so callers must not reuse it.

        Returns:
            (extracted_df, detected_columns, output_file) — the enriched
            DataFrame, the column-classification dict, and the path of the
            written Excel file ('processed_data.xlsx').
        """
        # Step 1: reduce to the relevant columns via smart detection.
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)

        # Step 2: keep the processed frame for later use by other methods.
        self.processed_df = extracted_df

        # Step 3: drop the (possibly large) original frame and reclaim memory.
        del df
        gc.collect()

        # Step 4: per-row analysis, only when there is text to analyze.
        if 'combined_text' in extracted_df.columns:
            sentiments = []   # categorical label per row: Positive/Negative/Neutral
            polarities = []   # numeric TextBlob polarity per row, in [-1, 1]
            topics_1 = []     # primary topic per row
            topics_2 = []     # secondary topic per row
            topics_3 = []     # tertiary topic per row
            insights = []     # comma-separated actionable insights per row

            for text in extracted_df['combined_text']:
                # Sentiment via TextBlob polarity with a +/-0.1 neutral band.
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity

                if polarity > 0.1:
                    sentiment = 'Positive'
                elif polarity < -0.1:
                    sentiment = 'Negative'
                else:
                    sentiment = 'Neutral'

                sentiments.append(sentiment)
                polarities.append(polarity)

                # Three specific topics per text (padded with '' when fewer).
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])
                topics_2.append(specific_topics[1])
                topics_3.append(specific_topics[2])

                # Dictionary-matched improvement suggestions.
                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)

            # Attach all per-row results as new columns.
            extracted_df['sentiment'] = sentiments
            extracted_df['sentiment_score'] = polarities
            extracted_df['topic_1'] = topics_1
            extracted_df['topic_2'] = topics_2
            extracted_df['topic_3'] = topics_3
            extracted_df['actionable_insights'] = insights

            # Build the TF-IDF search index over the enriched text.
            self.search_engine.build_index(extracted_df, 'combined_text')

        # Step 5: export for download. NOTE(review): to_excel needs an Excel
        # writer engine (openpyxl) at runtime — assumed present via requirements.
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)

        return extracted_df, detected_columns, output_file
776
+
777
+ def generate_ai_insights(self, df, num_samples=5):
778
+ """
779
+ Generate AI-powered insights using selected model
780
+ Takes sample texts and generates high-level insights using AI
781
+ """
782
+ # Check if AI model is available
783
+ if not self.model_manager or not self.model_manager.current_model:
784
+ return "No AI model available for generating insights"
785
+
786
+ # Check if we have text data to analyze
787
+ if 'combined_text' not in df.columns or df.empty:
788
+ return "No text data available for AI analysis"
789
+
790
+ # Sample some texts for analysis (to avoid sending too much data to AI)
791
+ sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
792
+ if not sample_texts:
793
+ return "No valid text samples found"
794
+
795
+ # Create prompt for AI analysis
796
+ # This prompt asks the AI to analyze the customer feedback samples
797
+ prompt = f"""Analyze the following customer feedback samples and provide key insights:
798
+
799
+ Samples:
800
+ {chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}
801
+
802
+ Please provide:
803
+ 1. Main themes and patterns
804
+ 2. Key sentiment indicators
805
+ 3. Actionable recommendations
806
+ 4. Areas of concern
807
+
808
+ Keep the response concise and focused on actionable insights."""
809
+
810
+ # Generate insights using selected model
811
+ try:
812
+ response = self.model_manager.generate_text(prompt, max_tokens=500)
813
+ if response:
814
+ return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
815
+ else:
816
+ return "Failed to generate AI insights. Please check your API configuration."
817
+ except Exception as e:
818
+ return f"Error generating AI insights: {str(e)}"
819
+
820
def generate_visualizations(self, df):
    """
    Build interactive Plotly figures from the analyzed dataframe.

    Each chart is produced only when the columns it needs are present,
    so the returned dict may be empty.

    Args:
        df: Analyzed dataframe. Recognised columns: 'sentiment',
            'topic_1'..'topic_3', 'date', 'actionable_insights'.

    Returns:
        dict mapping chart title -> plotly Figure, e.g.
        'Sentiment Distribution', 'Topic Distribution',
        'Sentiment by Topic', 'Sentiment Timeline', 'Top Insights'.
    """
    visualizations = {}  # Dictionary to store all visualizations

    # Shared colour scheme for sentiment categories
    sentiment_colors = {
        'Positive': '#27AE60',  # Green for positive
        'Negative': '#E74C3C',  # Red for negative
        'Neutral': '#95A5A6'    # Gray for neutral
    }

    # --- Sentiment distribution pie chart ---
    if 'sentiment' in df.columns:
        sentiment_counts = df['sentiment'].value_counts()  # Count each sentiment category
        fig_sentiment = px.pie(
            values=sentiment_counts.values,   # Values for pie slices
            names=sentiment_counts.index,     # Labels for pie slices
            title="Sentiment Distribution",
            color_discrete_map=sentiment_colors
        )
        visualizations['Sentiment Distribution'] = fig_sentiment

    # --- Topic distribution bar chart ---
    if 'topic_1' in df.columns:
        # Combine all topics from all three topic columns
        all_topics = []
        for col in ['topic_1', 'topic_2', 'topic_3']:
            if col in df.columns:
                all_topics.extend(t for t in df[col].dropna() if t != '')

        if all_topics:
            topic_counts = Counter(all_topics)                 # Count topic frequencies
            top_topics = dict(topic_counts.most_common(15))    # Keep top 15 topics

            fig_topics = px.bar(
                x=list(top_topics.values()),   # Frequency values
                y=list(top_topics.keys()),     # Topic names
                orientation='h',               # Horizontal bar chart
                title="Top 15 Specific Topics",
                labels={'x': 'Count', 'y': 'Topic'}
            )
            visualizations['Topic Distribution'] = fig_topics

    # --- Sentiment-by-topic heatmap ---
    if 'sentiment' in df.columns and 'topic_1' in df.columns:
        df_temp = df[df['topic_1'] != ''].copy()  # Filter out empty topics
        if not df_temp.empty:
            # Restrict to the 10 most common primary topics for readability
            top_topics = df_temp['topic_1'].value_counts().head(10).index
            df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]

            # Cross-tabulation of topics vs sentiments
            pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
            fig_heatmap = px.imshow(
                pivot_table,
                labels=dict(x="Sentiment", y="Primary Topic", color="Count"),
                title="Sentiment by Primary Topic Heatmap",
                color_continuous_scale="RdYlGn"  # red (low) to green (high)
            )
            visualizations['Sentiment by Topic'] = fig_heatmap

    # --- Sentiment timeline (requires a 'date' column) ---
    if 'date' in df.columns and 'sentiment' in df.columns:
        df_time = df.copy()
        # errors='coerce' turns unparsable dates into NaT instead of raising,
        # so a single bad value cannot abort the whole visualization pass
        df_time['date'] = pd.to_datetime(df_time['date'], errors='coerce')
        df_time = df_time.dropna(subset=['date'])
        if not df_time.empty:
            # Monthly buckets per sentiment. 'M' kept for pandas 2.0/2.1
            # compatibility; the alias was renamed to 'ME' in pandas >= 2.2.
            time_data = (
                df_time
                .groupby([pd.Grouper(key='date', freq='M'), 'sentiment'])
                .size()
                .reset_index(name='count')
            )

            fig_timeline = px.line(
                time_data,
                x='date',                       # X-axis: time
                y='count',                      # Y-axis: count
                color='sentiment',              # One line per sentiment
                title="Sentiment Trends Over Time",
                color_discrete_map=sentiment_colors
            )
            visualizations['Sentiment Timeline'] = fig_timeline

    # --- Actionable insights bar chart ---
    if 'actionable_insights' in df.columns:
        all_insights = []
        # dropna() + isinstance guard: NaN cells are truthy floats and would
        # otherwise crash the .split(',') call below
        for insight in df['actionable_insights'].dropna():
            if isinstance(insight, str) and insight != "":
                # Insights are stored as a comma-separated list
                all_insights.extend(part.strip() for part in insight.split(','))

        if all_insights:
            insight_counts = Counter(all_insights)                # Count insight frequencies
            top_insights = dict(insight_counts.most_common(10))   # Keep top 10 insights

            fig_insights = px.bar(
                x=list(top_insights.values()),   # Frequency values
                y=list(top_insights.keys()),     # Insight names
                orientation='h',                 # Horizontal bar chart
                title="Top 10 Actionable Insights",
                labels={'x': 'Frequency', 'y': 'Insight'}
            )
            visualizations['Top Insights'] = fig_insights

    return visualizations  # Dict of all generated figures
925
+
926
# ===== GRADIO INTERFACE FUNCTIONS =====
# Module-level state shared across Gradio callback invocations. All three are
# (re)populated by process_file() and read by the search / visualization /
# export handlers below.
analyzer = None  # Main analyzer instance (EnhancedTextAnalyzer); None until a file is processed
current_data = None  # Currently processed data (DataFrame returned by process_data)
current_visualizations = None  # Currently generated visualizations (dict: chart title -> Plotly figure)
931
+
932
def update_model(model_name):
    """
    Switch the globally shared model manager to the given AI model.

    Args:
        model_name: Identifier of the model to activate.

    Returns:
        A status string describing success or failure.
    """
    global model_manager

    # Guard against a missing/uninitialised manager before dereferencing it —
    # process_file() applies the same truthiness check on model_manager.
    if model_manager and model_manager.set_model(model_name):
        return f"βœ… Model switched to: {model_name}"
    else:
        return f"❌ Failed to switch to: {model_name}"
940
+
941
def process_file(file, model_name):
    """
    Handle a file upload end-to-end: load, analyze, visualize and summarize.

    Returns a 7-tuple matching the Gradio outputs wired in create_interface():
    (status markdown, preview dataframe, processed file path, AI insights,
     first chart, search status, visualization dropdown update).
    """
    global analyzer, current_data, current_visualizations, model_manager

    # Nothing to do without an uploaded file
    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])

    try:
        # Apply the user's model choice before the analysis begins
        if model_name and model_manager:
            model_manager.set_model(model_name)

        # Fresh analyzer for this upload
        analyzer = EnhancedTextAnalyzer(model_manager)

        # Read the file; a None dataframe signals a loading failure
        loaded_df, message = analyzer.load_file(file)
        if loaded_df is None:
            return message, None, None, None, None, None, gr.update(choices=[])

        # Run the analysis pipeline and remember the result globally
        analyzed_df, detected_cols, output_file = analyzer.process_data(loaded_df)
        current_data = analyzed_df

        # Build interactive charts and cache them for the Visualizations tab
        charts = analyzer.generate_visualizations(analyzed_df)
        current_visualizations = charts

        # Ask the configured AI model for high-level insights
        insights_text = analyzer.generate_ai_insights(analyzed_df)

        def first_three(key):
            # Up to three detected column names for the given category
            found = detected_cols.get(key)
            return list(found)[:3] if found else []

        text_cols = first_three('text_columns')
        id_cols = first_three('id_columns')
        product_cols = first_three('product_columns')

        # Markdown summary of what was detected and produced
        summary = f"""
### βœ… File Processing Complete!

**Detected Columns:**
- Text Columns: {', '.join(text_cols) if text_cols else 'None'}
- ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
- Product Columns: {', '.join(product_cols) if product_cols else 'None'}

**Analysis Results:**
- Total Records: {len(analyzed_df)}
- Processed File Saved: {output_file}
- AI Model Used: {model_manager.current_model if model_manager else 'None'}
"""

        # First 10 rows for the preview grid
        preview = analyzed_df.head(10)

        # Show the first chart immediately, if any were generated
        first_chart = list(charts.values())[0] if charts else None

        return (
            summary,                                   # Processing status
            preview,                                   # Data preview
            output_file,                               # Downloadable processed file
            insights_text,                             # AI-generated insights
            first_chart,                               # First visualization
            "Ready for search",                        # Search status
            gr.update(choices=list(charts.keys()))     # Populate viz dropdown
        )

    except Exception as e:
        # Surface any failure as a status message; keep all outputs consistent
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])
1016
+
1017
def search_data(query):
    """
    Run a semantic search over the processed data.

    Returns (status markdown, dataframe of top hits for display, path to an
    Excel file holding the selected columns) — Nones on failure.
    """
    global analyzer, current_data

    # Guard clauses: we need processed data and a non-empty query
    if analyzer is None or current_data is None:
        return "Please process a file first", None, None
    if not query:
        return "Please enter a search query", None, None

    try:
        # Delegate to the search engine built during processing
        hits = analyzer.search_engine.search(query, top_k=10)
        if hits.empty:
            return "No results found", None, None

        # Keep only the display columns that actually exist in the results
        wanted = ['unique_id', 'combined_text', 'sentiment', 'topic_1',
                  'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        shown = hits[[col for col in wanted if col in hits.columns]]

        # Persist the results so the user can download them
        search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        shown.to_excel(search_output, index=False)

        return f"Found {len(hits)} results", shown.head(10), search_output

    except Exception as e:
        return f"Search error: {str(e)}", None, None
1055
+
1056
def update_visualization(viz_type):
    """
    Return the cached Plotly figure matching the dropdown selection.

    Returns None when no charts have been generated yet or the name
    is not among the cached visualizations.
    """
    global current_visualizations

    if not current_visualizations:
        return None
    return current_visualizations.get(viz_type)
1067
+
1068
def export_results(format_type):
    """
    Write the processed dataframe to disk as Excel or CSV.

    Args:
        format_type: "Excel" for .xlsx; anything else falls back to CSV.

    Returns:
        (status message, exported file path) — path is None on failure.
    """
    global current_data

    # Nothing to export until a file has been processed
    if current_data is None:
        return "No data to export", None

    try:
        # Timestamp makes each exported filename unique
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if format_type == "Excel":
            output_file = f"analysis_results_{stamp}.xlsx"
            current_data.to_excel(output_file, index=False)
        else:
            output_file = f"analysis_results_{stamp}.csv"
            current_data.to_csv(output_file, index=False)

        return f"Data exported to {output_file}", output_file

    except Exception as e:
        return f"Export error: {str(e)}", None
1094
+
1095
+ # ===== GRADIO INTERFACE CREATION =====
1096
def create_interface():
    """
    Build the complete Gradio web UI for the analytics app.

    Layout: a header, then four tabs (Upload & Process, Search,
    Visualizations, Export), followed by the event wiring that connects
    each control to the module-level callbacks (update_model, process_file,
    search_data, update_visualization, export_results).

    Reads the module-level `model_manager` to populate the model dropdown,
    so the manager must be initialised before this is called.

    Returns:
        gr.Blocks: the assembled application, ready for .launch().
    """

    # Soft theme: default Gradio look with rounded, muted styling
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # Header: title and feature overview shown above the tabs
        gr.Markdown(
            """
            # πŸ“Š Enhanced Text Analytics AI Agent
            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models

            **Features:**
            - πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
            - πŸ” Automatic detection of text, ID, and product columns
            - πŸ’Ύ Memory-efficient processing with automatic file cleanup
            - 😊 Sentiment analysis with scoring
            - 🎯 Topic/theme extraction
            - πŸ’‘ Actionable insights generation
            - πŸ”Ž Advanced text search with similarity scoring
            - πŸ“ˆ Multiple visualization options
            - πŸ“₯ Export results in Excel or CSV format
            """
        )

        # ----- Tab 1: Upload & Process -----
        with gr.Tab("πŸ“€ Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):  # Left column: input controls
                    # Model dropdown; pre-selects the active model when one is set
                    model_dropdown = gr.Dropdown(
                        label="πŸ€– Select AI Model",
                        choices=model_manager.get_available_models(),
                        value=model_manager.current_model if model_manager.current_model else None,
                        interactive=True
                    )

                    # Upload widget limited to the formats the loader understands
                    file_upload = gr.File(
                        label="Upload Data File",
                        file_types=[".csv", ".xlsx", ".xls", ".json"]
                    )

                    # Kicks off the full analysis pipeline (wired below)
                    process_btn = gr.Button("πŸš€ Process File", variant="primary")

                with gr.Column(scale=2):  # Right column: results
                    status_output = gr.Markdown(label="Processing Status")  # Processing status display
                    ai_insights = gr.Markdown(label="AI-Generated Insights")  # AI insights display

            # Read-only preview of the first 10 processed rows
            with gr.Row():
                data_preview = gr.Dataframe(
                    label="Data Preview (First 10 rows)",
                    interactive=False  # Read-only display
                )

            # Download link for the saved, fully processed dataset
            processed_file = gr.File(
                label="πŸ“ Processed Data File",
                interactive=False  # Read-only, for download only
            )

        # ----- Tab 2: Search -----
        with gr.Tab("πŸ” Search"):
            gr.Markdown("### Search through your text data")

            with gr.Row():
                # Free-text query box
                search_input = gr.Textbox(
                    label="Enter search query",
                    placeholder="Type keywords to search..."
                )
                # Triggers search_data (wired below)
                search_btn = gr.Button("πŸ”Ž Search", variant="primary")

            # Search outputs: status line, results grid, downloadable file
            search_status = gr.Markdown(label="Search Status")
            search_results = gr.Dataframe(
                label="Search Results",
                interactive=False
            )
            search_file = gr.File(
                label="πŸ“₯ Download Search Results",
                interactive=False
            )

        # ----- Tab 3: Visualizations -----
        with gr.Tab("πŸ“ˆ Visualizations"):
            with gr.Row():
                # Choices are filled in by process_file() after analysis
                viz_selector = gr.Dropdown(
                    label="Select Visualization",
                    choices=[],
                    interactive=True
                )

            # Plot area for the selected chart
            viz_plot = gr.Plot(label="Visualization")

        # ----- Tab 4: Export -----
        with gr.Tab("πŸ“₯ Export"):
            gr.Markdown("### Export your analyzed data")

            with gr.Row():
                # Output-format choice for export_results()
                export_format = gr.Radio(
                    choices=["Excel", "CSV"],
                    value="Excel",
                    label="Export Format"
                )
                export_btn = gr.Button("πŸ“₯ Export Data", variant="primary")

            # Export outputs: status line and downloadable file
            export_status = gr.Markdown(label="Export Status")
            export_file = gr.File(
                label="πŸ“ Download Exported File",
                interactive=False
            )

        # ===== EVENT HANDLERS =====
        # Wire user interactions to the module-level callback functions.

        # Switch AI model when the dropdown value changes
        model_dropdown.change(
            fn=update_model,
            inputs=[model_dropdown],
            outputs=[status_output]
        )

        # Run the full pipeline on button click; outputs must stay in the
        # same order as the 7-tuple returned by process_file()
        process_btn.click(
            fn=process_file,
            inputs=[file_upload, model_dropdown],
            outputs=[
                status_output,
                data_preview,
                processed_file,
                ai_insights,
                viz_plot,
                search_status,
                viz_selector
            ]
        )

        # Execute a search over the processed data
        search_btn.click(
            fn=search_data,
            inputs=[search_input],
            outputs=[search_status, search_results, search_file]
        )

        # Swap the displayed chart when a different one is selected
        viz_selector.change(
            fn=update_visualization,
            inputs=[viz_selector],
            outputs=[viz_plot]
        )

        # Export the processed dataframe in the chosen format
        export_btn.click(
            fn=export_results,
            inputs=[export_format],
            outputs=[export_status, export_file]
        )

    return app  # Return the complete Gradio application
1266
+
1267
+ # ===== APPLICATION LAUNCH =====
1268
+ # Launch the application when script is run directly
1269
if __name__ == "__main__":
    # Build the UI and serve it: share=True requests a public Gradio link,
    # debug=True keeps tracebacks visible in the console.
    demo = create_interface()
    demo.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Core data processing and analysis
pandas>=2.0.0
numpy>=1.24.0

# Environment and configuration
python-dotenv>=1.0.0

# AI/ML APIs
anthropic>=0.25.0
openai>=1.30.0
groq>=0.8.0
google-generativeai>=0.5.0

# Natural Language Processing
nltk>=3.8.0
textblob>=0.17.1

# Machine Learning
scikit-learn>=1.3.0

# Visualization
plotly>=5.15.0
matplotlib>=3.7.0
seaborn>=0.12.0

# Web Interface
gradio>=4.25.0

# File handling (additional support)
openpyxl>=3.1.0
xlrd>=2.0.0

# Optional: For better performance
numba>=0.57.0